Preparation a) phone_calls 0 imputation b) remove rows with NaN target
parent
f78aa3e7b3
commit
3f7ec80c18
|
@ -1,15 +1,34 @@
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import math, sys
|
import math, sys
|
||||||
|
from sklearn.impute import KNNImputer
|
||||||
|
|
||||||
def straw_cleaning(sensor_data_files, provider):
|
def straw_cleaning(sensor_data_files, provider, target=None):
|
||||||
|
|
||||||
features = pd.read_csv(sensor_data_files["sensor_data"][0])
|
features = pd.read_csv(sensor_data_files["sensor_data"][0])
|
||||||
|
|
||||||
|
esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target columns
|
||||||
|
|
||||||
|
#Filter all rows that do not have the target column available
|
||||||
|
# get target from config or function parameter
|
||||||
|
if target is None:
|
||||||
|
features = features[features[esm_cols[0]].notna()]
|
||||||
|
else:
|
||||||
|
features = features[features['phone_esm_straw_' + target].notna()]
|
||||||
|
|
||||||
# TODO: reorder the cleaning steps so it makes sense for the analysis
|
# TODO: reorder the cleaning steps so it makes sense for the analysis
|
||||||
# TODO: add conditions that differentiates cleaning steps for standardized and nonstandardized features, for this
|
# TODO: add conditions that differentiates cleaning steps for standardized and nonstandardized features, for this
|
||||||
# the snakemake rules will also have to come with additional parameter (in rules/features.smk)
|
# the snakemake rules will also have to come with additional parameter (in rules/features.smk)
|
||||||
|
|
||||||
|
# TODO: impute the rows where the participants have at least 2 rows (2 time segments) - error prevention (has to be tested)
|
||||||
|
# TODO: because of different imputation logic (e.g., the phone_data_yield parameter for phone features) the imputation has to
|
||||||
|
# be planned accordingly. Should the phone features first be imputed with 0 and only then the general kNN imputation be executed,
|
||||||
|
# i.e., on the rows that are missing when E4 and phone features availability is not synchronized. CHECK phone_data_yield feat.
|
||||||
|
# A lot of imputation types/levels (1) imputation related to feature's content (2) imputation related to phone / empatica
|
||||||
|
# structural specifics (3) general imputation, which is needed when the feature types are desynchronized (row is not full)
|
||||||
|
# because of the lack of availability. Secondly, it is very important that the features data frame is checked for whether any NaN
|
||||||
|
# values still exist.
|
||||||
|
|
||||||
# Impute selected features event
|
# Impute selected features event
|
||||||
impute_phone_features = provider["IMPUTE_PHONE_SELECTED_EVENT_FEATURES"]
|
impute_phone_features = provider["IMPUTE_PHONE_SELECTED_EVENT_FEATURES"]
|
||||||
if impute_phone_features["COMPUTE"]:
|
if impute_phone_features["COMPUTE"]:
|
||||||
|
@ -39,8 +58,6 @@ def straw_cleaning(sensor_data_files, provider):
|
||||||
if provider["DATA_YIELD_RATIO_THRESHOLD"]:
|
if provider["DATA_YIELD_RATIO_THRESHOLD"]:
|
||||||
features = features[features[data_yield_column] >= provider["DATA_YIELD_RATIO_THRESHOLD"]]
|
features = features[features[data_yield_column] >= provider["DATA_YIELD_RATIO_THRESHOLD"]]
|
||||||
|
|
||||||
esm_cols = features.loc[:, features.columns.str.startswith('phone_esm')] # For later preservation of esm_cols
|
|
||||||
|
|
||||||
# Remove cols if threshold of NaN values is passed
|
# Remove cols if threshold of NaN values is passed
|
||||||
features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]
|
features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]
|
||||||
|
|
||||||
|
@ -77,7 +94,8 @@ def straw_cleaning(sensor_data_files, provider):
|
||||||
def impute(df, method='zero'):
|
def impute(df, method='zero'):
|
||||||
|
|
||||||
def k_nearest(df): # TODO: if needed, implement k-nearest imputation / interpolation
|
def k_nearest(df): # TODO: if needed, implement k-nearest imputation / interpolation
|
||||||
pass
|
imputer = KNNImputer(n_neighbors=3)
|
||||||
|
return pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
|
||||||
|
|
||||||
return { # rest of the columns should be imputed with the selected method
|
return { # rest of the columns should be imputed with the selected method
|
||||||
'zero': df.fillna(0),
|
'zero': df.fillna(0),
|
||||||
|
|
|
@ -88,6 +88,9 @@ rapids_features <- function(sensor_data_files, time_segment, provider){
|
||||||
features <- call_features_of_type(calls_of_type, features_type, call_type, time_segment, requested_features)
|
features <- call_features_of_type(calls_of_type, features_type, call_type, time_segment, requested_features)
|
||||||
call_features <- merge(call_features, features, all=TRUE)
|
call_features <- merge(call_features, features, all=TRUE)
|
||||||
}
|
}
|
||||||
|
# TODO: why isn't the next line properly replacing na values with 0
|
||||||
call_features <- call_features %>% mutate_at(vars(contains("countmostfrequentcontact") | contains("distinctcontacts") | contains("count") | contains("sumduration") | contains("minduration") | contains("maxduration") | contains("meanduration") | contains("modeduration")), list( ~ replace_na(., 0)))
|
call_features <- call_features %>% mutate_at(vars(contains("countmostfrequentcontact") | contains("distinctcontacts") | contains("count") | contains("sumduration") | contains("minduration") | contains("maxduration") | contains("meanduration") | contains("modeduration")), list( ~ replace_na(., 0)))
|
||||||
|
# TODO: check if NaN values are filled
|
||||||
|
call_features[is.na(call_features)] = 0
|
||||||
return(call_features)
|
return(call_features)
|
||||||
}
|
}
|
|
@ -65,6 +65,9 @@ rapids_features <- function(sensor_data_files, time_segment, provider){
|
||||||
features <- message_features_of_type(messages_of_type, message_type, time_segment, requested_features)
|
features <- message_features_of_type(messages_of_type, message_type, time_segment, requested_features)
|
||||||
messages_features <- merge(messages_features, features, all=TRUE)
|
messages_features <- merge(messages_features, features, all=TRUE)
|
||||||
}
|
}
|
||||||
|
# TODO: why isn't the next line properly replacing na values with 0
|
||||||
messages_features <- messages_features %>% mutate_at(vars(contains("countmostfrequentcontact") | contains("distinctcontacts") | contains("count")), list( ~ replace_na(., 0)))
|
messages_features <- messages_features %>% mutate_at(vars(contains("countmostfrequentcontact") | contains("distinctcontacts") | contains("count")), list( ~ replace_na(., 0)))
|
||||||
|
# TODO: check if NaN values are filled
|
||||||
|
messages_features[is.na(messages_features)] = 0
|
||||||
return(messages_features)
|
return(messages_features)
|
||||||
}
|
}
|
Loading…
Reference in New Issue