From 3f7ec80c18b1237c49dfdd9a6736cdf518e0b7fb Mon Sep 17 00:00:00 2001 From: Primoz Date: Wed, 31 Aug 2022 10:18:50 +0000 Subject: [PATCH] Preparation a) phone_calls 0 imputation b) remove rows with NaN target --- .../all_cleaning_individual/straw/main.py | 26 ++++++++++++++++--- src/features/phone_calls/rapids/main.R | 3 +++ src/features/phone_messages/rapids/main.R | 3 +++ 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/features/all_cleaning_individual/straw/main.py b/src/features/all_cleaning_individual/straw/main.py index 6ba3ba5e..619af4a2 100644 --- a/src/features/all_cleaning_individual/straw/main.py +++ b/src/features/all_cleaning_individual/straw/main.py @@ -1,15 +1,34 @@ import pandas as pd import numpy as np import math, sys +from sklearn.impute import KNNImputer -def straw_cleaning(sensor_data_files, provider): +def straw_cleaning(sensor_data_files, provider, target=None): features = pd.read_csv(sensor_data_files["sensor_data"][0]) + esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target columns + + # Filter out all rows that do not have the target column available + # get target from config or function parameter + if target is None: + features = features[features[esm_cols[0]].notna()] + else: + features = features[features['phone_esm_straw_' + target].notna()] + # TODO: reorder the cleaning steps so it makes sense for the analysis # TODO: add conditions that differentiate cleaning steps for standardized and nonstandardized features, for this # the snakemake rules will also have to come with additional parameter (in rules/features.smk) + # TODO: impute the rows where the participants have at least 2 rows (2 time segments) - error prevention (has to be tested) + # TODO: because of different imputation logic (e.g., the phone_data_yield parameter for phone features) the imputation has to + # be planned accordingly. 
Should the phone features first be imputed with 0 and only then general kNN imputation is executed + # i.e., on the rows that are missing when E4 and phone features availability is not synchronized. CHECK phone_data_yield feat. + # A lot of imputation types/levels (1) imputation related to feature's content (2) imputation related to phone / empatica + # structural specifics (3) general imputation which is needed when types of features desynchronization is present (row is not full) + # because of the lack of the availability. Secondly, there's a high importance that features data frame is checked if any NaN + # values still exist. + # Impute selected features event impute_phone_features = provider["IMPUTE_PHONE_SELECTED_EVENT_FEATURES"] if impute_phone_features["COMPUTE"]: @@ -39,8 +58,6 @@ def straw_cleaning(sensor_data_files, provider): if provider["DATA_YIELD_RATIO_THRESHOLD"]: features = features[features[data_yield_column] >= provider["DATA_YIELD_RATIO_THRESHOLD"]] - esm_cols = features.loc[:, features.columns.str.startswith('phone_esm')] # For later preservation of esm_cols - # Remove cols if threshold of NaN values is passed features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]] @@ -77,7 +94,8 @@ def straw_cleaning(sensor_data_files, provider): def impute(df, method='zero'): def k_nearest(df): # TODO: if needed, implement k-nearest imputation / interpolation - pass + imputer = KNNImputer(n_neighbors=3) + return pd.DataFrame(imputer.fit_transform(df), columns=df.columns) return { # rest of the columns should be imputed with the selected method 'zero': df.fillna(0), diff --git a/src/features/phone_calls/rapids/main.R b/src/features/phone_calls/rapids/main.R index d793f706..7de288bb 100644 --- a/src/features/phone_calls/rapids/main.R +++ b/src/features/phone_calls/rapids/main.R @@ -88,6 +88,9 @@ rapids_features <- function(sensor_data_files, time_segment, provider){ features <- call_features_of_type(calls_of_type, 
features_type, call_type, time_segment, requested_features) call_features <- merge(call_features, features, all=TRUE) } + # TODO: why isn't the next line properly replacing na values with 0 call_features <- call_features %>% mutate_at(vars(contains("countmostfrequentcontact") | contains("distinctcontacts") | contains("count") | contains("sumduration") | contains("minduration") | contains("maxduration") | contains("meanduration") | contains("modeduration")), list( ~ replace_na(., 0))) + # TODO: check if NaN values are filled + call_features[is.na(call_features)] = 0 return(call_features) } \ No newline at end of file diff --git a/src/features/phone_messages/rapids/main.R b/src/features/phone_messages/rapids/main.R index b92769fd..65907ae2 100644 --- a/src/features/phone_messages/rapids/main.R +++ b/src/features/phone_messages/rapids/main.R @@ -65,6 +65,9 @@ rapids_features <- function(sensor_data_files, time_segment, provider){ features <- message_features_of_type(messages_of_type, message_type, time_segment, requested_features) messages_features <- merge(messages_features, features, all=TRUE) } + # TODO: why isn't the next line properly replacing na values with 0 messages_features <- messages_features %>% mutate_at(vars(contains("countmostfrequentcontact") | contains("distinctcontacts") | contains("count")), list( ~ replace_na(., 0))) + # TODO: check if NaN values are filled + messages_features[is.na(messages_features)] = 0 return(messages_features) } \ No newline at end of file