From 8af4ef11dc711dffd86c71c99b15e9451aa94ecd Mon Sep 17 00:00:00 2001 From: Primoz Date: Wed, 28 Sep 2022 10:02:47 +0000 Subject: [PATCH] Contextual imputation by feature type. --- config.yaml | 6 +- .../all_cleaning_individual/straw/main.py | 58 ++++++++++++------- 2 files changed, 41 insertions(+), 23 deletions(-) diff --git a/config.yaml b/config.yaml index 1fe1a2a3..19ad8ad1 100644 --- a/config.yaml +++ b/config.yaml @@ -3,7 +3,7 @@ ######################################################################################################################## # See https://www.rapids.science/latest/setup/configuration/#participant-files -PIDS: ['p01', 'p02'] #['p031', 'p032', 'p033', 'p034', 'p035', 'p036', 'p037', 'p038', 'p039', 'p040', 'p042', 'p043', 'p044', 'p045', 'p046', 'p049', 'p050', 'p052', 'p053', 'p054', 'p055', 'p057', 'p058', 'p059', 'p060', 'p061', 'p062', 'p064', 'p067', 'p068', 'p069', 'p070', 'p071', 'p072', 'p073', 'p074', 'p075', 'p076', 'p077', 'p078', 'p079', 'p080', 'p081', 'p082', 'p083', 'p084', 'p085', 'p086', 'p088', 'p089', 'p090', 'p091', 'p092', 'p093', 'p106', 'p107'] +PIDS: ['p01'] #['p031', 'p032', 'p033', 'p034', 'p035', 'p036', 'p037', 'p038', 'p039', 'p040', 'p042', 'p043', 'p044', 'p045', 'p046', 'p049', 'p050', 'p052', 'p053', 'p054', 'p055', 'p057', 'p058', 'p059', 'p060', 'p061', 'p062', 'p064', 'p067', 'p068', 'p069', 'p070', 'p071', 'p072', 'p073', 'p074', 'p075', 'p076', 'p077', 'p078', 'p079', 'p080', 'p081', 'p082', 'p083', 'p084', 'p085', 'p086', 'p088', 'p089', 'p090', 'p091', 'p092', 'p093', 'p106', 'p107'] # See https://www.rapids.science/latest/setup/configuration/#automatic-creation-of-participant-files CREATE_PARTICIPANT_FILES: @@ -159,7 +159,7 @@ PHONE_BLUETOOTH: CONTAINER: bluetooth PROVIDERS: RAPIDS: - COMPUTE: True + COMPUTE: False FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] SRC_SCRIPT: src/features/phone_bluetooth/rapids/main.R @@ -668,7 +668,7 @@ ALL_CLEANING_INDIVIDUAL: COMPUTE: False TYPE: zero # options: zero, mean, median or k-nearest MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33 - COLS_NAN_THRESHOLD: 0.9 # set to 1 remove only columns that contains all NaN + COLS_NAN_THRESHOLD: 1 # set to 1 remove only columns that contains all NaN COLS_VAR_THRESHOLD: True ROWS_NAN_THRESHOLD: 1 # set to 1 to disable PHONE_DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES diff --git a/src/features/all_cleaning_individual/straw/main.py b/src/features/all_cleaning_individual/straw/main.py index 6a1f7402..47e142e5 100644 --- a/src/features/all_cleaning_individual/straw/main.py +++ b/src/features/all_cleaning_individual/straw/main.py @@ -78,29 +78,48 @@ def straw_cleaning(sensor_data_files, provider): if esm not in features: features[esm] = esm_cols[esm] - graph_bf_af(features, "before_knn") + # (4) CONTEXTUAL IMPUTATION - ## (4) STANDARDIZATION + graph_bf_af(features, "contextual_imputation_before") + + # Impute selected phone features with a high number + impute_w_hn = [col for col in features.columns if \ + "timeoffirstuse" in col or + "timeoflastuse" in col or + "timefirstcall" in col or + "timelastcall" in col or + "timefirstmessages" in col or + "timelastmessages" in col or + "firstuseafter" in col] + features[impute_w_hn] = impute(features[impute_w_hn], method="high_number") + + # Impute phone locations with median + impute_locations = [col for col in features.columns if "phone_locations_" in col] + features[impute_locations] = impute(features[impute_locations], method="median") + + # Impute remaining phone features with 0 + impute_rest = [col for col in features.columns if "phone_" in col] + features[impute_locations] = impute(features[impute_locations], method="zero") + + graph_bf_af(features, "contextual_imputation_after") + + ## (5) STANDARDIZATION if provider["STANDARDIZATION"]: features.loc[:, ~features.columns.isin(excluded_columns)] = StandardScaler().fit_transform(features.loc[:, ~features.columns.isin(excluded_columns)]) - # (5) IMPUTATION: IMPUTE DATA WITH KNN METHOD (TODO: for now only kNN) - # - no other input restriction for this method except that rows are full enough and have reasonably high quality as assessed by data yield + # (6) IMPUTATION: IMPUTE DATA WITH KNN METHOD impute_cols = [col for col in features.columns if col not in excluded_columns] features[impute_cols] = impute(features[impute_cols], method="knn") - graph_bf_af(features, "after_knn") - # (6) REMOVE COLS WHERE VARIANCE IS 0 + # (7) REMOVE COLS WHERE VARIANCE IS 0 esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] if provider["COLS_VAR_THRESHOLD"]: features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True) - graph_bf_af(features, "before_corr") - - # (7) DROP HIGHLY CORRELATED FEATURES + # (8) DROP HIGHLY CORRELATED FEATURES drop_corr_features = provider["DROP_HIGHLY_CORRELATED_FEATURES"] - if drop_corr_features["COMPUTE"] and features.shape[0] >= 3: + if drop_corr_features["COMPUTE"] and features.shape[0] > 5: # If small amount of segments (rows) is present, do not execute correlation check numerical_cols = features.select_dtypes(include=np.number).columns.tolist() @@ -113,15 +132,12 @@ def straw_cleaning(sensor_data_files, provider): features.drop(to_drop, axis=1, inplace=True) - graph_bf_af(features, "after_corr") - # Preserve esm cols if deleted (has to come after drop cols operations) for esm in esm_cols: if esm not in features: features[esm] = esm_cols[esm] - - # (8) VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME + # (9) VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME if features.isna().any().any(): raise ValueError @@ -129,12 +145,6 @@ def straw_cleaning(sensor_data_files, provider): return features -def graph_bf_af(features, phase_name): - sns.set(rc={"figure.figsize":(16, 8)}) - print(features) - sns.heatmap(features.isna(), cbar=False) #features.select_dtypes(include=np.number) - plt.savefig(f'features_nans_{phase_name}.png', bbox_inches='tight') - def impute(df, method='zero'): def k_nearest(df): @@ -143,8 +153,16 @@ def impute(df, method='zero'): return { 'zero': df.fillna(0), + 'high_number': df.fillna(1000000), 'mean': df.fillna(df.mean()), 'median': df.fillna(df.median()), 'knn': k_nearest(df) }[method] + +def graph_bf_af(features, phase_name): + sns.set(rc={"figure.figsize":(16, 8)}) + print(features) + sns.heatmap(features.isna(), cbar=False) #features.select_dtypes(include=np.number) + plt.savefig(f'features_nans_{phase_name}.png', bbox_inches='tight') +