diff --git a/src/features/all_cleaning_individual/straw/main.py b/src/features/all_cleaning_individual/straw/main.py
index 27d7ff80..6ac97cf7 100644
--- a/src/features/all_cleaning_individual/straw/main.py
+++ b/src/features/all_cleaning_individual/straw/main.py
@@ -14,6 +14,7 @@ from src.features import empatica_data_yield as edy
 pd.set_option('display.max_columns', 20)
 
 def straw_cleaning(sensor_data_files, provider):
+    # TODO (maybe): reorganize the script based on the overall
 
     features = pd.read_csv(sensor_data_files["sensor_data"][0])
 
@@ -45,6 +46,9 @@ def straw_cleaning(sensor_data_files, provider):
     # Drop rows where empatica data yield is less then given threshold
     if provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]:
         features = features[features["empatica_data_yield"] >= provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]].reset_index(drop=True)
+
+    if features.empty:
+        return features
 
     # (2.2) DO THE ROWS CONSIST OF ENOUGH NON-NAN VALUES?
     min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1]) # minimal not nan values in row
@@ -90,6 +94,7 @@ def straw_cleaning(sensor_data_files, provider):
                     col.startswith('phone_messages_rapids_') or
                     col.startswith('phone_screen_rapids_') or
                     col.startswith('phone_wifi_visible')]
+
     features[impute_zero] = impute(features[impute_zero], method="zero")
 
     ## (5) STANDARDIZATION
@@ -98,6 +103,7 @@ def straw_cleaning(sensor_data_files, provider):
 
     # (6) IMPUTATION: IMPUTE DATA WITH KNN METHOD
     impute_cols = [col for col in features.columns if col not in excluded_columns]
+    features.reset_index(drop=True, inplace=True)
     features[impute_cols] = impute(features[impute_cols], method="knn")
 
     # (7) REMOVE COLS WHERE VARIANCE IS 0
@@ -106,9 +112,11 @@ def straw_cleaning(sensor_data_files, provider):
     if provider["COLS_VAR_THRESHOLD"]:
         features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)
 
+    fe5 = features.copy()
+
     # (8) DROP HIGHLY CORRELATED FEATURES
     drop_corr_features = provider["DROP_HIGHLY_CORRELATED_FEATURES"]
-    if drop_corr_features["COMPUTE"] and features.shape[0] > 5: # If small amount of segments (rows) is present, do not execute correlation check
+    if drop_corr_features["COMPUTE"] and features.shape[0]: # If small amount of segments (rows) is present, do not execute correlation check
 
         numerical_cols = features.select_dtypes(include=np.number).columns.tolist()
 
@@ -126,15 +134,18 @@ def straw_cleaning(sensor_data_files, provider):
         if esm not in features:
             features[esm] = esm_cols[esm]
 
+    fe6 = features.copy()
+
     # (9) VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME
     if features.isna().any().any():
-        raise ValueError
+        raise ValueError("There are still some NaNs present in the dataframe. Please check for implementation errors.")
 
     return features
 
 def impute(df, method='zero'):
 
     def k_nearest(df):
+        pd.set_option('display.max_columns', None)
         imputer = KNNImputer(n_neighbors=3)
         return pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
 
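
Reviewer notes, appended below the patch. These are standalone sketches for context, not part of the commit.

(2.2) The hunk shows only the min_count computation. A common way to consume such a per-row threshold is pandas' dropna(thresh=...); this sketch stubs provider["ROWS_NAN_THRESHOLD"] with an example value, and the dropna call is an assumption about how the repo applies min_count:

    # Sketch: keep only rows that retain at least min_count non-NaN values.
    import math
    import numpy as np
    import pandas as pd

    provider = {"ROWS_NAN_THRESHOLD": 0.5}  # assumed example value
    features = pd.DataFrame({"f1": [1.0, np.nan, 3.0],
                             "f2": [np.nan, np.nan, 2.0],
                             "f3": [0.5, np.nan, 0.7]})

    min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1])  # = 2 here
    features = features.dropna(axis=0, thresh=min_count).reset_index(drop=True)  # drops the all-NaN middle row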
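
(6) The added features.reset_index(drop=True, inplace=True) matters because the impute() helper rebuilds a DataFrame around KNNImputer.fit_transform's ndarray output, which carries a fresh 0-based RangeIndex; assigning that result back into a frame whose index still has gaps from earlier row filtering aligns on labels and silently reintroduces NaNs, which step (9) would then trip over. A standalone demonstration with made-up columns:

    # Sketch: why a gappy index breaks label-aligned assignment of imputed values.
    import pandas as pd
    from sklearn.impute import KNNImputer

    df = pd.DataFrame({"a": [1.0, None, 3.0, 4.0],
                       "b": [1.0, 2.0, 3.0, 4.0]},
                      index=[0, 2, 3, 5])  # gaps left behind by row filtering

    imputed = pd.DataFrame(KNNImputer(n_neighbors=2).fit_transform(df),
                           columns=df.columns)  # fresh RangeIndex 0..3

    bad = df.copy()
    bad["a"] = imputed["a"]           # label-aligned: label 5 has no match -> NaN is back

    good = df.reset_index(drop=True)  # what the patch does before imputing
    good["a"] = imputed["a"]          # positions line up; no NaNs remain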
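
(8) The hunk only shows the guard around the correlation check. A hedged sketch of the upper-triangle pattern such a step typically uses; the function name and the 0.95 threshold are assumptions, not the repo's actual code or provider keys:

    # Sketch: drop one column from each pair whose |Pearson r| exceeds the
    # threshold, scanning only the upper triangle so each pair is seen once.
    import numpy as np
    import pandas as pd

    def drop_highly_correlated(df, threshold=0.95):
        corr = df.corr().abs()
        upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
        to_drop = [col for col in upper.columns if (upper[col] > threshold).any()]
        return df.drop(columns=to_drop)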