diff --git a/config.yaml b/config.yaml index bcd4de4e..7a0a6823 100644 --- a/config.yaml +++ b/config.yaml @@ -672,29 +672,29 @@ ALL_CLEANING_INDIVIDUAL: RAPIDS: COMPUTE: True IMPUTE_SELECTED_EVENT_FEATURES: - COMPUTE: True + COMPUTE: False MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33 COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable COLS_VAR_THRESHOLD: True ROWS_NAN_THRESHOLD: 1 # set to 1 to disable DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES - DATA_YIELD_RATIO_THRESHOLD: 0.3 # set to 0 to disable + DATA_YIELD_RATIO_THRESHOLD: 0 # set to 0 to disable DROP_HIGHLY_CORRELATED_FEATURES: COMPUTE: True MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5 CORR_THRESHOLD: 0.95 SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R - STRAW: # currently the same as RAPIDS provider with a change in selecting the imputation type + is not considering MIN_OVERLAP_FOR_CORR_THRESHOLD param and does not have special treatment for phone_esm (see RAPIDS script) - COMPUTE: True + STRAW: # currently the same as RAPIDS provider with a change in selecting the imputation type + COMPUTE: False IMPUTE_PHONE_SELECTED_EVENT_FEATURES: - COMPUTE: True + COMPUTE: False TYPE: median # options: zero, mean, median or k-nearest MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33 COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable COLS_VAR_THRESHOLD: True ROWS_NAN_THRESHOLD: 0 # set to 1 to disable DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES - DATA_YIELD_RATIO_THRESHOLD: 0.3 # set to 0 to disable + DATA_YIELD_RATIO_THRESHOLD: 0 # set to 0 to disable DROP_HIGHLY_CORRELATED_FEATURES: COMPUTE: True MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5 @@ -707,29 +707,29 @@ ALL_CLEANING_OVERALL: RAPIDS: COMPUTE: False IMPUTE_SELECTED_EVENT_FEATURES: - COMPUTE: True + COMPUTE: False MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33 COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable COLS_VAR_THRESHOLD: True ROWS_NAN_THRESHOLD: 1 # set to 1 to 
disable DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES - DATA_YIELD_RATIO_THRESHOLD: 0.3 # set to 0 to disable + DATA_YIELD_RATIO_THRESHOLD: 0 # set to 0 to disable DROP_HIGHLY_CORRELATED_FEATURES: COMPUTE: True MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5 CORR_THRESHOLD: 0.95 SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R - STRAW: # currently the same as RAPIDS provider with a change in selecting the imputation type + is not considering MIN_OVERLAP_FOR_CORR_THRESHOLD param - COMPUTE: True + STRAW: # currently the same as RAPIDS provider with a change in selecting the imputation type + COMPUTE: False IMPUTE_PHONE_SELECTED_EVENT_FEATURES: - COMPUTE: True + COMPUTE: False TYPE: median # options: zero, mean, median or k-nearest MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33 COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable COLS_VAR_THRESHOLD: True ROWS_NAN_THRESHOLD: 0 # set to 1 to disable DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES - DATA_YIELD_RATIO_THRESHOLD: 0.3 # set to 0 to disable + DATA_YIELD_RATIO_THRESHOLD: 0 # set to 0 to disable DROP_HIGHLY_CORRELATED_FEATURES: COMPUTE: True MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5 diff --git a/src/features/all_cleaning_overall/straw/main.py b/src/features/all_cleaning_overall/straw/main.py index a9a4574c..f82a355d 100644 --- a/src/features/all_cleaning_overall/straw/main.py +++ b/src/features/all_cleaning_overall/straw/main.py @@ -27,7 +27,7 @@ def straw_cleaning(sensor_data_files, provider): col.startswith('phone_wifi_')] mask = features['phone_data_yield_rapids_ratiovalidyieldedminutes'] > impute_phone_features['MIN_DATA_YIELDED_MINUTES_TO_IMPUTE'] - features.loc[mask, phone_cols] = impute(features[mask][phone_cols], method=impute_phone_features["TYPE"]) + features.loc[mask, phone_cols] = impute(features[mask][phone_cols], method=impute_phone_features["TYPE"].lower()) # Drop rows with the value of 
data_yield_column less than data_yield_ratio_threshold data_yield_unit = provider["DATA_YIELD_FEATURE"].split("_")[3].lower() @@ -38,13 +38,20 @@ def straw_cleaning(sensor_data_files, provider): features = features[features[data_yield_column] >= provider["DATA_YIELD_RATIO_THRESHOLD"]] - # Remove cols if threshold of NaN values is passed - features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]] + esm_cols = features.loc[:, features.columns.str.startswith('phone_esm')] # For later preservation of esm_cols + # Remove cols if threshold of NaN values is passed + features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]] + # Remove cols where variance is 0 if provider["COLS_VAR_THRESHOLD"]: features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True) + # Preserve esm cols if deleted (has to come after drop cols operations) + for esm in esm_cols: + if esm not in features: + features[esm] = esm_cols[esm] + # Drop highly correlated features - To-Do še en thershold var, ki je v config + kako se tretirajo NaNs? 
drop_corr_features = provider["DROP_HIGHLY_CORRELATED_FEATURES"] if drop_corr_features["COMPUTE"]: @@ -61,14 +68,14 @@ def straw_cleaning(sensor_data_files, provider): features.drop(to_drop, axis=1, inplace=True) # Remove rows if threshold of NaN values is passed - min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1]) # min not nan values in row + min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1]) # minimum number of non-NaN values a row needs in order to be kept features.dropna(axis=0, thresh=min_count, inplace=True) return features def impute(df, method='zero'): - def k_nearest(df): # TODO: if needed implement k-nearest imputation / interpolation + def k_nearest(df): # TODO: if needed, implement k-nearest imputation / interpolation (currently a stub that returns None) pass return { # rest of the columns should be imputed with the selected method @@ -77,5 +84,4 @@ def impute(df, method='zero'): 'median': df.fillna(df.median()), 'k-nearest': k_nearest(df) }[method] -