Cleaning script for individuals: corrections and comments.
parent
a4f0d056a0
commit
68fd69dada
12
config.yaml
12
config.yaml
|
@ -662,18 +662,14 @@ ALL_CLEANING_INDIVIDUAL:
|
||||||
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
|
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
|
||||||
CORR_THRESHOLD: 0.95
|
CORR_THRESHOLD: 0.95
|
||||||
SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R
|
SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R
|
||||||
STRAW: # currently the same as RAPIDS provider with a change in selecting the imputation type
|
STRAW:
|
||||||
COMPUTE: True
|
COMPUTE: True
|
||||||
IMPUTE_PHONE_SELECTED_EVENT_FEATURES:
|
|
||||||
COMPUTE: False
|
|
||||||
TYPE: zero # options: zero, mean, median or k-nearest
|
|
||||||
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
|
|
||||||
COLS_NAN_THRESHOLD: 1 # set to 1 remove only columns that contains all NaN
|
|
||||||
COLS_VAR_THRESHOLD: True
|
|
||||||
ROWS_NAN_THRESHOLD: 1 # set to 1 to disable
|
|
||||||
PHONE_DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
|
PHONE_DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
|
||||||
PHONE_DATA_YIELD_RATIO_THRESHOLD: 0.4 # set to 0 to disable
|
PHONE_DATA_YIELD_RATIO_THRESHOLD: 0.4 # set to 0 to disable
|
||||||
EMPATICA_DATA_YIELD_RATIO_THRESHOLD: 0.25 # set to 0 to disable
|
EMPATICA_DATA_YIELD_RATIO_THRESHOLD: 0.25 # set to 0 to disable
|
||||||
|
ROWS_NAN_THRESHOLD: 0.3 # set to 1 to disable
|
||||||
|
COLS_NAN_THRESHOLD: 0.9 # set to 1 to remove only columns that are entirely (100%) NaN
|
||||||
|
COLS_VAR_THRESHOLD: True
|
||||||
DROP_HIGHLY_CORRELATED_FEATURES:
|
DROP_HIGHLY_CORRELATED_FEATURES:
|
||||||
COMPUTE: True
|
COMPUTE: True
|
||||||
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
|
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
|
||||||
|
|
|
@ -34,39 +34,20 @@ def straw_cleaning(sensor_data_files, provider):
|
||||||
features = edy.calculate_empatica_data_yield(features)
|
features = edy.calculate_empatica_data_yield(features)
|
||||||
|
|
||||||
if not phone_data_yield_column in features.columns and not "empatica_data_yield" in features.columns:
|
if not phone_data_yield_column in features.columns and not "empatica_data_yield" in features.columns:
|
||||||
raise KeyError(f"RAPIDS provider needs to clean the selected event features based on {phone_data_yield_column} column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyielded{data_yield_unit}' in [FEATURES].")
|
raise KeyError(f"RAPIDS provider needs to clean the selected event features based on {phone_data_yield_column} and empatica_data_yield columns.
|
||||||
|
For phone data yield, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyielded{data_yield_unit}' in [FEATURES].")
|
||||||
|
|
||||||
|
# Drop rows where phone data yield is less than the given threshold
|
||||||
if provider["PHONE_DATA_YIELD_RATIO_THRESHOLD"]:
|
if provider["PHONE_DATA_YIELD_RATIO_THRESHOLD"]:
|
||||||
features = features[features[phone_data_yield_column] >= provider["PHONE_DATA_YIELD_RATIO_THRESHOLD"]].reset_index(drop=True)
|
features = features[features[phone_data_yield_column] >= provider["PHONE_DATA_YIELD_RATIO_THRESHOLD"]].reset_index(drop=True)
|
||||||
|
|
||||||
|
# Drop rows where empatica data yield is less than the given threshold
|
||||||
if provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]:
|
if provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]:
|
||||||
features = features[features["empatica_data_yield"] >= provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]].reset_index(drop=True)
|
features = features[features["empatica_data_yield"] >= provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]].reset_index(drop=True)
|
||||||
|
|
||||||
# ---> imputation ??
|
# (2.2) DO THE ROWS CONSIST OF ENOUGH NON-NAN VALUES?
|
||||||
|
|
||||||
# impute_phone_features = provider["IMPUTE_PHONE_SELECTED_EVENT_FEATURES"]
|
|
||||||
|
|
||||||
# if True: #impute_phone_features["COMPUTE"]:
|
|
||||||
# if not 'phone_data_yield_rapids_ratiovalidyieldedminutes' in features.columns:
|
|
||||||
# raise KeyError("RAPIDS provider needs to impute the selected event features based on phone_data_yield_rapids_ratiovalidyieldedminutes column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedminutes' in [FEATURES].")
|
|
||||||
|
|
||||||
# phone_cols = [col for col in features if \
|
|
||||||
# col.startswith('phone_applications_foreground_rapids_') or
|
|
||||||
# col.startswith('phone_battery_rapids_') or
|
|
||||||
# col.startswith('phone_calls_rapids_') or
|
|
||||||
# col.startswith('phone_keyboard_rapids_') or
|
|
||||||
# col.startswith('phone_messages_rapids_') or
|
|
||||||
# col.startswith('phone_screen_rapids_') or
|
|
||||||
# col.startswith('phone_wifi_')]
|
|
||||||
|
|
||||||
# mask = features['phone_data_yield_rapids_ratiovalidyieldedminutes'] > impute_phone_features['MIN_DATA_YIELDED_MINUTES_TO_IMPUTE']
|
|
||||||
# features.loc[mask, phone_cols] = impute(features[mask][phone_cols], method=impute_phone_features["TYPE"].lower())
|
|
||||||
|
|
||||||
# print(features[features['phone_data_yield_rapids_ratiovalidyieldedminutes'] > impute_phone_features['MIN_DATA_YIELDED_MINUTES_TO_IMPUTE']][phone_cols])
|
|
||||||
|
|
||||||
# (2.2) (optional) DOES ROW CONSIST OF ENOUGH NON-NAN VALUES? Possible some of these examples could still pass previous condition but not this one?
|
|
||||||
min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1]) # minimal not nan values in row
|
min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1]) # minimal not nan values in row
|
||||||
features.dropna(axis=0, thresh=min_count, inplace=True)
|
features.dropna(axis=0, thresh=min_count, inplace=True) # Thresh => at least this many not-nans
|
||||||
|
|
||||||
# (3) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all NaN columns must be preserved - this solution now drops columns with all NaN rows)
|
# (3) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all NaN columns must be preserved - this solution now drops columns with all NaN rows)
|
||||||
esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target (esm) columns
|
esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target (esm) columns
|
||||||
|
@ -79,7 +60,6 @@ def straw_cleaning(sensor_data_files, provider):
|
||||||
features[esm] = esm_cols[esm]
|
features[esm] = esm_cols[esm]
|
||||||
|
|
||||||
# (4) CONTEXTUAL IMPUTATION
|
# (4) CONTEXTUAL IMPUTATION
|
||||||
|
|
||||||
graph_bf_af(features, "contextual_imputation_before")
|
graph_bf_af(features, "contextual_imputation_before")
|
||||||
|
|
||||||
# Impute selected phone features with a high number
|
# Impute selected phone features with a high number
|
||||||
|
|
Loading…
Reference in New Issue