Error fix: individual script - handle participants with missing data.
parent
53f6cc60d5
commit
437459648f
|
@ -14,6 +14,7 @@ from src.features import empatica_data_yield as edy
|
||||||
pd.set_option('display.max_columns', 20)
|
pd.set_option('display.max_columns', 20)
|
||||||
|
|
||||||
def straw_cleaning(sensor_data_files, provider):
|
def straw_cleaning(sensor_data_files, provider):
|
||||||
|
# TODO (maybe): reorganize the script based on the overall
|
||||||
|
|
||||||
features = pd.read_csv(sensor_data_files["sensor_data"][0])
|
features = pd.read_csv(sensor_data_files["sensor_data"][0])
|
||||||
|
|
||||||
|
@ -46,6 +47,9 @@ def straw_cleaning(sensor_data_files, provider):
|
||||||
if provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]:
|
if provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]:
|
||||||
features = features[features["empatica_data_yield"] >= provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]].reset_index(drop=True)
|
features = features[features["empatica_data_yield"] >= provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]].reset_index(drop=True)
|
||||||
|
|
||||||
|
if features.empty:
|
||||||
|
return features
|
||||||
|
|
||||||
# (2.2) DO THE ROWS CONSIST OF ENOUGH NON-NAN VALUES?
|
# (2.2) DO THE ROWS CONSIST OF ENOUGH NON-NAN VALUES?
|
||||||
min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1]) # minimal not nan values in row
|
min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1]) # minimal not nan values in row
|
||||||
features.dropna(axis=0, thresh=min_count, inplace=True) # Thresh => at least this many not-nans
|
features.dropna(axis=0, thresh=min_count, inplace=True) # Thresh => at least this many not-nans
|
||||||
|
@ -90,6 +94,7 @@ def straw_cleaning(sensor_data_files, provider):
|
||||||
col.startswith('phone_messages_rapids_') or
|
col.startswith('phone_messages_rapids_') or
|
||||||
col.startswith('phone_screen_rapids_') or
|
col.startswith('phone_screen_rapids_') or
|
||||||
col.startswith('phone_wifi_visible')]
|
col.startswith('phone_wifi_visible')]
|
||||||
|
|
||||||
features[impute_zero] = impute(features[impute_zero], method="zero")
|
features[impute_zero] = impute(features[impute_zero], method="zero")
|
||||||
|
|
||||||
## (5) STANDARDIZATION
|
## (5) STANDARDIZATION
|
||||||
|
@ -98,6 +103,7 @@ def straw_cleaning(sensor_data_files, provider):
|
||||||
|
|
||||||
# (6) IMPUTATION: IMPUTE DATA WITH KNN METHOD
|
# (6) IMPUTATION: IMPUTE DATA WITH KNN METHOD
|
||||||
impute_cols = [col for col in features.columns if col not in excluded_columns]
|
impute_cols = [col for col in features.columns if col not in excluded_columns]
|
||||||
|
features.reset_index(drop=True, inplace=True)
|
||||||
features[impute_cols] = impute(features[impute_cols], method="knn")
|
features[impute_cols] = impute(features[impute_cols], method="knn")
|
||||||
|
|
||||||
# (7) REMOVE COLS WHERE VARIANCE IS 0
|
# (7) REMOVE COLS WHERE VARIANCE IS 0
|
||||||
|
@ -106,9 +112,11 @@ def straw_cleaning(sensor_data_files, provider):
|
||||||
if provider["COLS_VAR_THRESHOLD"]:
|
if provider["COLS_VAR_THRESHOLD"]:
|
||||||
features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)
|
features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)
|
||||||
|
|
||||||
|
fe5 = features.copy()
|
||||||
|
|
||||||
# (8) DROP HIGHLY CORRELATED FEATURES
|
# (8) DROP HIGHLY CORRELATED FEATURES
|
||||||
drop_corr_features = provider["DROP_HIGHLY_CORRELATED_FEATURES"]
|
drop_corr_features = provider["DROP_HIGHLY_CORRELATED_FEATURES"]
|
||||||
if drop_corr_features["COMPUTE"] and features.shape[0] > 5: # If small amount of segments (rows) is present, do not execute correlation check
|
if drop_corr_features["COMPUTE"] and features.shape[0]: # If small amount of segments (rows) is present, do not execute correlation check
|
||||||
|
|
||||||
numerical_cols = features.select_dtypes(include=np.number).columns.tolist()
|
numerical_cols = features.select_dtypes(include=np.number).columns.tolist()
|
||||||
|
|
||||||
|
@ -126,15 +134,18 @@ def straw_cleaning(sensor_data_files, provider):
|
||||||
if esm not in features:
|
if esm not in features:
|
||||||
features[esm] = esm_cols[esm]
|
features[esm] = esm_cols[esm]
|
||||||
|
|
||||||
|
fe6 = features.copy()
|
||||||
|
|
||||||
# (9) VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME
|
# (9) VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME
|
||||||
if features.isna().any().any():
|
if features.isna().any().any():
|
||||||
raise ValueError
|
raise ValueError("There are still some NaNs present in the dataframe. Please check for implementation errors.")
|
||||||
|
|
||||||
return features
|
return features
|
||||||
|
|
||||||
def impute(df, method='zero'):
|
def impute(df, method='zero'):
|
||||||
|
|
||||||
def k_nearest(df):
|
def k_nearest(df):
|
||||||
|
pd.set_option('display.max_columns', None)
|
||||||
imputer = KNNImputer(n_neighbors=3)
|
imputer = KNNImputer(n_neighbors=3)
|
||||||
return pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
|
return pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue