E4 data yield corrections. Changes in overall cs - standardization.

notes
Primoz 2022-10-05 14:16:55 +00:00
parent 437459648f
commit a34412a18d
3 changed files with 68 additions and 53 deletions

View File

@@ -680,7 +680,7 @@ ALL_CLEANING_INDIVIDUAL:
 ALL_CLEANING_OVERALL:
   PROVIDERS:
     RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
       IMPUTE_SELECTED_EVENT_FEATURES:
         COMPUTE: False
         MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
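
The flag flipped here is what the cleaning script below receives as its provider settings. A minimal sketch of reading this section straight from the YAML config and gating on it; the config.yaml path and key nesting are assumed from the keys shown in this hunk, not taken from the repository's actual pipeline wiring:

import yaml

# Load the config and check whether the overall cleaning provider is enabled
# (file path and key nesting assumed from the hunk above).
with open("config.yaml") as f:
    config = yaml.safe_load(f)

provider = config["ALL_CLEANING_OVERALL"]["PROVIDERS"]["RAPIDS"]
if not provider["COMPUTE"]:
    print("ALL_CLEANING_OVERALL/RAPIDS is disabled; overall cleaning is skipped.")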

View File

@@ -1,10 +1,9 @@
 import pandas as pd
 import numpy as np
-import math, sys, random
-import yaml
+import math, sys, random, warnings, yaml
 from sklearn.impute import KNNImputer
-from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import StandardScaler, minmax_scale
 import matplotlib.pyplot as plt
 import seaborn as sns
@@ -31,7 +30,7 @@ def straw_cleaning(sensor_data_files, provider):
     graph_bf_af(features, "2target_rows_after")

-    # (2.1) QUALITY CHECK (DATA YIELD COLUMN) deletes the rows where E4 or phone data is low quality
+    # (2) QUALITY CHECK (DATA YIELD COLUMN) drops the rows where E4 or phone data is low quality
     phone_data_yield_unit = provider["PHONE_DATA_YIELD_FEATURE"].split("_")[3].lower()
     phone_data_yield_column = "phone_data_yield_rapids_ratiovalidyielded" + phone_data_yield_unit
@@ -43,42 +42,26 @@ def straw_cleaning(sensor_data_files, provider):
     hist = features[["empatica_data_yield", phone_data_yield_column]].hist()
     plt.legend()
     plt.savefig(f'phone_E4_histogram.png', bbox_inches='tight')

     # Drop rows where phone data yield is less than the given threshold
     if provider["PHONE_DATA_YIELD_RATIO_THRESHOLD"]:
         print("\nThreshold:", provider["PHONE_DATA_YIELD_RATIO_THRESHOLD"])
         print("Phone features data yield stats:", features[phone_data_yield_column].describe(), "\n")
-        print(features[phone_data_yield_column].sort_values())
+        # print(features[phone_data_yield_column].sort_values())
         hist = features[phone_data_yield_column].hist(bins=5)
+        plt.close()
         features = features[features[phone_data_yield_column] >= provider["PHONE_DATA_YIELD_RATIO_THRESHOLD"]].reset_index(drop=True)

     # Drop rows where empatica data yield is less than the given threshold
     if provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]:
         print("\nThreshold:", provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"])
         print("E4 features data yield stats:", features["empatica_data_yield"].describe(), "\n")
-        print(features["empatica_data_yield"].sort_values())
+        # print(features["empatica_data_yield"].sort_values())
         features = features[features["empatica_data_yield"] >= provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]].reset_index(drop=True)

     graph_bf_af(features, "3data_yield_drop_rows")

-    # (2.2) DO THE ROWS CONSIST OF ENOUGH NON-NAN VALUES?
-    min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1]) # minimal not nan values in row
-    features.dropna(axis=0, thresh=min_count, inplace=True) # Thresh => at least this many not-nans
-    graph_bf_af(features, "4too_much_nans_rows")
-
-    # (3) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all NaN columns must be preserved - this solution now drops columns with all NaN rows)
-    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target (esm) columns
-    features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]
-    graph_bf_af(features, "5too_much_nans_cols")
-
-    # Preserve esm cols if deleted (has to come after drop cols operations)
-    for esm in esm_cols:
-        if esm not in features:
-            features[esm] = esm_cols[esm]
-
-    # (4) CONTEXTUAL IMPUTATION
+    # (3) CONTEXTUAL IMPUTATION
     # Impute selected phone features with a high number
     impute_w_hn = [col for col in features.columns if \
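
The two threshold blocks in this hunk keep only the time segments whose phone and E4 data yields reach the configured ratios. A minimal, self-contained sketch of that filter on made-up data (the phone column name is taken from the hunk; thresholds and values are illustrative):

import pandas as pd

# Illustrative data: one row per time segment
features = pd.DataFrame({
    "phone_data_yield_rapids_ratiovalidyieldedminutes": [0.9, 0.2, 0.7],
    "empatica_data_yield": [0.8, 0.9, 0.1],
    "some_feature": [1.0, 2.0, 3.0],
})
phone_threshold, e4_threshold = 0.5, 0.5

# Same filtering pattern as in the diff: drop low-yield rows, then reindex
features = features[features["phone_data_yield_rapids_ratiovalidyieldedminutes"] >= phone_threshold].reset_index(drop=True)
features = features[features["empatica_data_yield"] >= e4_threshold].reset_index(drop=True)
print(features)  # only the first segment survives both filters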
@@ -91,7 +74,7 @@ def straw_cleaning(sensor_data_files, provider):
                         "timelastmessages" in col]
     features[impute_w_hn] = impute(features[impute_w_hn], method="high_number")
-    graph_bf_af(features, "6high_number_imp")
+    graph_bf_af(features, "4high_number_imp")

     # Impute special case (mostcommonactivity) and (homelabel)
     impute_w_sn = [col for col in features.columns if "mostcommonactivity" in col]
@@ -112,7 +95,7 @@ def straw_cleaning(sensor_data_files, provider):
                       col.startswith('phone_wifi_visible')]
     features[impute_zero] = impute(features[impute_zero], method="zero")
-    graph_bf_af(features, "7zero_imp")
+    graph_bf_af(features, "5zero_imp")

     # Impute phone locations with median - should this rather be imputed at kNN step??
     # impute_locations = [col for col in features.columns if "phone_locations_" in col]
@@ -127,30 +110,53 @@ def straw_cleaning(sensor_data_files, provider):
     # graph_bf_af(features[impute_locations], "phoneloc_before")
     # features[impute_locations] = features[impute_locations + ["pid"]].groupby("pid").transform(lambda x: x.fillna(x.median()))[impute_locations]

-    # (5) STANDARDIZATION
-    if provider["STANDARDIZATION"]:
-        features.loc[:, ~features.columns.isin(excluded_columns + ["pid"])] = \
-            features.loc[:, ~features.columns.isin(excluded_columns)].groupby('pid').transform(lambda x: 0 if (x.std() == 0) else (x - x.mean()) / x.std())
-    graph_bf_af(features, "8standardization")
-
-    # (6) IMPUTATION: IMPUTE DATA WITH KNN METHOD
-    impute_cols = [col for col in features.columns if col not in excluded_columns and col != "pid"]
-    features[impute_cols] = impute(features[impute_cols], method="knn")
-    graph_bf_af(features, "9knn_after")
-
-    # (7) REMOVE COLS WHERE VARIANCE IS 0
-    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')]
+    # (4) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all NaN columns must be preserved - this solution now drops columns with all NaN rows)
+    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target (esm) columns
+    features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]
+    graph_bf_af(features, "6too_much_nans_cols")
+
+    # (5) REMOVE COLS WHERE VARIANCE IS 0
     if provider["COLS_VAR_THRESHOLD"]:
         features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)
-    graph_bf_af(features, "10variance_drop")
+    graph_bf_af(features, "7variance_drop")
+
+    # Preserve esm cols if deleted (has to come after drop cols operations)
+    for esm in esm_cols:
+        if esm not in features:
+            features[esm] = esm_cols[esm]
+
+    # (6) DO THE ROWS CONSIST OF ENOUGH NON-NAN VALUES?
+    min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1]) # minimal not nan values in row
+    features.dropna(axis=0, thresh=min_count, inplace=True) # Thresh => at least this many not-nans
+    graph_bf_af(features, "8too_much_nans_rows")
+
+    # (7) STANDARDIZATION
+    # I expect to see RuntimeWarnings in this block
+    if provider["STANDARDIZATION"]:
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", category=RuntimeWarning)
+            features.loc[:, ~features.columns.isin(excluded_columns + ["pid"])] = \
+                features.loc[:, ~features.columns.isin(excluded_columns)].groupby('pid').transform(lambda x: minmax_scale(x.astype(float)))
+    graph_bf_af(features, "9standardization")
+
+    # (8) IMPUTATION: IMPUTE DATA WITH KNN METHOD
+    features.reset_index(drop=True, inplace=True)
+    impute_cols = [col for col in features.columns if col not in excluded_columns and col != "pid"]
+    features[impute_cols] = impute(features[impute_cols], method="knn")
+    graph_bf_af(features, "10knn_after")
+
+    # (9) DROP HIGHLY CORRELATED FEATURES
+    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')]

-    # (8) DROP HIGHLY CORRELATED FEATURES
     drop_corr_features = provider["DROP_HIGHLY_CORRELATED_FEATURES"]
     if drop_corr_features["COMPUTE"] and features.shape[0] > 5: # If small amount of segments (rows) is present, do not execute correlation check
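
The headline change in this hunk is step (7): the per-participant z-score transform is replaced by minmax_scale applied per pid group, with RuntimeWarnings suppressed as in the diff. A minimal, self-contained sketch of that transform on made-up data (column names and values are illustrative, not from the study):

import warnings
import pandas as pd
from sklearn.preprocessing import minmax_scale

# Illustrative data: two participants, one feature column
features = pd.DataFrame({
    "pid": ["p01", "p01", "p01", "p02", "p02"],
    "phone_screen_rapids_sumduration": [10.0, 20.0, 30.0, 5.0, 5.0],
})

# Min-max scale each participant's values to [0, 1], as in step (7)
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)
    scaled = features.groupby("pid").transform(lambda x: minmax_scale(x.astype(float)))
print(scaled)
# p01 maps to 0.0, 0.5, 1.0; p02 is constant, so minmax_scale returns 0.0 for both rows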
@@ -163,6 +169,11 @@ def straw_cleaning(sensor_data_files, provider):
         upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
         to_drop = [column for column in upper.columns if any(upper[column] > drop_corr_features["CORR_THRESHOLD"])]

+        sns.heatmap(corr_matrix, cmap="YlGnBu", annot=True)
+        plt.savefig(f'correlation_matrix.png', bbox_inches='tight')
+        plt.close()
+
+        # TODO: which features get dropped - do any E4:phone pairs correlate?
         features.drop(to_drop, axis=1, inplace=True)

     # Preserve esm cols if deleted (has to come after drop cols operations)
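
For context, the drop logic above keeps the upper triangle of the correlation matrix and removes every column that correlates with an earlier column above CORR_THRESHOLD. A small self-contained sketch with synthetic data; the way corr_matrix is computed is assumed here (absolute Pearson correlations), since that line lies outside this hunk:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
base = rng.normal(size=100)
features = pd.DataFrame({
    "feat_a": base,
    "feat_b": 2 * base + 0.01 * rng.normal(size=100),  # nearly a linear copy of feat_a
    "feat_c": rng.normal(size=100),
})

corr_matrix = features.corr().abs()  # assumed: absolute Pearson correlation
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
features.drop(to_drop, axis=1, inplace=True)
print(to_drop)  # ['feat_b']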
@@ -172,10 +183,11 @@ def straw_cleaning(sensor_data_files, provider):
     graph_bf_af(features, "11correlation_drop")

-    # (9) VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME
+    # (10) VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME
     if features.isna().any().any():
-        raise ValueError
-        sys.exit()
+        raise ValueError("There are still some NaNs present in the dataframe. Please check for implementation errors.")

     return features

 def impute(df, method='zero'):
@@ -192,7 +204,7 @@ def impute(df, method='zero'):
         'knn': k_nearest(df)
     }[method]

-def graph_bf_af(features, phase_name, plt_flag=False):
+def graph_bf_af(features, phase_name, plt_flag=True):
     if plt_flag:
         sns.set(rc={"figure.figsize":(16, 8)})
         sns.heatmap(features.isna(), cbar=False) #features.select_dtypes(include=np.number)
@@ -201,5 +213,5 @@ def graph_bf_af(features, phase_name, plt_flag=False):
     print(f"\n-------------{phase_name}-------------")
     print("Rows number:", features.shape[0])
     print("Columns number:", len(features.columns))
+    print("NaN values:", features.isna().sum().sum())
     print("---------------------------------------------\n")

View File

@@ -10,12 +10,15 @@ def calculate_empatica_data_yield(features):
     datetime_end = datetime.strptime(features.loc[0, 'local_segment_end_datetime'], '%Y-%m-%d %H:%M:%S')
     tseg_duration = (datetime_end - datetime_start).total_seconds()

-    features["acc_data_yield"] = (features['empatica_accelerometer_cr_SO_windowsCount'] * 15) / tseg_duration
-    features["temp_data_yield"] = (features['empatica_temperature_cr_SO_windowsCount'] * 300) / tseg_duration
-    features["eda_data_yield"] = (features['empatica_electrodermal_activity_cr_SO_windowsCount'] * 60) / tseg_duration
-    features["ibi_data_yield"] = (features['empatica_inter_beat_interval_cr_SO_windowsCount'] * 300) / tseg_duration
+    features["acc_data_yield"] = (features['empatica_accelerometer_cr_SO_windowsCount'] * 15) / tseg_duration \
+        if 'empatica_accelerometer_cr_SO_windowsCount' in features else 0
+    features["temp_data_yield"] = (features['empatica_temperature_cr_SO_windowsCount'] * 300) / tseg_duration \
+        if 'empatica_temperature_cr_SO_windowsCount' in features else 0
+    features["eda_data_yield"] = (features['empatica_electrodermal_activity_cr_SO_windowsCount'] * 60) / tseg_duration \
+        if 'empatica_electrodermal_activity_cr_SO_windowsCount' in features else 0
+    features["ibi_data_yield"] = (features['empatica_inter_beat_interval_cr_SO_windowsCount'] * 300) / tseg_duration \
+        if 'empatica_inter_beat_interval_cr_SO_windowsCount' in features else 0

+    # TODO: better setting of working hours - currently it is from 4:00 to 4:00... this causes a lot of missing data and consequently a low (phone and E4) data yield ...
     empatica_data_yield_cols = ['acc_data_yield', 'temp_data_yield', 'eda_data_yield', 'ibi_data_yield']
     features["empatica_data_yield"] = features[empatica_data_yield_cols].mean(axis=1).fillna(0)
     features.drop(empatica_data_yield_cols, axis=1, inplace=True) # in case the advanced operations (e.g., a weighted average) are not needed later
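
Each per-sensor yield above is the number of counted windows times that sensor's window length (15 s for the accelerometer, 60 s for EDA, 300 s for temperature and IBI), divided by the time-segment duration; empatica_data_yield is then the mean of the per-sensor yields. A worked example with made-up counts for an assumed 8-hour segment (only two sensors shown for brevity):

# Illustrative numbers, not real data
tseg_duration = 8 * 60 * 60                             # 28800 s time segment
acc_windows, temp_windows = 1200, 60                    # counted 15 s and 300 s windows

acc_data_yield = (acc_windows * 15) / tseg_duration     # 18000 / 28800 = 0.625
temp_data_yield = (temp_windows * 300) / tseg_duration  # 18000 / 28800 = 0.625
empatica_data_yield = (acc_data_yield + temp_data_yield) / 2  # mean over the sensors shown = 0.625
print(acc_data_yield, temp_data_yield, empatica_data_yield)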