E4 data yield corrections. Changes in overall cs - standardization.

notes
Primoz 2022-10-05 14:16:55 +00:00
parent 437459648f
commit a34412a18d
3 changed files with 68 additions and 53 deletions


@@ -680,7 +680,7 @@ ALL_CLEANING_INDIVIDUAL:
ALL_CLEANING_OVERALL:
PROVIDERS:
RAPIDS:
COMPUTE: True
COMPUTE: False
IMPUTE_SELECTED_EVENT_FEATURES:
COMPUTE: False
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
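For reference, the provider block touched above can be read with PyYAML roughly as follows; the exact nesting is an assumption based on the keys visible in this hunk, and config.yaml is a placeholder path.

import yaml

with open("config.yaml") as f:  # placeholder path
    config = yaml.safe_load(f)

provider = config["ALL_CLEANING_OVERALL"]["PROVIDERS"]["RAPIDS"]
print(provider["COMPUTE"])  # False after this change
print(provider["IMPUTE_SELECTED_EVENT_FEATURES"]["MIN_DATA_YIELDED_MINUTES_TO_IMPUTE"])  # 0.33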


@@ -1,10 +1,9 @@
import pandas as pd
import numpy as np
import math, sys, random
import yaml
import math, sys, random, warnings, yaml
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler, minmax_scale
import matplotlib.pyplot as plt
import seaborn as sns
@@ -31,7 +30,7 @@ def straw_cleaning(sensor_data_files, provider):
graph_bf_af(features, "2target_rows_after")
# (2.1) QUALITY CHECK (DATA YIELD COLUMN) deletes the rows where E4 or phone data is low quality
# (2) QUALITY CHECK (DATA YIELD COLUMN) drops the rows where E4 or phone data is low quality
phone_data_yield_unit = provider["PHONE_DATA_YIELD_FEATURE"].split("_")[3].lower()
phone_data_yield_column = "phone_data_yield_rapids_ratiovalidyielded" + phone_data_yield_unit
@@ -43,42 +42,26 @@ def straw_cleaning(sensor_data_files, provider):
hist = features[["empatica_data_yield", phone_data_yield_column]].hist()
plt.legend()
plt.savefig(f'phone_E4_histogram.png', bbox_inches='tight')
# Drop rows where phone data yield is less than the given threshold
if provider["PHONE_DATA_YIELD_RATIO_THRESHOLD"]:
print("\nThreshold:", provider["PHONE_DATA_YIELD_RATIO_THRESHOLD"])
print("Phone features data yield stats:", features[phone_data_yield_column].describe(), "\n")
print(features[phone_data_yield_column].sort_values())
# print(features[phone_data_yield_column].sort_values())
hist = features[phone_data_yield_column].hist(bins=5)
plt.close()
features = features[features[phone_data_yield_column] >= provider["PHONE_DATA_YIELD_RATIO_THRESHOLD"]].reset_index(drop=True)
# Drop rows where empatica data yield is less than the given threshold
if provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]:
print("\nThreshold:", provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"])
print("E4 features data yield stats:", features["empatica_data_yield"].describe(), "\n")
print(features["empatica_data_yield"].sort_values())
# print(features["empatica_data_yield"].sort_values())
features = features[features["empatica_data_yield"] >= provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]].reset_index(drop=True)
graph_bf_af(features, "3data_yield_drop_rows")
# (2.2) DO THE ROWS CONSIST OF ENOUGH NON-NAN VALUES?
min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1]) # minimum number of non-NaN values required per row
features.dropna(axis=0, thresh=min_count, inplace=True) # thresh => keep only rows with at least this many non-NaNs
graph_bf_af(features, "4too_much_nans_rows")
# (3) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all-NaN columns must be preserved - as written, columns consisting entirely of NaNs are dropped)
esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target (esm) columns
features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]
graph_bf_af(features, "5too_much_nans_cols")
# Preserve esm cols if deleted (has to come after drop cols operations)
for esm in esm_cols:
if esm not in features:
features[esm] = esm_cols[esm]
# (4) CONTEXTUAL IMPUTATION
# (3) CONTEXTUAL IMPUTATION
# Impute selected phone features with a high number
impute_w_hn = [col for col in features.columns if \
@@ -91,7 +74,7 @@ def straw_cleaning(sensor_data_files, provider):
"timelastmessages" in col]
features[impute_w_hn] = impute(features[impute_w_hn], method="high_number")
graph_bf_af(features, "6high_number_imp")
graph_bf_af(features, "4high_number_imp")
# Impute special case (mostcommonactivity) and (homelabel)
impute_w_sn = [col for col in features.columns if "mostcommonactivity" in col]
@@ -112,7 +95,7 @@ def straw_cleaning(sensor_data_files, provider):
col.startswith('phone_wifi_visible')]
features[impute_zero] = impute(features[impute_zero], method="zero")
graph_bf_af(features, "7zero_imp")
graph_bf_af(features, "5zero_imp")
# Impute phone locations with the median - or should this rather be handled in the kNN step?
# impute_locations = [col for col in features.columns if "phone_locations_" in col]
@@ -128,29 +111,52 @@ def straw_cleaning(sensor_data_files, provider):
# features[impute_locations] = features[impute_locations + ["pid"]].groupby("pid").transform(lambda x: x.fillna(x.median()))[impute_locations]
# (4) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all-NaN columns must be preserved - as written, columns consisting entirely of NaNs are dropped)
esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target (esm) columns
# (5) STANDARDIZATION
if provider["STANDARDIZATION"]:
features.loc[:, ~features.columns.isin(excluded_columns + ["pid"])] = \
features.loc[:, ~features.columns.isin(excluded_columns)].groupby('pid').transform(lambda x: 0 if (x.std() == 0) else (x - x.mean()) / x.std())
features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]
graph_bf_af(features, "8standardization")
graph_bf_af(features, "6too_much_nans_cols")
# (6) IMPUTATION: IMPUTE DATA WITH KNN METHOD
impute_cols = [col for col in features.columns if col not in excluded_columns and col != "pid"]
features[impute_cols] = impute(features[impute_cols], method="knn")
graph_bf_af(features, "9knn_after")
# (7) REMOVE COLS WHERE VARIANCE IS 0
esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')]
# (5) REMOVE COLS WHERE VARIANCE IS 0
if provider["COLS_VAR_THRESHOLD"]:
features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)
graph_bf_af(features, "10variance_drop")
graph_bf_af(features, "7variance_drop")
# Preserve esm cols if deleted (has to come after drop cols operations)
for esm in esm_cols:
if esm not in features:
features[esm] = esm_cols[esm]
# (6) DO THE ROWS CONSIST OF ENOUGH NON-NAN VALUES?
min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1]) # minimum number of non-NaN values required per row
features.dropna(axis=0, thresh=min_count, inplace=True) # thresh => keep only rows with at least this many non-NaNs
graph_bf_af(features, "8too_much_nans_rows")
# (7) STANDARDIZATION
# I expect to see RuntimeWarnings in this block
if provider["STANDARDIZATION"]:
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=RuntimeWarning)
features.loc[:, ~features.columns.isin(excluded_columns + ["pid"])] = \
features.loc[:, ~features.columns.isin(excluded_columns)].groupby('pid').transform(lambda x: minmax_scale(x.astype(float)))
graph_bf_af(features, "9standardization")
# (8) IMPUTATION: IMPUTE DATA WITH KNN METHOD
features.reset_index(drop=True, inplace=True)
impute_cols = [col for col in features.columns if col not in excluded_columns and col != "pid"]
features[impute_cols] = impute(features[impute_cols], method="knn")
graph_bf_af(features, "10knn_after")
# (9) DROP HIGHLY CORRELATED FEATURES
esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')]
# (8) DROP HIGHLY CORRELATED FEATURES
drop_corr_features = provider["DROP_HIGHLY_CORRELATED_FEATURES"]
if drop_corr_features["COMPUTE"] and features.shape[0] > 5: # If only a small number of segments (rows) is present, do not execute the correlation check
@@ -163,6 +169,11 @@ def straw_cleaning(sensor_data_files, provider):
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > drop_corr_features["CORR_THRESHOLD"])]
sns.heatmap(corr_matrix, cmap="YlGnBu", annot=True)
plt.savefig(f'correlation_matrix.png', bbox_inches='tight')
plt.close()
# TODO: which features get dropped - do any E4:PHONE pairs correlate?
features.drop(to_drop, axis=1, inplace=True)
# Preserve esm cols if deleted (has to come after drop cols operations)
@@ -172,10 +183,11 @@ def straw_cleaning(sensor_data_files, provider):
graph_bf_af(features, "11correlation_drop")
# (9) VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME
# (10) VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME
if features.isna().any().any():
raise ValueError
raise ValueError("There are still some NaNs present in the dataframe. Please check for implementation errors.")
sys.exit()
return features
def impute(df, method='zero'):
@@ -192,7 +204,7 @@ def impute(df, method='zero'):
'knn': k_nearest(df)
}[method]
def graph_bf_af(features, phase_name, plt_flag=False):
def graph_bf_af(features, phase_name, plt_flag=True):
if plt_flag:
sns.set(rc={"figure.figsize":(16, 8)})
sns.heatmap(features.isna(), cbar=False) #features.select_dtypes(include=np.number)
@@ -201,5 +213,5 @@ def graph_bf_af(features, phase_name, plt_flag=False):
print(f"\n-------------{phase_name}-------------")
print("Rows number:", features.shape[0])
print("Columns number:", len(features.columns))
print("NaN values:", features.isna().sum().sum())
print("---------------------------------------------\n")


@@ -10,12 +10,15 @@ def calculate_empatica_data_yield(features):
datetime_end = datetime.strptime(features.loc[0, 'local_segment_end_datetime'], '%Y-%m-%d %H:%M:%S')
tseg_duration = (datetime_end - datetime_start).total_seconds()
features["acc_data_yield"] = (features['empatica_accelerometer_cr_SO_windowsCount'] * 15) / tseg_duration
features["temp_data_yield"] = (features['empatica_temperature_cr_SO_windowsCount'] * 300) / tseg_duration
features["eda_data_yield"] = (features['empatica_electrodermal_activity_cr_SO_windowsCount'] * 60) / tseg_duration
features["ibi_data_yield"] = (features['empatica_inter_beat_interval_cr_SO_windowsCount'] * 300) / tseg_duration
features["acc_data_yield"] = (features['empatica_accelerometer_cr_SO_windowsCount'] * 15) / tseg_duration \
if 'empatica_accelerometer_cr_SO_windowsCount' in features else 0
features["temp_data_yield"] = (features['empatica_temperature_cr_SO_windowsCount'] * 300) / tseg_duration \
if 'empatica_temperature_cr_SO_windowsCount' in features else 0
features["eda_data_yield"] = (features['empatica_electrodermal_activity_cr_SO_windowsCount'] * 60) / tseg_duration \
if 'empatica_electrodermal_activity_cr_SO_windowsCount' in features else 0
features["ibi_data_yield"] = (features['empatica_inter_beat_interval_cr_SO_windowsCount'] * 300) / tseg_duration \
if 'empatica_inter_beat_interval_cr_SO_windowsCount' in features else 0
# TODO: improve the working-hours setting; it is currently from 4:00 to 4:00... this causes a lot of missing data and, consequently, a low (phone and E4) data_yield ...
empatica_data_yield_cols = ['acc_data_yield', 'temp_data_yield', 'eda_data_yield', 'ibi_data_yield']
features["empatica_data_yield"] = features[empatica_data_yield_cols].mean(axis=1).fillna(0)
features.drop(empatica_data_yield_cols, axis=1, inplace=True) # Dropped in case the more advanced operations (e.g., a weighted average) are not needed later
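The four near-identical conditional assignments above could also be written as a loop; a sketch using the window lengths (in seconds) implied by the multipliers in the diff:

window_lengths = {
    "acc_data_yield":  ("empatica_accelerometer_cr_SO_windowsCount", 15),
    "temp_data_yield": ("empatica_temperature_cr_SO_windowsCount", 300),
    "eda_data_yield":  ("empatica_electrodermal_activity_cr_SO_windowsCount", 60),
    "ibi_data_yield":  ("empatica_inter_beat_interval_cr_SO_windowsCount", 300),
}
for yield_col, (count_col, window_seconds) in window_lengths.items():
    features[yield_col] = ((features[count_col] * window_seconds) / tseg_duration
                           if count_col in features else 0)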