Code cleaning: rework of the individual cleaning script based on changes in the overall script; changes in thresholds.

notes
Primoz 2022-09-30 10:04:07 +00:00
parent 7ac7cd5a37
commit 44531c6d94
3 changed files with 32 additions and 29 deletions

@@ -667,7 +667,7 @@ ALL_CLEANING_INDIVIDUAL:
     PHONE_DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
     PHONE_DATA_YIELD_RATIO_THRESHOLD: 0.4 # set to 0 to disable
     EMPATICA_DATA_YIELD_RATIO_THRESHOLD: 0.25 # set to 0 to disable
-    ROWS_NAN_THRESHOLD: 0.3 # set to 1 to disable
+    ROWS_NAN_THRESHOLD: 0.33 # set to 1 to disable
     COLS_NAN_THRESHOLD: 0.9 # set to 1 to remove only columns that contains all (100% of) NaN
     COLS_VAR_THRESHOLD: True
     DROP_HIGHLY_CORRELATED_FEATURES:
@@ -699,8 +699,8 @@ ALL_CLEANING_OVERALL:
     PHONE_DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
     PHONE_DATA_YIELD_RATIO_THRESHOLD: 0.4 # set to 0 to disable
     EMPATICA_DATA_YIELD_RATIO_THRESHOLD: 0.25 # set to 0 to disable
-    ROWS_NAN_THRESHOLD: 0.3 # set to 1 to disable
-    COLS_NAN_THRESHOLD: 0.9 # set to 1 to remove only columns that contains all (100% of) NaN
+    ROWS_NAN_THRESHOLD: 0.33 # set to 1 to disable
+    COLS_NAN_THRESHOLD: 0.8 # set to 1 to remove only columns that contains all (100% of) NaN
     COLS_VAR_THRESHOLD: True
     DROP_HIGHLY_CORRELATED_FEATURES:
       COMPUTE: True
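The ROWS_NAN_THRESHOLD change is consumed by the dropna call shown further down in this commit; the column-side counterpart is not part of the diff, so the following is only a minimal sketch, assuming COLS_NAN_THRESHOLD acts as the maximum allowed NaN share per column (consistent with the "set to 1 to remove only columns that contains all (100% of) NaN" comment above).

import pandas as pd

def drop_sparse_columns(features: pd.DataFrame, cols_nan_threshold: float) -> pd.DataFrame:
    # Keep a column only while its NaN share stays below the threshold:
    # 0.9 drops columns that are >= 90 % NaN, 0.8 (the new overall value) >= 80 %,
    # and 1.0 removes only columns that are entirely NaN.
    return features.loc[:, features.isna().mean() < cols_nan_threshold]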

@@ -1,6 +1,6 @@
 import pandas as pd
 import numpy as np
-import math, sys
+import math, sys, random
 import yaml
 from sklearn.impute import KNNImputer
@@ -11,6 +11,8 @@ import seaborn as sns
 sys.path.append('/rapids/')
 from src.features import empatica_data_yield as edy
+pd.set_option('display.max_columns', 20)
 def straw_cleaning(sensor_data_files, provider):
     features = pd.read_csv(sensor_data_files["sensor_data"][0])
@@ -43,7 +45,7 @@ def straw_cleaning(sensor_data_files, provider):
     # Drop rows where empatica data yield is less then given threshold
     if provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]:
         features = features[features["empatica_data_yield"] >= provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]].reset_index(drop=True)

     # (2.2) DO THE ROWS CONSIST OF ENOUGH NON-NAN VALUES?
     min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1]) # minimal not nan values in row
     features.dropna(axis=0, thresh=min_count, inplace=True) # Thresh => at least this many not-nans
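A quick worked example of the ROWS_NAN_THRESHOLD bump (0.3 to 0.33) as seen by the dropna call above; the column count is hypothetical and only illustrates the rounding.

import math

n_feature_cols = 100  # hypothetical width of the feature matrix
for rows_nan_threshold in (0.3, 0.33):
    min_count = math.ceil((1 - rows_nan_threshold) * n_feature_cols)
    print(rows_nan_threshold, min_count)  # 0.3 -> 70 non-NaN values required per row, 0.33 -> 67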
@@ -66,26 +68,39 @@ def straw_cleaning(sensor_data_files, provider):
                     "timeoflastuse" in col or
                     "timefirstcall" in col or
                     "timelastcall" in col or
+                    "firstuseafter" in col or
                     "timefirstmessages" in col or
                     "timelastmessages" in col]
     features[impute_w_hn] = impute(features[impute_w_hn], method="high_number")

-    # Impute phone locations with median
-    impute_locations = [col for col in features.columns if "phone_locations_" in col]
-    features[impute_locations] = impute(features[impute_locations], method="median")
+    # Impute special case (mostcommonactivity)
+    impute_w_sn = [col for col in features.columns if "mostcommonactivity" in col]
+    features[impute_w_sn] = features[impute_w_sn].fillna(4) # Special case of imputation

-    # Impute remaining phone features with 0
-    impute_rest = [col for col in features.columns if "phone_" in col]
+    # Impute selected phone features with 0
+    impute_zero = [col for col in features if \
+        col.startswith('phone_applications_foreground_rapids_') or
+        col.startswith('phone_battery_rapids_') or
+        col.startswith('phone_bluetooth_rapids_') or
+        col.startswith('phone_light_rapids_') or
+        col.startswith('phone_calls_rapids_') or
+        col.startswith('phone_messages_rapids_') or
+        col.startswith('phone_screen_rapids_') or
+        col.startswith('phone_wifi_visible')]
     features[impute_locations] = impute(features[impute_locations], method="zero")

     ## (5) STANDARDIZATION
     if provider["STANDARDIZATION"]:
         features.loc[:, ~features.columns.isin(excluded_columns)] = StandardScaler().fit_transform(features.loc[:, ~features.columns.isin(excluded_columns)])

+    graph_bf_af(features[impute_locations], "knn_before")
     # (6) IMPUTATION: IMPUTE DATA WITH KNN METHOD
     impute_cols = [col for col in features.columns if col not in excluded_columns]
     features[impute_cols] = impute(features[impute_cols], method="knn")
+    graph_bf_af(features[impute_locations], "knn_after")

     # (7) REMOVE COLS WHERE VARIANCE IS 0
     esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')]
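The impute() helper called throughout this hunk is not part of the diff; the sketch below only shows the dispatch it is assumed to implement, with the method names taken from the calls above. The high-number sentinel and the KNN neighbour count are placeholders, not values from the repository.

import pandas as pd
from sklearn.impute import KNNImputer

def impute(df: pd.DataFrame, method: str = "zero") -> pd.DataFrame:
    # Sketch only: dispatch mirrors the method strings used in straw_cleaning().
    if method == "zero":
        return df.fillna(0)
    if method == "high_number":
        return df.fillna(1500)  # placeholder sentinel for "event never happened in this window"
    if method == "median":
        return df.fillna(df.median())
    if method == "knn":
        imputer = KNNImputer(n_neighbors=3)  # n_neighbors is an assumption
        return pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)
    raise ValueError(f"Unknown imputation method: {method}")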
@@ -139,5 +154,4 @@ def graph_bf_af(features, phase_name):
     print(features)
     sns.heatmap(features.isna(), cbar=False) #features.select_dtypes(include=np.number)
     plt.savefig(f'features_individual_nans_{phase_name}.png', bbox_inches='tight')

@@ -1,7 +1,6 @@
 import pandas as pd
 import numpy as np
 import math, sys, random
-import typing
 import yaml
 from sklearn.impute import KNNImputer
@@ -95,16 +94,16 @@ def straw_cleaning(sensor_data_files, provider):
     # # features[impute_locations] = features[impute_locations].mask(np.random.random(features[impute_locations].shape) < .1)

-    # # features.at[0,'pid'] = "p01"
-    # # features.at[1,'pid'] = "p01"
-    # # features.at[2,'pid'] = "p02"
-    # # features.at[3,'pid'] = "p02"
-    # # graph_bf_af(features[impute_locations], "phoneloc_before")
+    # features.at[0,'pid'] = "p01"
+    # features.at[1,'pid'] = "p01"
+    # features.at[2,'pid'] = "p02"
+    # features.at[3,'pid'] = "p02"
+    # graph_bf_af(features[impute_locations], "phoneloc_before")
     # features[impute_locations] = features[impute_locations + ["pid"]].groupby("pid").transform(lambda x: x.fillna(x.median()))[impute_locations]

-    ## (5) STANDARDIZATION
+    # (5) STANDARDIZATION
     if provider["STANDARDIZATION"]:
         features.loc[:, ~features.columns.isin(excluded_columns + ["pid"])] = \
             features.loc[:, ~features.columns.isin(excluded_columns)].groupby('pid').transform(lambda x: 0 if (x.std() == 0) else (x - x.mean()) / x.std())
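A small self-contained illustration of the per-participant standardization kept above: each feature is z-scored within its own pid group, and constant columns map to 0 instead of NaN. The values are toy data, not from the study.

import pandas as pd

df = pd.DataFrame({
    "pid":  ["p01", "p01", "p02", "p02"],
    "feat": [1.0, 3.0, 10.0, 10.0],
})
standardized = df.groupby("pid")["feat"].transform(
    lambda x: 0 if x.std() == 0 else (x - x.mean()) / x.std()
)
print(standardized.tolist())  # [-0.707..., 0.707..., 0.0, 0.0]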
@@ -170,14 +169,4 @@ def graph_bf_af(features, phase_name):
     print(features)
     sns.heatmap(features.isna(), cbar=False) #features.select_dtypes(include=np.number)
     plt.savefig(f'features_overall_nans_{phase_name}.png', bbox_inches='tight')

-class SklearnWrapper:
-    def __init__(self, transform: typing.Callable):
-        self.transform = transform
-
-    def __call__(self, df):
-        transformed = self.transform.fit_transform(df.values)
-        return pd.DataFrame(transformed, columns=df.columns, index=df.index)
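The removed SklearnWrapper let a sklearn transformer be applied group-wise while keeping column names and the row index; the commit drops it in favour of the explicit groupby/transform lambda above. Below is a hedged example of how it could have been used; the groupby call and the toy data are assumptions, not code from the repository.

import pandas as pd
from sklearn.preprocessing import StandardScaler

class SklearnWrapper:
    """Copy of the removed helper, reproduced only to show the intended usage."""
    def __init__(self, transform):
        self.transform = transform

    def __call__(self, df):
        transformed = self.transform.fit_transform(df.values)
        return pd.DataFrame(transformed, columns=df.columns, index=df.index)

# Hypothetical usage: scale each participant's feature block separately and
# get a DataFrame back instead of a bare ndarray.
features = pd.DataFrame({"pid": ["p01", "p01", "p02", "p02"],
                         "feat": [1.0, 3.0, 10.0, 20.0]})
scaled = (features.groupby("pid", group_keys=False)[["feat"]]
                  .apply(SklearnWrapper(StandardScaler())))
print(scaled)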