rapids/src/features/all_cleaning_individual/straw/main.py

73 lines
3.7 KiB
Python
Raw Normal View History

Squashed commit of the following: commit 31a47a5ee4569264e39d7c445525a6e64bb7700a Author: Primoz <sisko.primoz@gmail.com> Date: Wed Jul 20 13:49:22 2022 +0000 Environment version change. commit 5b274ed8993f58e783bda6d82fce936764209c28 Author: Primoz <sisko.primoz@gmail.com> Date: Tue Jul 19 16:10:07 2022 +0000 Enabled cleaning for all participants + standardization files. commit 203fdb31e0f3c647ef8c8a60cb9531831b7ab924 Author: Primoz <sisko.primoz@gmail.com> Date: Tue Jul 19 14:14:51 2022 +0000 Features cleaning fixes after testing. Visualization script for phone features values. commit 176178d73b154c30b9eb9eb4a67514f00d6a924e Author: Primoz <sisko.primoz@gmail.com> Date: Tue Jul 19 09:05:14 2022 +0000 Revert "Necessary config changes." This reverts commit 6ec1ef50430d2e1f5ce4670d505d5e84ac47f0a0. commit 26ea6512c9d512f95837e7b047fe510c1d196403 Author: Primoz <sisko.primoz@gmail.com> Date: Mon Jul 18 13:19:47 2022 +0000 Adding cleaning function condition and cleaning functionality. commit 575c29eef9c21e6f2d7832871e73bc0941643734 Author: Primoz <sisko.primoz@gmail.com> Date: Mon Jul 18 12:51:56 2022 +0000 Translation of the cleaning individual RAPIDS function from R to py. commit 6ec1ef50430d2e1f5ce4670d505d5e84ac47f0a0 Author: Primoz <sisko.primoz@gmail.com> Date: Mon Jul 18 12:02:18 2022 +0000 Necessary config changes. commit b5669f51612fbd8378848615d639677851ab032f Author: Primoz <sisko.primoz@gmail.com> Date: Fri Jul 15 15:26:00 2022 +0000 Modified snakemake rule to dynamically choose script extention. commit 66636be1e8ae4828228b37c59b9df1faf3fc3d3d Author: Primoz <sisko.primoz@gmail.com> Date: Fri Jul 15 14:43:08 2022 +0000 Trying to modify the snakefile rule to execute scripts in two languages depended on the provider. commit 574778b00f3cbb368ef4bc74de15cf5070c65ea9 Author: Primoz <sisko.primoz@gmail.com> Date: Fri Jul 15 09:49:41 2022 +0000 gitignore: adding required files so that RAPIDS can be run successfully. commit 71018ab178256970535e78961602ab8c7f0ebb14 Author: Primoz <sisko.primoz@gmail.com> Date: Fri Jul 15 08:34:19 2022 +0000 Standardization bug fixes commit 6253c470a624e6bfbb02e0c453b652452eb2dbbc Author: Primoz <sisko.primoz@gmail.com> Date: Thu Jul 14 15:28:02 2022 +0000 Seperate rules for empatica vs. nonempatica standardization. Parameter in config that controls the creation of standardized merged files for individual and all participants.. commit 90f902778565e0896d3bae22ae8551be8b487e67 Author: Primoz <sisko.primoz@gmail.com> Date: Tue Jul 12 14:23:03 2022 +0000 Preparing for final csvs' standardization. commit d25dde3998786a9a582f5cda544ee104386778f9 Author: Primoz <sisko.primoz@gmail.com> Date: Mon Jul 11 12:08:47 2022 +0000 Revert "Changes in config to be reverted." This reverts commit bea7608e7095021fb7c53a9afa07074448fe4313. commit 6b23e70857e63deda98eb98d190af9090626c84b Author: Primoz <sisko.primoz@gmail.com> Date: Mon Jul 11 12:08:26 2022 +0000 Enabled standardization for rest (previously active) phone features. Testing still needed. commit 8ec58a6f34ba3d42e5cc71d26e6d91837472ca5f Author: Primoz <sisko.primoz@gmail.com> Date: Mon Jul 11 09:07:55 2022 +0000 Enabled standardization for phone calls. All steps completed and tested. commit bea7608e7095021fb7c53a9afa07074448fe4313 Author: Primoz <sisko.primoz@gmail.com> Date: Mon Jul 11 07:47:51 2022 +0000 Changes in config to be reverted. commit 4e84ca0e51bf709bff56fd09437b95310ec6bedd Author: Primoz <sisko.primoz@gmail.com> Date: Fri Jul 8 14:11:24 2022 +0000 Standardization for the rest of the features. commit cc581aa788e3d5c17131af8f3d5dd6b0c3b5aff7 Author: Primoz <sisko.primoz@gmail.com> Date: Fri Jul 8 14:11:08 2022 +0000 README update again
2022-07-20 15:51:22 +02:00
import pandas as pd
import numpy as np
import math, sys
def straw_cleaning(sensor_data_files, provider):
features = pd.read_csv(sensor_data_files["sensor_data"][0])
# Impute selected features event
impute_phone_features = provider["IMPUTE_PHONE_SELECTED_EVENT_FEATURES"]
if impute_phone_features["COMPUTE"]:
if not 'phone_data_yield_rapids_ratiovalidyieldedminutes' in features.columns:
raise KeyError("RAPIDS provider needs to impute the selected event features based on phone_data_yield_rapids_ratiovalidyieldedminutes column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedminutes' in [FEATURES].")
phone_cols = [col for col in features if \
col.startswith('phone_applications_foreground_rapids_') or
col.startswith('phone_battery_rapids_') or
col.startswith('phone_calls_rapids_') or
col.startswith('phone_keyboard_rapids_') or
col.startswith('phone_messages_rapids_') or
col.startswith('phone_screen_rapids_') or
col.startswith('phone_wifi_')]
mask = features['phone_data_yield_rapids_ratiovalidyieldedminutes'] > impute_phone_features['MIN_DATA_YIELDED_MINUTES_TO_IMPUTE']
features.loc[mask, phone_cols] = impute(features[mask][phone_cols], method=impute_phone_features["TYPE"])
# Drop rows with the value of data_yield_column less than data_yield_ratio_threshold
data_yield_unit = provider["DATA_YIELD_FEATURE"].split("_")[3].lower()
data_yield_column = "phone_data_yield_rapids_ratiovalidyielded" + data_yield_unit
if not data_yield_column in features.columns:
raise KeyError(f"RAPIDS provider needs to impute the selected event features based on {data_yield_column} column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyielded{data_yield_unit}' in [FEATURES].")
features = features[features[data_yield_column] >= provider["DATA_YIELD_RATIO_THRESHOLD"]]
# Remove cols if threshold of NaN values is passed
features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]
# Remove cols where variance is 0
if provider["COLS_VAR_THRESHOLD"]:
features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)
# Drop highly correlated features - To-Do še en thershold var, ki je v config + kako se tretirajo NaNs?
drop_corr_features = provider["DROP_HIGHLY_CORRELATED_FEATURES"]
if drop_corr_features["COMPUTE"]:
numerical_cols = features.select_dtypes(include=np.number).columns.tolist()
cor_matrix = features[numerical_cols].corr(method='spearman').abs()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > drop_corr_features["CORR_THRESHOLD"])]
# Tukaj je še neka validacija s thresholdom, ampak ne razumem R kode "valid_pairs"
features.drop(to_drop, axis=1, inplace=True)
# Remove rows if threshold of NaN values is passed
min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1]) # min not nan values in row
features.dropna(axis=0, thresh=min_count, inplace=True)
return features
def impute(df, method='zero'):
df.loc[:, df.isna().all()] = df.loc[:, df.isna().all()].fillna(0) # if column contains only NaN values impute it with 0
return { # rest of the columns should be imputed with the selected method
'zero': df.fillna(0),
'mean': df.fillna(df.mean()),
'median': df.fillna(df.median()),
'k-nearest': None # To-Do
}[method]