rapids/src/features/all_cleaning_individual/straw/main.py

import pandas as pd
import numpy as np
import math, sys

def straw_cleaning(sensor_data_files, provider):
    """Clean the merged sensor features of one participant.

    The steps are applied in this order:

    1. Optionally impute the selected phone event features on rows whose
       ``phone_data_yield_rapids_ratiovalidyieldedminutes`` exceeds
       ``MIN_DATA_YIELDED_MINUTES_TO_IMPUTE``.
    2. Drop rows whose data yield ratio is below ``DATA_YIELD_RATIO_THRESHOLD``.
    3. Drop columns whose NaN count reaches ``COLS_NAN_THRESHOLD`` times the
       number of remaining rows.
    4. If ``COLS_VAR_THRESHOLD`` is truthy, drop numeric columns with a
       standard deviation of 0.
    5. Optionally drop numeric columns whose absolute Spearman correlation
       with an earlier column exceeds ``CORR_THRESHOLD``.
    6. Drop rows that have fewer than ``ceil((1 - ROWS_NAN_THRESHOLD) * n_cols)``
       non-NaN values.

    Args:
        sensor_data_files: Mapping whose ``"sensor_data"`` entry is a sequence;
            its first element is passed to ``pandas.read_csv``.
        provider: Provider configuration. The keys read here are
            ``IMPUTE_PHONE_SELECTED_EVENT_FEATURES`` (with ``COMPUTE``,
            ``MIN_DATA_YIELDED_MINUTES_TO_IMPUTE`` and ``TYPE``),
            ``DATA_YIELD_FEATURE``, ``DATA_YIELD_RATIO_THRESHOLD``,
            ``COLS_NAN_THRESHOLD``, ``COLS_VAR_THRESHOLD``,
            ``DROP_HIGHLY_CORRELATED_FEATURES`` (with ``COMPUTE`` and
            ``CORR_THRESHOLD``) and ``ROWS_NAN_THRESHOLD``.

    Returns:
        pandas.DataFrame: The cleaned features. The original row index labels
        are kept (the index is not reset).

    Raises:
        KeyError: If a data yield column required by the configuration is
            missing from the features.
    """
    features = pd.read_csv(sensor_data_files["sensor_data"][0])

    # Impute selected event features
    impute_phone_features = provider["IMPUTE_PHONE_SELECTED_EVENT_FEATURES"]
    if impute_phone_features["COMPUTE"]:
        if 'phone_data_yield_rapids_ratiovalidyieldedminutes' not in features.columns:
            raise KeyError("RAPIDS provider needs to impute the selected event features based on phone_data_yield_rapids_ratiovalidyieldedminutes column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedminutes' in [FEATURES].")

        # str.startswith accepts a tuple, so one call covers every prefix.
        phone_prefixes = (
            'phone_applications_foreground_rapids_',
            'phone_battery_rapids_',
            'phone_calls_rapids_',
            'phone_keyboard_rapids_',
            'phone_messages_rapids_',
            'phone_screen_rapids_',
            'phone_wifi_',
        )
        phone_cols = [col for col in features if col.startswith(phone_prefixes)]

        mask = features['phone_data_yield_rapids_ratiovalidyieldedminutes'] > impute_phone_features['MIN_DATA_YIELDED_MINUTES_TO_IMPUTE']
        # Hand impute() an explicit copy selected with a single .loc call so
        # that it never operates on a chained-indexing temporary.
        selected = features.loc[mask, phone_cols].copy()
        features.loc[mask, phone_cols] = impute(selected, method=impute_phone_features["TYPE"])

    # Drop rows with the value of data_yield_column less than data_yield_ratio_threshold
    data_yield_unit = provider["DATA_YIELD_FEATURE"].split("_")[3].lower()
    data_yield_column = "phone_data_yield_rapids_ratiovalidyielded" + data_yield_unit

    if data_yield_column not in features.columns:
        # The message names the row-filtering step performed here (it used to
        # repeat the imputation wording of the check above).
        raise KeyError(f"RAPIDS provider needs to clean the features based on {data_yield_column} column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyielded{data_yield_unit}' in [FEATURES].")

    features = features[features[data_yield_column] >= provider["DATA_YIELD_RATIO_THRESHOLD"]]

    # Remove cols if threshold of NaN values is passed
    features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]

    # Remove cols where variance is 0
    if provider["COLS_VAR_THRESHOLD"]:
        # numeric_only=True keeps non-numeric columns from raising; they are
        # never dropped by this step.
        col_std = features.std(numeric_only=True)
        features = features.drop(columns=col_std[col_std == 0].index)

    # Drop highly correlated features
    # TODO: one more threshold variable that is in the config + how are NaNs treated?
    drop_corr_features = provider["DROP_HIGHLY_CORRELATED_FEATURES"]
    if drop_corr_features["COMPUTE"]:
        numerical_cols = features.select_dtypes(include=np.number).columns.tolist()

        cor_matrix = features[numerical_cols].corr(method='spearman').abs()

        # Keep only the strict upper triangle so each pair is inspected once
        # and the later column of a correlated pair is the one dropped.
        upper_mask = np.triu(np.ones(cor_matrix.shape, dtype=bool), k=1)
        upper_tri = cor_matrix.where(upper_mask)

        corr_threshold = drop_corr_features["CORR_THRESHOLD"]
        to_drop = [column for column in upper_tri.columns if (upper_tri[column] > corr_threshold).any()]

        # NOTE: the R version does some additional threshold validation here
        # ("valid_pairs") that has not been ported.
        features = features.drop(columns=to_drop)

    # Remove rows if threshold of NaN values is passed
    min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1])  # min not nan values in row
    features = features.dropna(axis=0, thresh=min_count)

    return features

def impute(df, method='zero'):
    """Return a copy of ``df`` with its missing values imputed.

    Columns that contain only NaN values are always filled with 0. The
    remaining NaN values are filled according to ``method``:

    * ``'zero'``   - fill with 0
    * ``'mean'``   - fill each numeric column with its mean
    * ``'median'`` - fill each numeric column with its median

    The input frame is not modified.

    Args:
        df: DataFrame to impute.
        method: Name of the imputation strategy.

    Returns:
        pandas.DataFrame: A new, imputed DataFrame.

    Raises:
        NotImplementedError: If ``method`` is ``'k-nearest'``, which is not
            implemented yet (it used to return ``None`` silently).
        KeyError: If ``method`` is not a known strategy.
    """
    if method == 'k-nearest':  # To-Do
        raise NotImplementedError("The 'k-nearest' imputation method is not implemented yet.")

    # Only the requested strategy is evaluated; building every result eagerly
    # made 'zero' depend on mean()/median() succeeding as well.
    strategies = {
        'zero': lambda frame: frame.fillna(0),
        'mean': lambda frame: frame.fillna(frame.mean(numeric_only=True)),
        'median': lambda frame: frame.fillna(frame.median(numeric_only=True)),
    }
    strategy = strategies[method]

    # Work on a copy so the caller's frame is left untouched.
    imputed = df.copy()

    # If a column contains only NaN values impute it with 0: its mean/median
    # is NaN, so the selected strategy could not fill it.
    all_nan_cols = imputed.columns[imputed.isna().all()]
    if len(all_nan_cols) > 0:
        imputed[all_nan_cols] = imputed[all_nan_cols].fillna(0)

    # Rest of the columns are imputed with the selected method
    return strategy(imputed)
Squashed commit of the following: commit 31a47a5ee4569264e39d7c445525a6e64bb7700a Author: Primoz <sisko.primoz@gmail.com> Date: Wed Jul 20 13:49:22 2022 +0000 Environment version change. commit 5b274ed8993f58e783bda6d82fce936764209c28 Author: Primoz <sisko.primoz@gmail.com> Date: Tue Jul 19 16:10:07 2022 +0000 Enabled cleaning for all participants + standardization files. commit 203fdb31e0f3c647ef8c8a60cb9531831b7ab924 Author: Primoz <sisko.primoz@gmail.com> Date: Tue Jul 19 14:14:51 2022 +0000 Features cleaning fixes after testing. Visualization script for phone features values. commit 176178d73b154c30b9eb9eb4a67514f00d6a924e Author: Primoz <sisko.primoz@gmail.com> Date: Tue Jul 19 09:05:14 2022 +0000 Revert "Necessary config changes." This reverts commit 6ec1ef50430d2e1f5ce4670d505d5e84ac47f0a0. commit 26ea6512c9d512f95837e7b047fe510c1d196403 Author: Primoz <sisko.primoz@gmail.com> Date: Mon Jul 18 13:19:47 2022 +0000 Adding cleaning function condition and cleaning functionality. commit 575c29eef9c21e6f2d7832871e73bc0941643734 Author: Primoz <sisko.primoz@gmail.com> Date: Mon Jul 18 12:51:56 2022 +0000 Translation of the cleaning individual RAPIDS function from R to py. commit 6ec1ef50430d2e1f5ce4670d505d5e84ac47f0a0 Author: Primoz <sisko.primoz@gmail.com> Date: Mon Jul 18 12:02:18 2022 +0000 Necessary config changes. commit b5669f51612fbd8378848615d639677851ab032f Author: Primoz <sisko.primoz@gmail.com> Date: Fri Jul 15 15:26:00 2022 +0000 Modified snakemake rule to dynamically choose script extention. commit 66636be1e8ae4828228b37c59b9df1faf3fc3d3d Author: Primoz <sisko.primoz@gmail.com> Date: Fri Jul 15 14:43:08 2022 +0000 Trying to modify the snakefile rule to execute scripts in two languages depended on the provider. commit 574778b00f3cbb368ef4bc74de15cf5070c65ea9 Author: Primoz <sisko.primoz@gmail.com> Date: Fri Jul 15 09:49:41 2022 +0000 gitignore: adding required files so that RAPIDS can be run successfully. 
commit 71018ab178256970535e78961602ab8c7f0ebb14 Author: Primoz <sisko.primoz@gmail.com> Date: Fri Jul 15 08:34:19 2022 +0000 Standardization bug fixes commit 6253c470a624e6bfbb02e0c453b652452eb2dbbc Author: Primoz <sisko.primoz@gmail.com> Date: Thu Jul 14 15:28:02 2022 +0000 Seperate rules for empatica vs. nonempatica standardization. Parameter in config that controls the creation of standardized merged files for individual and all participants.. commit 90f902778565e0896d3bae22ae8551be8b487e67 Author: Primoz <sisko.primoz@gmail.com> Date: Tue Jul 12 14:23:03 2022 +0000 Preparing for final csvs' standardization. commit d25dde3998786a9a582f5cda544ee104386778f9 Author: Primoz <sisko.primoz@gmail.com> Date: Mon Jul 11 12:08:47 2022 +0000 Revert "Changes in config to be reverted." This reverts commit bea7608e7095021fb7c53a9afa07074448fe4313. commit 6b23e70857e63deda98eb98d190af9090626c84b Author: Primoz <sisko.primoz@gmail.com> Date: Mon Jul 11 12:08:26 2022 +0000 Enabled standardization for rest (previously active) phone features. Testing still needed. commit 8ec58a6f34ba3d42e5cc71d26e6d91837472ca5f Author: Primoz <sisko.primoz@gmail.com> Date: Mon Jul 11 09:07:55 2022 +0000 Enabled standardization for phone calls. All steps completed and tested. commit bea7608e7095021fb7c53a9afa07074448fe4313 Author: Primoz <sisko.primoz@gmail.com> Date: Mon Jul 11 07:47:51 2022 +0000 Changes in config to be reverted. commit 4e84ca0e51bf709bff56fd09437b95310ec6bedd Author: Primoz <sisko.primoz@gmail.com> Date: Fri Jul 8 14:11:24 2022 +0000 Standardization for the rest of the features. commit cc581aa788e3d5c17131af8f3d5dd6b0c3b5aff7 Author: Primoz <sisko.primoz@gmail.com> Date: Fri Jul 8 14:11:08 2022 +0000 README update again 2022-07-20 15:51:22 +02:00			`import pandas as pd`
			`import numpy as np`
			`import math, sys`

			`def straw_cleaning(sensor_data_files, provider):`

			`features = pd.read_csv(sensor_data_files["sensor_data"][0])`

			`# Impute selected features event`
			`impute_phone_features = provider["IMPUTE_PHONE_SELECTED_EVENT_FEATURES"]`
			`if impute_phone_features["COMPUTE"]:`
			`if not 'phone_data_yield_rapids_ratiovalidyieldedminutes' in features.columns:`
			`raise KeyError("RAPIDS provider needs to impute the selected event features based on phone_data_yield_rapids_ratiovalidyieldedminutes column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedminutes' in [FEATURES].")`

			`phone_cols = [col for col in features if \`
			`col.startswith('phone_applications_foreground_rapids_') or`
			`col.startswith('phone_battery_rapids_') or`
			`col.startswith('phone_calls_rapids_') or`
			`col.startswith('phone_keyboard_rapids_') or`
			`col.startswith('phone_messages_rapids_') or`
			`col.startswith('phone_screen_rapids_') or`
			`col.startswith('phone_wifi_')]`

			`mask = features['phone_data_yield_rapids_ratiovalidyieldedminutes'] > impute_phone_features['MIN_DATA_YIELDED_MINUTES_TO_IMPUTE']`
			`features.loc[mask, phone_cols] = impute(features[mask][phone_cols], method=impute_phone_features["TYPE"])`

			`# Drop rows with the value of data_yield_column less than data_yield_ratio_threshold`
			`data_yield_unit = provider["DATA_YIELD_FEATURE"].split("_")[3].lower()`
			`data_yield_column = "phone_data_yield_rapids_ratiovalidyielded" + data_yield_unit`

			`if not data_yield_column in features.columns:`
			`raise KeyError(f"RAPIDS provider needs to impute the selected event features based on {data_yield_column} column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyielded{data_yield_unit}' in [FEATURES].")`

			`features = features[features[data_yield_column] >= provider["DATA_YIELD_RATIO_THRESHOLD"]]`

			`# Remove cols if threshold of NaN values is passed`
			`features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]`

			`# Remove cols where variance is 0`
			`if provider["COLS_VAR_THRESHOLD"]:`
			`features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)`

			`# Drop highly correlated features - To-Do še en thershold var, ki je v config + kako se tretirajo NaNs?`
			`drop_corr_features = provider["DROP_HIGHLY_CORRELATED_FEATURES"]`
			`if drop_corr_features["COMPUTE"]:`
			`numerical_cols = features.select_dtypes(include=np.number).columns.tolist()`

			`cor_matrix = features[numerical_cols].corr(method='spearman').abs()`

			`upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(np.bool))`

			`to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > drop_corr_features["CORR_THRESHOLD"])]`

			`# Tukaj je še neka validacija s thresholdom, ampak ne razumem R kode "valid_pairs"`
			`features.drop(to_drop, axis=1, inplace=True)`

			`# Remove rows if threshold of NaN values is passed`
			`min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1]) # min not nan values in row`
			`features.dropna(axis=0, thresh=min_count, inplace=True)`

			`return features`

			`def impute(df, method='zero'):`
			`df.loc[:, df.isna().all()] = df.loc[:, df.isna().all()].fillna(0) # if column contains only NaN values impute it with 0`
			`return { # rest of the columns should be imputed with the selected method`
			`'zero': df.fillna(0),`
			`'mean': df.fillna(df.mean()),`
			`'median': df.fillna(df.median()),`
			`'k-nearest': None # To-Do`
			`}[method]`