diff --git a/config.yaml b/config.yaml index 770305e4..f03ca101 100644 --- a/config.yaml +++ b/config.yaml @@ -3,7 +3,7 @@ ######################################################################################################################## # See https://www.rapids.science/latest/setup/configuration/#participant-files -PIDS: ['p031', 'p032', 'p033', 'p034', 'p035', 'p036', 'p037', 'p038', 'p039', 'p040', 'p042', 'p043', 'p044', 'p045', 'p046', 'p049', 'p050', 'p052', 'p053', 'p054', 'p055', 'p057', 'p058', 'p059', 'p060', 'p061', 'p062', 'p064', 'p067', 'p068', 'p069', 'p070', 'p071', 'p072', 'p073', 'p074', 'p075', 'p076', 'p077', 'p078', 'p079', 'p080', 'p081', 'p082', 'p083', 'p084', 'p085', 'p086', 'p088', 'p089', 'p090', 'p091', 'p092', 'p093', 'p106', 'p107'] +PIDS: ['p01'] #['p031', 'p032', 'p033', 'p034', 'p035', 'p036', 'p037', 'p038', 'p039', 'p040', 'p042', 'p043', 'p044', 'p045', 'p046', 'p049', 'p050', 'p052', 'p053', 'p054', 'p055', 'p057', 'p058', 'p059', 'p060', 'p061', 'p062', 'p064', 'p067', 'p068', 'p069', 'p070', 'p071', 'p072', 'p073', 'p074', 'p075', 'p076', 'p077', 'p078', 'p079', 'p080', 'p081', 'p082', 'p083', 'p084', 'p085', 'p086', 'p088', 'p089', 'p090', 'p091', 'p092', 'p093', 'p106', 'p107'] # See https://www.rapids.science/latest/setup/configuration/#automatic-creation-of-participant-files CREATE_PARTICIPANT_FILES: @@ -690,7 +690,7 @@ ALL_CLEANING_INDIVIDUAL: COMPUTE: False TYPE: median # options: zero, mean, median or k-nearest MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33 - COLS_NAN_THRESHOLD: 1 # set to 1 to disable + COLS_NAN_THRESHOLD: 1 # set to 1 to remove only columns that contain all NaN COLS_VAR_THRESHOLD: True ROWS_NAN_THRESHOLD: 1 # set to 1 to disable DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES @@ -725,7 +725,7 @@ ALL_CLEANING_OVERALL: COMPUTE: False TYPE: median # options: zero, mean, median or k-nearest MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33 - 
COLS_NAN_THRESHOLD: 1 # set to 1 to disable + COLS_NAN_THRESHOLD: 1 # set to 1 to remove only columns that contain all NaN COLS_VAR_THRESHOLD: True ROWS_NAN_THRESHOLD: 1 # set to 1 to disable DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES diff --git a/data/external/participant_files/p01.yaml b/data/external/participant_files/p01.yaml index fe394a76..0462a255 100644 --- a/data/external/participant_files/p01.yaml +++ b/data/external/participant_files/p01.yaml @@ -1,9 +1,9 @@ PHONE: - DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524] # the participant's AWARE device id - PLATFORMS: [android] # or ios - LABEL: MyTestP01 # any string - START_DATE: 2020-01-01 # this can also be empty - END_DATE: 2021-01-01 # this can also be empty + DEVICE_IDS: [70cc5183-97d4-4678-b81e-a34e491e2868,d5bbb2ab-2d60-4e72-a636-17655395c401,93fae5bc-e5a9-4751-b768-fd55c821f126] + PLATFORMS: [android,android,android] + LABEL: uploader_57312 + START_DATE: 2020-09-24 11:56:45 + END_DATE: 2020-10-24 19:19:37 EMPATICA: DEVICE_IDS: [empatica1] LABEL: test01 diff --git a/features_nans.png b/features_nans.png new file mode 100644 index 00000000..eef88765 Binary files /dev/null and b/features_nans.png differ diff --git a/src/features/all_cleaning_individual/straw/main.py b/src/features/all_cleaning_individual/straw/main.py index 619af4a2..f2c1bcd0 100644 --- a/src/features/all_cleaning_individual/straw/main.py +++ b/src/features/all_cleaning_individual/straw/main.py @@ -1,20 +1,27 @@ import pandas as pd import numpy as np import math, sys -from sklearn.impute import KNNImputer +import yaml -def straw_cleaning(sensor_data_files, provider, target=None): +from sklearn.impute import KNNImputer +import matplotlib.pyplot as plt +import seaborn as sns + +def straw_cleaning(sensor_data_files, provider): features = pd.read_csv(sensor_data_files["sensor_data"][0]) + esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target 
(esm) columns - esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target columns + with open('config.yaml', 'r') as stream: + config = yaml.load(stream, Loader=yaml.FullLoader) + + #Filter-out all rows that do not have the target column available + if config['PARAMS_FOR_ANALYSIS']['TARGET']['COMPUTE']: + target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config + features = features[features['phone_esm_straw_' + target].notna()].reset_index() + + test_cols = [col for col in features.columns if 'phone_calls' in col or 'phone_messages' in col] - #Filter all rows that do not have the target column available - # get target from config or function parameter - if target is None: - features = features[features[esm_cols[0]].notna()] - else: - features = features[features['phone_esm_straw_' + target].notna()] # TODO: reorder the cleaning steps so it makes sense for the analysis # TODO: add conditions that differentiates cleaning steps for standardized and nonstandardized features, for this @@ -58,9 +65,9 @@ def straw_cleaning(sensor_data_files, provider, target=None): if provider["DATA_YIELD_RATIO_THRESHOLD"]: features = features[features[data_yield_column] >= provider["DATA_YIELD_RATIO_THRESHOLD"]] - # Remove cols if threshold of NaN values is passed - features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]] - + # Remove cols if threshold of NaN values is passed (should be <= if even all NaN columns must be preserved) + features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]] + # Remove cols where variance is 0 if provider["COLS_VAR_THRESHOLD"]: features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True) @@ -89,6 +96,12 @@ def straw_cleaning(sensor_data_files, provider, target=None): min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1]) # minimal not nan values in 
row features.dropna(axis=0, thresh=min_count, inplace=True) + sns.set(rc={"figure.figsize":(16, 8)}) + sns.heatmap(features.isna(), cbar=False) + plt.savefig(f'features_nans.png', bbox_inches='tight') + + sys.exit() + return features def impute(df, method='zero'):