From 832eb6137ed89c1a54a8cb2328151d0d35063ded Mon Sep 17 00:00:00 2001
From: junos
Date: Wed, 19 Jan 2022 12:53:03 +0100
Subject: [PATCH] ML with RAPIDS and missing values.

---
Reviewer notes (text between the three-dash line and the diff is not part of
the commit message):
 * This patch joins the RAPIDS sensor features with the aggregated labels on
   (participant_id, date_lj), mean-imputes missing feature values and scores
   a LinearRegression with leave-one-group-out cross-validation (groups are
   taken from the "pid" column).
 * The SimpleImputer is fitted on all rows before cross_val_score splits
   them, so rows of the held-out group contribute to the imputed column
   means.  Fitting the imputer and the regression as a single estimator
   would keep the imputation inside each fold.
 * feature_columns is selected by position (columns[6:-3]) and therefore
   depends on the column order produced by the join.
 * minimal_labels.yaml is opened through a path relative to the working
   directory, whereas rapids/config.yaml is resolved with here().

 exploration/ex_ml_pipeline.py | 106 ++++++++++++++++++++++++++++++----
 1 file changed, 94 insertions(+), 12 deletions(-)

diff --git a/exploration/ex_ml_pipeline.py b/exploration/ex_ml_pipeline.py
index 39bbdbd..57346a2 100644
--- a/exploration/ex_ml_pipeline.py
+++ b/exploration/ex_ml_pipeline.py
@@ -27,6 +27,7 @@ import yaml
 from pyprojroot import here
 from sklearn import linear_model
 from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
+from sklearn.impute import SimpleImputer
 
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
@@ -266,33 +267,114 @@ with open(here("rapids/config.yaml"), "r") as file:
     rapids_config = yaml.safe_load(file)
 
 # %%
-for key in rapids_config.keys():
+for key in rapids_config.keys():
     if isinstance(rapids_config[key], dict):  # Remove top-level configs
-        if "PROVIDERS" in rapids_config[key]:  # Retain features (that have providers)
+        if ("PROVIDERS" in rapids_config[key]):  # Retain features (that have providers)
             if rapids_config[key]["PROVIDERS"]:  # Remove non-implemented features
                 for provider in rapids_config[key]["PROVIDERS"]:
-                    if rapids_config[key]["PROVIDERS"][provider][
-                        "COMPUTE"
-                    ]:  # Check that the features were actually calculated
+                    if rapids_config[key]["PROVIDERS"][provider]["COMPUTE"]:  # Check that the features were actually calculated
                         if "FEATURES" in rapids_config[key]["PROVIDERS"][provider]:
                             print(key)
                             print(provider)
                             print(rapids_config[key]["PROVIDERS"][provider]["FEATURES"])
 
 # %%
-features_rapids = pd.read_csv(
-    here("rapids/data/processed/features/all_participants/all_sensor_features.csv"),
-    parse_dates=["local_segment_start_datetime", "local_segment_end_datetime"],
-)
+features_rapids = pd.read_csv(here("rapids/data/processed/features/all_participants/all_sensor_features.csv"),
+                              parse_dates=["local_segment_start_datetime", "local_segment_end_datetime"])
 
 # %%
 features_rapids.columns
 
 # %%
-features_rapids = features_rapids.assign(
-    date_lj=lambda x: x.local_segment_start_datetime.dt.date
-)
+features_rapids = features_rapids.assign(date_lj=lambda x: x.local_segment_start_datetime.dt.date)
 
 # %%
 features_rapids["participant_id"] = features_rapids["pid"].str.extract("(\d+)")
 features_rapids["participant_id"] = pd.to_numeric(features_rapids["participant_id"])
+features_rapids.set_index(["participant_id", "date_lj"], inplace=True)
+
+# %%
+with open("../machine_learning/config/minimal_labels.yaml", "r") as file:
+    labels_params = yaml.safe_load(file)
+
+# %%
+labels = machine_learning.labels.Labels(**labels_params)
+labels.set_participants_label("all")
+
+# %%
+labels.aggregate_labels(cached=True)
+labels_read = labels.get_aggregated_labels()
+labels_read = labels_read.reset_index()
+labels_read["date_lj"] = labels_read["date_lj"].dt.date
+labels_read.set_index(["participant_id", "date_lj"], inplace=True)
+# date_lj column is parsed as a date and represented as Timestamp, when read from csv.
+# When calculated, it is represented as date.
+
+# %%
+features_rapids.shape
+
+# %%
+labels_read.shape
+
+# %%
+features_labels = features_rapids.join(labels_read, how="inner").reset_index()
+
+# %%
+features_labels.shape
+
+# %%
+features_labels.columns
+
+# %%
+imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
+
+# %%
+feature_columns = features_labels.columns[6:-3]
+label_column = "NA"
+group_column = "pid"
+
+# %%
+lin_reg_rapids = linear_model.LinearRegression()
+logo = LeaveOneGroupOut()
+logo.get_n_splits(
+    features_labels[feature_columns],
+    features_labels[label_column],
+    groups=features_labels[group_column],
+)
+
+# %%
+cross_val_score(
+    lin_reg_rapids,
+    X=imputer.fit_transform(features_labels[feature_columns]),
+    y=features_labels[label_column],
+    groups=features_labels[group_column],
+    cv=logo,
+    n_jobs=-1,
+    scoring="r2",
+)
+
+# %%
+sns.set(rc={"figure.figsize":(16, 8)})
+sns.heatmap(features_labels[feature_columns].isna(), cbar=False)
+
+# %% [markdown] tags=[]
+# ```yaml
+# ALL_CLEANING_INDIVIDUAL:
+#   PROVIDERS:
+#     RAPIDS:
+#       COMPUTE: True
+#       IMPUTE_SELECTED_EVENT_FEATURES: # Fill NAs with 0 only for event-based features, see table below
+#         COMPUTE: True
+#         MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33 # Any feature value in a time segment instance with phone data yield > [MIN_DATA_YIELDED_MINUTES_TO_IMPUTE] will be replaced with a zero.
+#       COLS_NAN_THRESHOLD: 0.3 # Discard columns with missing value ratios higher than [COLS_NAN_THRESHOLD]. Set to 1 to disable
+#       COLS_VAR_THRESHOLD: True # Set to True to discard columns with zero variance
+#       ROWS_NAN_THRESHOLD: 1 # Discard rows with missing value ratios higher than [ROWS_NAN_THRESHOLD]. Set to 1 to disable
+#       DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
+#       DATA_YIELD_RATIO_THRESHOLD: 0.3 # Discard rows with ratiovalidyieldedhours or ratiovalidyieldedminutes feature less than [DATA_YIELD_RATIO_THRESHOLD]. The feature name is determined by [DATA_YIELD_FEATURE] parameter. Set to 0 to disable
+#       DROP_HIGHLY_CORRELATED_FEATURES:
+#         COMPUTE: False
+#         MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
+#         CORR_THRESHOLD: 0.95
+#       SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R
+# ```
+
+# %%