From a8fd96d2f170ea07121d4a026ffc09b8241ce547 Mon Sep 17 00:00:00 2001 From: junos Date: Tue, 23 Aug 2022 16:41:41 +0200 Subject: [PATCH] Add analysis using RAPIDS. --- exploration/ex_ml_pipeline.py | 1131 +++++++++++++++++++++++++++-- exploration/expl_esm_labels.py | 10 +- statistical_analysis/adherence.py | 22 +- 3 files changed, 1100 insertions(+), 63 deletions(-) diff --git a/exploration/ex_ml_pipeline.py b/exploration/ex_ml_pipeline.py index 328513a..56cf57d 100644 --- a/exploration/ex_ml_pipeline.py +++ b/exploration/ex_ml_pipeline.py @@ -6,14 +6,14 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.12.0 +# jupytext_version: 1.13.0 # kernelspec: # display_name: straw2analysis # language: python # name: straw2analysis # --- -# %% +# %% jupyter={"source_hidden": true} # %matplotlib inline import datetime import importlib @@ -21,11 +21,15 @@ import os import sys import numpy as np +import matplotlib.pyplot as plt import pandas as pd import seaborn as sns import yaml -from sklearn import linear_model +from pyprojroot import here +from sklearn import linear_model, svm, kernel_ridge, gaussian_process from sklearn.model_selection import LeaveOneGroupOut, cross_val_score +from sklearn.metrics import mean_squared_error, r2_score +from sklearn.impute import SimpleImputer nb_dir = os.path.split(os.getcwd())[0] if nb_dir not in sys.path: @@ -35,14 +39,14 @@ import machine_learning.features_sensor import machine_learning.labels import machine_learning.model -# %% +# %% jupyter={"source_hidden": true} import participants.query_db from features import esm, helper, proximity # %% [markdown] tags=[] # # 1. Get the relevant data -# %% +# %% jupyter={"source_hidden": true} participants_inactive_usernames = participants.query_db.get_usernames( collection_start=datetime.date.fromisoformat("2020-08-01") ) @@ -52,11 +56,11 @@ ptcp_2 = participants_inactive_usernames[0:2] # %% [markdown] jp-MarkdownHeadingCollapsed=true tags=[] # ## 1.1 Labels -# %% +# %% jupyter={"source_hidden": true} df_esm = esm.get_esm_data(ptcp_2) df_esm_preprocessed = esm.preprocess_esm(df_esm) -# %% +# %% jupyter={"source_hidden": true} df_esm_PANAS = df_esm_preprocessed[ (df_esm_preprocessed["questionnaire_id"] == 8) | (df_esm_preprocessed["questionnaire_id"] == 9) @@ -66,7 +70,7 @@ df_esm_PANAS_clean = esm.clean_up_esm(df_esm_PANAS) # %% [markdown] # ## 1.2 Sensor data -# %% +# %% jupyter={"source_hidden": true} df_proximity = proximity.get_proximity_data(ptcp_2) df_proximity = helper.get_date_from_timestamp(df_proximity) df_proximity = proximity.recode_proximity(df_proximity) @@ -77,7 +81,7 @@ df_proximity = proximity.recode_proximity(df_proximity) # %% [markdown] # # 2. Grouping/segmentation -# %% +# %% jupyter={"source_hidden": true} df_esm_PANAS_daily_means = ( df_esm_PANAS_clean.groupby(["participant_id", "date_lj", "questionnaire_id"]) .esm_user_answer_numeric.agg("mean") @@ -85,7 +89,7 @@ df_esm_PANAS_daily_means = ( .rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"}) ) -# %% +# %% jupyter={"source_hidden": true} df_esm_PANAS_daily_means = ( df_esm_PANAS_daily_means.pivot( index=["participant_id", "date_lj"], @@ -98,16 +102,16 @@ df_esm_PANAS_daily_means = ( ) -# %% +# %% jupyter={"source_hidden": true} df_proximity_daily_counts = proximity.count_proximity(df_proximity, ["date_lj"]) -# %% +# %% jupyter={"source_hidden": true} df_proximity_daily_counts # %% [markdown] # # 3. Join features (and export to csv?) 
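+
+# %% [markdown]
+# One way to answer the "export to csv?" question above, as a minimal sketch:
+# the joined frame built in the next cell could be written out with pandas.
+# The file name here is a placeholder, not a path this project defines.
+#
+# ```python
+# df_full_data_daily_means.to_csv("full_data_daily_means.csv", index=False)
+# ```
+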
-# %% +# %% jupyter={"source_hidden": true} df_full_data_daily_means = df_esm_PANAS_daily_means.join( df_proximity_daily_counts ).reset_index() @@ -115,13 +119,13 @@ df_full_data_daily_means = df_esm_PANAS_daily_means.join( # %% [markdown] # # 4. Machine learning model and parameters -# %% +# %% jupyter={"source_hidden": true} lin_reg_proximity = linear_model.LinearRegression() # %% [markdown] # ## 4.1 Validation method -# %% +# %% jupyter={"source_hidden": true} logo = LeaveOneGroupOut() logo.get_n_splits( df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]], @@ -132,7 +136,7 @@ logo.get_n_splits( # %% [markdown] # ## 4.2 Fit results (export?) -# %% +# %% jupyter={"source_hidden": true} cross_val_score( lin_reg_proximity, df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]], @@ -143,13 +147,13 @@ cross_val_score( scoring="r2", ) -# %% +# %% jupyter={"source_hidden": true} lin_reg_proximity.fit( df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]], df_full_data_daily_means["PA"], ) -# %% +# %% jupyter={"source_hidden": true} lin_reg_proximity.score( df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]], df_full_data_daily_means["PA"], @@ -158,78 +162,78 @@ lin_reg_proximity.score( # %% [markdown] # # Merging these into a pipeline -# %% +# %% jupyter={"source_hidden": true} from machine_learning import features_sensor, labels, model, pipeline -# %% +# %% jupyter={"source_hidden": true} importlib.reload(features_sensor) -# %% +# %% jupyter={"source_hidden": true} with open("../machine_learning/config/minimal_features.yaml", "r") as file: sensor_features_params = yaml.safe_load(file) print(sensor_features_params) -# %% +# %% jupyter={"source_hidden": true} sensor_features = machine_learning.features_sensor.SensorFeatures( **sensor_features_params ) sensor_features.data_types -# %% +# %% jupyter={"source_hidden": true} sensor_features.set_participants_label("nokia_0000003") -# %% +# %% jupyter={"source_hidden": true} sensor_features.data_types = ["proximity", "communication"] sensor_features.participants_usernames = ptcp_2 -# %% +# %% jupyter={"source_hidden": true} sensor_features.get_sensor_data("proximity") -# %% +# %% jupyter={"source_hidden": true} sensor_features.set_sensor_data() -# %% +# %% jupyter={"source_hidden": true} sensor_features.get_sensor_data("proximity") -# %% +# %% jupyter={"source_hidden": true} sensor_features.calculate_features(cached=False) features_all_calculated = sensor_features.get_features("all", "all") -# %% +# %% jupyter={"source_hidden": true} sensor_features.calculate_features(cached=True) features_all_read = sensor_features.get_features("all", "all") -# %% +# %% jupyter={"source_hidden": true} features_all_read = features_all_read.reset_index() features_all_read["date_lj"] = features_all_read["date_lj"].dt.date features_all_read.set_index(["participant_id", "date_lj"], inplace=True) # date_lj column is parsed as a date and represented as Timestamp, when read from csv. # When calculated, it is represented as date. 
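+
+# %% [markdown]
+# An equivalent normalization, sketched under the assumption that
+# `features_all_calculated` carries the same (participant_id, date_lj)
+# MultiIndex: promote the calculated dates to Timestamps instead of casting
+# the cached Timestamps down to dates.
+#
+# ```python
+# features_all_calculated.index = features_all_calculated.index.set_levels(
+#     pd.to_datetime(features_all_calculated.index.levels[1]), level="date_lj"
+# )
+# ```
+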
-# %%
+# %% jupyter={"source_hidden": true}
np.isclose(features_all_read, features_all_calculated).all()

-# %%
+# %% jupyter={"source_hidden": true}
with open("../machine_learning/config/minimal_labels.yaml", "r") as file:
    labels_params = yaml.safe_load(file)

-# %%
+# %% jupyter={"source_hidden": true}
labels = machine_learning.labels.Labels(**labels_params)
labels.participants_usernames = ptcp_2
labels.set_participants_label("nokia_0000003")
labels.questionnaires

-# %%
+# %% jupyter={"source_hidden": true}
labels.set_labels()

-# %%
+# %% jupyter={"source_hidden": true}
labels.get_labels("PANAS")

-# %%
+# %% jupyter={"source_hidden": true}
labels.aggregate_labels(cached=False)
labels_calculated = labels.get_aggregated_labels()

-# %%
+# %% jupyter={"source_hidden": true}
labels.aggregate_labels(cached=True)
labels_read = labels.get_aggregated_labels()
labels_read = labels_read.reset_index()
@@ -238,10 +242,10 @@ labels_read.set_index(["participant_id", "date_lj"], inplace=True)
# date_lj column is parsed as a date and represented as Timestamp, when read from csv.
# When calculated, it is represented as date.

-# %%
+# %% jupyter={"source_hidden": true}
np.isclose(labels_read, labels_calculated).all()

-# %%
+# %% jupyter={"source_hidden": true}
model_validation = machine_learning.model.ModelValidation(
    sensor_features.get_features("all", "all"),
    labels.get_aggregated_labels(),
@@ -251,10 +255,1053 @@ model_validation = machine_learning.model.ModelValidation(
model_validation.model = linear_model.LinearRegression()
model_validation.set_cv_method()

-# %%
+# %% jupyter={"source_hidden": true}
model_validation.cross_validate()

-# %%
+# %% jupyter={"source_hidden": true}
model_validation.groups

-# %%
+# %% [markdown]
+# # Use RAPIDS
+
+# %% jupyter={"source_hidden": true}
+with open(here("rapids/config.yaml"), "r") as file:
+    rapids_config = yaml.safe_load(file)
+
+# %% jupyter={"source_hidden": true}
+for key, config in rapids_config.items():
+    if not isinstance(config, dict):  # Skip scalar top-level settings
+        continue
+    providers = config.get("PROVIDERS")  # Only sensor sections define providers
+    if not providers:  # Skip sensors without an implemented provider
+        continue
+    for provider, settings in providers.items():
+        # List only the features that were actually computed.
+        if settings["COMPUTE"] and "FEATURES" in settings:
+            print(key)
+            print(provider)
+            print(settings["FEATURES"])
+
+# %% jupyter={"source_hidden": true}
+features_rapids = pd.read_csv(
+    here("rapids/data/processed/features/all_participants/all_sensor_features.csv"),
+    parse_dates=["local_segment_start_datetime", "local_segment_end_datetime"],
+)
+
+# %% jupyter={"source_hidden": true}
+features_rapids.columns
+
+# %% jupyter={"source_hidden": true}
+features_rapids = features_rapids.assign(date_lj=lambda x: x.local_segment_start_datetime.dt.date)
+
+# %% jupyter={"source_hidden": true}
+features_rapids["participant_id"] = features_rapids["pid"].str.extract(r"(\d+)")
+features_rapids["participant_id"] = pd.to_numeric(features_rapids["participant_id"])
+features_rapids.set_index(["participant_id", "date_lj"], inplace=True)
+
+# %% jupyter={"source_hidden": true}
+with open("../machine_learning/config/minimal_labels.yaml", "r") as file:
+    labels_params = yaml.safe_load(file)
+
+# %% jupyter={"source_hidden": true}
+labels = machine_learning.labels.Labels(**labels_params)
+labels.set_participants_label("all")
+
+# %% jupyter={"source_hidden": true}
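+# Re-read the aggregated labels from the cached CSV (cached=True skips
+# recomputation); the date_lj index level is normalized to date below, as before.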
+labels.aggregate_labels(cached=True) +labels_read = labels.get_aggregated_labels() +labels_read = labels_read.reset_index() +labels_read["date_lj"] = labels_read["date_lj"].dt.date +labels_read.set_index(["participant_id", "date_lj"], inplace=True) +# date_lj column is parsed as a date and represented as Timestamp, when read from csv. +# When calculated, it is represented as date. + +# %% jupyter={"source_hidden": true} +features_rapids.shape + +# %% jupyter={"source_hidden": true} +labels_read.shape + +# %% jupyter={"source_hidden": true} +features_labels = features_rapids.join(labels_read, how="inner").reset_index() + +# %% jupyter={"source_hidden": true} +features_labels.shape + +# %% jupyter={"source_hidden": true} +features_labels.columns + +# %% jupyter={"source_hidden": true} +imputer = SimpleImputer(missing_values=np.nan, strategy='mean') + +# %% jupyter={"source_hidden": true} +feature_columns = features_labels.columns[6:-3] +label_column = "NA" +group_column = "pid" + +# %% jupyter={"source_hidden": true} +lin_reg_rapids = linear_model.LinearRegression() +logo = LeaveOneGroupOut() +logo.get_n_splits( + features_labels[feature_columns], + features_labels[label_column], + groups=features_labels[group_column], +) + +# %% jupyter={"source_hidden": true} +cross_val_score( + lin_reg_rapids, + X=imputer.fit_transform(features_labels[feature_columns]), + y=features_labels[label_column], + groups=features_labels[group_column], + cv=logo, + n_jobs=-1, + scoring="r2", +) + +# %% tags=[] +sns.set(rc={"figure.figsize":(16, 8)}) +sns.heatmap(features_labels[feature_columns].isna(), cbar=False) + +# %% [markdown] tags=[] +# ```yaml +# ALL_CLEANING_INDIVIDUAL: +# PROVIDERS: +# RAPIDS: +# COMPUTE: True +# IMPUTE_SELECTED_EVENT_FEATURES: # Fill NAs with 0 only for event-based features, see table below +# COMPUTE: True +# MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33 # Any feature value in a time segment instance with phone data yield > [MIN_DATA_YIELDED_MINUTES_TO_IMPUTE] will be replaced with a zero. +# COLS_NAN_THRESHOLD: 0.3 # Discard columns with missing value ratios higher than [COLS_NAN_THRESHOLD]. Set to 1 to disable +# COLS_VAR_THRESHOLD: True # Set to True to discard columns with zero variance +# ROWS_NAN_THRESHOLD: 1 # Discard rows with missing value ratios higher than [ROWS_NAN_THRESHOLD]. Set to 1 to disable +# DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES +# DATA_YIELD_RATIO_THRESHOLD: 0.3 # Discard rows with ratiovalidyieldedhours or ratiovalidyieldedminutes feature less than [DATA_YIELD_RATIO_THRESHOLD]. The feature name is determined by [DATA_YIELD_FEATURE] parameter. 
Set to 0 to disable
+# DROP_HIGHLY_CORRELATED_FEATURES:
+# COMPUTE: False
+# MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
+# CORR_THRESHOLD: 0.95
+# SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R
+# ```
+
+# %% jupyter={"source_hidden": true}
+features_rapids_cleaned = pd.read_csv(
+    here("rapids/data/processed/features/all_participants/all_sensor_features_cleaned_rapids.csv"),
+    parse_dates=["local_segment_start_datetime", "local_segment_end_datetime"],
+)
+features_rapids_cleaned = features_rapids_cleaned.assign(date_lj=lambda x: x.local_segment_start_datetime.dt.date)
+features_rapids_cleaned["participant_id"] = features_rapids_cleaned["pid"].str.extract(r"(\d+)")
+features_rapids_cleaned["participant_id"] = pd.to_numeric(features_rapids_cleaned["participant_id"])
+features_rapids_cleaned.set_index(["participant_id", "date_lj"], inplace=True)
+
+# %% jupyter={"source_hidden": true}
+features_cleaned_labels = features_rapids_cleaned.join(labels_read, how="inner").reset_index()
+feature_clean_columns = features_cleaned_labels.columns[6:-3]
+
+# %% jupyter={"source_hidden": true}
+print(feature_columns.shape)
+print(feature_clean_columns.shape)
+
+# %% tags=[]
+sns.set(rc={"figure.figsize":(16, 8)})
+sns.heatmap(features_cleaned_labels[feature_clean_columns].isna(), cbar=False)
+
+# %% jupyter={"source_hidden": true}
+lin_reg_rapids_clean = linear_model.LinearRegression()
+logo = LeaveOneGroupOut()
+logo.get_n_splits(
+    features_cleaned_labels[feature_clean_columns],
+    features_cleaned_labels[label_column],
+    groups=features_cleaned_labels[group_column],
+)
+
+# %% jupyter={"source_hidden": true}
+features_clean_imputed = imputer.fit_transform(features_cleaned_labels[feature_clean_columns])
+
+# %% jupyter={"source_hidden": true}
+cross_val_score(
+    lin_reg_rapids_clean,
+    X=features_clean_imputed,
+    y=features_cleaned_labels[label_column],
+    groups=features_cleaned_labels[group_column],
+    cv=logo,
+    n_jobs=-1,
+    scoring="r2",
+)
+
+# %% jupyter={"source_hidden": true}
+lin_reg_full = linear_model.LinearRegression()
+lin_reg_full.fit(features_clean_imputed, features_cleaned_labels[label_column])
+
+# %% jupyter={"source_hidden": true}
+NA_pred = lin_reg_full.predict(features_clean_imputed)
+
+# %% jupyter={"source_hidden": true}
+# The coefficients
+print("Coefficients: \n", lin_reg_full.coef_)
+# The mean squared error
+print("Mean squared error: %.2f" % mean_squared_error(features_cleaned_labels[label_column], NA_pred))
+# The coefficient of determination: 1 is perfect prediction
+print("Coefficient of determination: %.2f" % r2_score(features_cleaned_labels[label_column], NA_pred))
+
+# %% jupyter={"source_hidden": true}
+feature_clean_columns[np.argmax(lin_reg_full.coef_)]
+
+# %% [markdown]
+# Ratio between stationary time and total location sensed time. A lat/long coordinate pair is labeled as stationary if its speed (distance/time) to the next coordinate pair is less than 1km/hr. A higher value represents a more stationary routine.
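+
+# %% [markdown]
+# A caveat on reading the largest coefficient as the most important feature:
+# the inputs are not standardized, so coefficient magnitudes are not directly
+# comparable across features, and `np.argmax` also ignores large negative
+# coefficients. A minimal scale-aware sketch, assuming sklearn's
+# `StandardScaler` (not used elsewhere in this notebook):
+#
+# ```python
+# from sklearn.preprocessing import StandardScaler
+#
+# features_clean_scaled = StandardScaler().fit_transform(features_clean_imputed)
+# lin_reg_scaled = linear_model.LinearRegression()
+# lin_reg_scaled.fit(features_clean_scaled, features_cleaned_labels[label_column])
+# # Rank by absolute standardized coefficient instead of the raw maximum.
+# feature_clean_columns[np.argmax(np.abs(lin_reg_scaled.coef_))]
+# ```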
+ +# %% jupyter={"source_hidden": true} +plt.scatter(features_clean_imputed[:,np.argmax(lin_reg_full.coef_)], features_cleaned_labels[label_column], color="black") +plt.scatter(features_clean_imputed[:,np.argmax(lin_reg_full.coef_)], NA_pred, color="red", linewidth=3) + +plt.xticks() +plt.yticks() + +fig = plt.gcf() +fig.set_size_inches(18.5, 10.5) + +plt.show() + +# %% [markdown] +# # RAPIDS models + +# %% [markdown] +# ## PANAS negative affect + +# %% jupyter={"source_hidden": true} +model_input = pd.read_csv("../rapids/data/processed/models/population_model/input_PANAS_NA.csv") + +# %% jupyter={"source_hidden": true} +model_input.dropna(axis=0, how="any", subset=["target"], inplace=True) + +# %% jupyter={"source_hidden": true} +index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] +#if "pid" in model_input.columns: +# index_columns.append("pid") +model_input.set_index(index_columns, inplace=True) + +data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"] + +# %% jupyter={"source_hidden": true} +categorical_feature_colnames = ["gender", "startlanguage"] + +# %% jupyter={"source_hidden": true} +categorical_features = data_x[categorical_feature_colnames].copy() + +# %% jupyter={"source_hidden": true} +mode_categorical_features = categorical_features.mode().iloc[0] + +# %% jupyter={"source_hidden": true} +# fillna with mode +categorical_features = categorical_features.fillna(mode_categorical_features) + +# %% jupyter={"source_hidden": true} +# one-hot encoding +categorical_features = categorical_features.apply(lambda col: col.astype("category")) +if not categorical_features.empty: + categorical_features = pd.get_dummies(categorical_features) + +# %% jupyter={"source_hidden": true} +numerical_features = data_x.drop(categorical_feature_colnames, axis=1) + +# %% jupyter={"source_hidden": true} +train_x = pd.concat([numerical_features, categorical_features], axis=1) + +# %% jupyter={"source_hidden": true} +train_x.dtypes + +# %% jupyter={"source_hidden": true} +logo = LeaveOneGroupOut() +logo.get_n_splits( + train_x, + data_y, + groups=data_groups, +) + +# %% jupyter={"source_hidden": true} +sum(data_y.isna()) + +# %% [markdown] +# ### Linear Regression + +# %% jupyter={"source_hidden": true} +lin_reg_rapids = linear_model.LinearRegression() + +# %% jupyter={"source_hidden": true} +imputer = SimpleImputer(missing_values=np.nan, strategy='mean') + +# %% jupyter={"source_hidden": true} +lin_reg_scores = cross_val_score( + lin_reg_rapids, + X=imputer.fit_transform(train_x), + y=data_y, + groups=data_groups, + cv=logo, + n_jobs=-1, + scoring="r2" +) +np.median(lin_reg_scores) + +# %% [markdown] +# ### Ridge regression + +# %% jupyter={"source_hidden": true} +ridge_reg = linear_model.Ridge(alpha=.5) + +# %% tags=[] jupyter={"source_hidden": true} +cross_val_score( + ridge_reg, + X=imputer.fit_transform(train_x), + y=data_y, + groups=data_groups, + cv=logo, + n_jobs=-1, + scoring="r2" +) + +# %% [markdown] +# ### Lasso + +# %% jupyter={"source_hidden": true} +lasso_reg = linear_model.Lasso(alpha=0.1) + +# %% jupyter={"source_hidden": true} +cross_val_score( + lasso_reg, + X=imputer.fit_transform(train_x), + y=data_y, + groups=data_groups, + cv=logo, + n_jobs=-1, + scoring="r2" +) + +# %% [markdown] +# ### Bayesian Ridge + +# %% jupyter={"source_hidden": true} +bayesian_ridge_reg = linear_model.BayesianRidge() + +# %% jupyter={"source_hidden": true} +cross_val_score( + 
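+    # cross_val_score fits a fresh clone of the estimator on every
+    # leave-one-group-out fold, so each participant is scored out-of-sample.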
bayesian_ridge_reg, + X=imputer.fit_transform(train_x), + y=data_y, + groups=data_groups, + cv=logo, + n_jobs=-1, + scoring="r2" +) + +# %% [markdown] +# ### RANSAC (outlier robust regression) + +# %% jupyter={"source_hidden": true} +ransac_reg = linear_model.RANSACRegressor() + +# %% jupyter={"source_hidden": true} +np.median( + cross_val_score( + ransac_reg, + X=imputer.fit_transform(train_x), + y=data_y, + groups=data_groups, + cv=logo, + n_jobs=-1, + scoring="r2" +) +) + +# %% [markdown] +# ### Support vector regression + +# %% jupyter={"source_hidden": true} +svr = svm.SVR() + +# %% jupyter={"source_hidden": true} +np.median( + cross_val_score( + svr, + X=imputer.fit_transform(train_x), + y=data_y, + groups=data_groups, + cv=logo, + n_jobs=-1, + scoring="r2" +) +) + +# %% jupyter={"source_hidden": true} +cross_val_score( +svr, +X=imputer.fit_transform(train_x), +y=data_y, +groups=data_groups, +cv=logo, +n_jobs=-1, +scoring="r2" +) + +# %% [markdown] +# ### Kernel Ridge regression + +# %% jupyter={"source_hidden": true} +kridge = kernel_ridge.KernelRidge() + +# %% jupyter={"source_hidden": true} +cross_val_score( +kridge, +X=imputer.fit_transform(train_x), +y=data_y, +groups=data_groups, +cv=logo, +n_jobs=-1, +scoring="r2" +) + +# %% [markdown] +# ### Gaussian Process Regression + +# %% jupyter={"source_hidden": true} +gpr = gaussian_process.GaussianProcessRegressor() + +# %% jupyter={"source_hidden": true} +cross_val_score( +gpr, +X=imputer.fit_transform(train_x), +y=data_y, +groups=data_groups, +cv=logo, +n_jobs=-1, +scoring="r2" +) + +# %% [markdown] +# ## PANAS positive affect + +# %% jupyter={"source_hidden": true} +model_input = pd.read_csv("../rapids/data/processed/models/population_model/input_PANAS_PA.csv") + +# %% jupyter={"source_hidden": true} +model_input.dropna(axis=0, how="any", subset=["target"], inplace=True) + +# %% jupyter={"source_hidden": true} +index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] +#if "pid" in model_input.columns: +# index_columns.append("pid") +model_input.set_index(index_columns, inplace=True) + +data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"] + +# %% jupyter={"source_hidden": true} +categorical_feature_colnames = ["gender", "startlanguage"] + +# %% jupyter={"source_hidden": true} +categorical_features = data_x[categorical_feature_colnames].copy() + +# %% jupyter={"source_hidden": true} +mode_categorical_features = categorical_features.mode().iloc[0] + +# %% jupyter={"source_hidden": true} +# fillna with mode +categorical_features = categorical_features.fillna(mode_categorical_features) + +# %% jupyter={"source_hidden": true} +# one-hot encoding +categorical_features = categorical_features.apply(lambda col: col.astype("category")) +if not categorical_features.empty: + categorical_features = pd.get_dummies(categorical_features) + +# %% jupyter={"source_hidden": true} +numerical_features = data_x.drop(categorical_feature_colnames, axis=1) + +# %% jupyter={"source_hidden": true} +train_x = pd.concat([numerical_features, categorical_features], axis=1) + +# %% jupyter={"source_hidden": true} +train_x.dtypes + +# %% jupyter={"source_hidden": true} +logo = LeaveOneGroupOut() +logo.get_n_splits( + train_x, + data_y, + groups=data_groups, +) + +# %% jupyter={"source_hidden": true} +sum(data_y.isna()) + +# %% [markdown] +# ### Linear Regression + +# %% jupyter={"source_hidden": true} +lin_reg_rapids = 
+
+# %% jupyter={"source_hidden": true}
+imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
+
+# %% jupyter={"source_hidden": true}
+lin_reg_scores = cross_val_score(
+    lin_reg_rapids,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring="r2"
+)
+np.median(lin_reg_scores)
+
+# %% [markdown]
+# ### Ridge regression
+
+# %% jupyter={"source_hidden": true}
+ridge_reg = linear_model.Ridge(alpha=.5)
+
+# %% tags=[] jupyter={"source_hidden": true}
+cross_val_score(
+    ridge_reg,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring="r2"
+)
+
+# %% [markdown]
+# ### Lasso
+
+# %% jupyter={"source_hidden": true}
+lasso_reg = linear_model.Lasso(alpha=0.1)
+
+# %% jupyter={"source_hidden": true}
+cross_val_score(
+    lasso_reg,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring="r2"
+)
+
+# %% [markdown]
+# ### Bayesian Ridge
+
+# %% jupyter={"source_hidden": true}
+bayesian_ridge_reg = linear_model.BayesianRidge()
+
+# %% jupyter={"source_hidden": true}
+cross_val_score(
+    bayesian_ridge_reg,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring="r2"
+)
+
+# %% [markdown]
+# ### RANSAC (outlier robust regression)
+
+# %% jupyter={"source_hidden": true}
+ransac_reg = linear_model.RANSACRegressor()
+
+# %% jupyter={"source_hidden": true}
+np.median(
+    cross_val_score(
+        ransac_reg,
+        X=imputer.fit_transform(train_x),
+        y=data_y,
+        groups=data_groups,
+        cv=logo,
+        n_jobs=-1,
+        scoring="r2"
+    )
+)
+
+# %% [markdown]
+# ### Support vector regression
+
+# %% jupyter={"source_hidden": true}
+svr = svm.SVR()
+
+# %% jupyter={"source_hidden": true}
+np.median(
+    cross_val_score(
+        svr,
+        X=imputer.fit_transform(train_x),
+        y=data_y,
+        groups=data_groups,
+        cv=logo,
+        n_jobs=-1,
+        scoring="r2"
+    )
+)
+
+# %% jupyter={"source_hidden": true}
+cross_val_score(
+    svr,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring="r2"
+)
+
+# %% [markdown]
+# ### Kernel Ridge regression
+
+# %% jupyter={"source_hidden": true}
+kridge = kernel_ridge.KernelRidge()
+
+# %% jupyter={"source_hidden": true}
+cross_val_score(
+    kridge,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring="r2"
+)
+
+# %% [markdown]
+# ### Gaussian Process Regression
+
+# %% jupyter={"source_hidden": true}
+gpr = gaussian_process.GaussianProcessRegressor()
+
+# %% jupyter={"source_hidden": true}
+cross_val_score(
+    gpr,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring="r2"
+)
+
+# %% [markdown]
+# ## JCQ demand
+
+# %% jupyter={"source_hidden": true}
+model_input = pd.read_csv("../rapids/data/processed/models/population_model/input_JCQ_demand.csv")
+
+# %% jupyter={"source_hidden": true}
+model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)
+
+# %% jupyter={"source_hidden": true}
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+#if "pid" in model_input.columns:
+#    index_columns.append("pid")
+model_input.set_index(index_columns, inplace=True)
+
+data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+
+# %% jupyter={"source_hidden": true}
+categorical_feature_colnames = ["gender", "startlanguage"]
+
+# %% jupyter={"source_hidden": true}
+categorical_features = data_x[categorical_feature_colnames].copy()
+
+# %% jupyter={"source_hidden": true}
+mode_categorical_features = categorical_features.mode().iloc[0]
+
+# %% jupyter={"source_hidden": true}
+# fillna with mode
+categorical_features = categorical_features.fillna(mode_categorical_features)
+
+# %% jupyter={"source_hidden": true}
+# one-hot encoding
+categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+if not categorical_features.empty:
+    categorical_features = pd.get_dummies(categorical_features)
+
+# %% jupyter={"source_hidden": true}
+numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
+
+# %% jupyter={"source_hidden": true}
+train_x = pd.concat([numerical_features, categorical_features], axis=1)
+
+# %% jupyter={"source_hidden": true}
+train_x.dtypes
+
+# %% jupyter={"source_hidden": true}
+logo = LeaveOneGroupOut()
+logo.get_n_splits(
+    train_x,
+    data_y,
+    groups=data_groups,
+)
+
+# %% jupyter={"source_hidden": true}
+sum(data_y.isna())
+
+# %% [markdown]
+# ### Linear Regression
+
+# %% jupyter={"source_hidden": true}
+lin_reg_rapids = linear_model.LinearRegression()
+
+# %% jupyter={"source_hidden": true}
+imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
+
+# %% jupyter={"source_hidden": true}
+lin_reg_scores = cross_val_score(
+    lin_reg_rapids,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring="r2"
+)
+np.median(lin_reg_scores)
+
+# %% jupyter={"source_hidden": true}
+lin_reg_scores
+
+# %% [markdown]
+# ### Ridge regression
+
+# %% jupyter={"source_hidden": true}
+ridge_reg = linear_model.Ridge(alpha=.5)
+
+# %% tags=[] jupyter={"source_hidden": true}
+cross_val_score(
+    ridge_reg,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring="r2"
+)
+
+# %% [markdown]
+# ### Lasso
+
+# %% jupyter={"source_hidden": true}
+lasso_reg = linear_model.Lasso(alpha=0.1)
+
+# %% jupyter={"source_hidden": true}
+cross_val_score(
+    lasso_reg,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring="r2"
+)
+
+# %% [markdown]
+# ### Bayesian Ridge
+
+# %% jupyter={"source_hidden": true}
+bayesian_ridge_reg = linear_model.BayesianRidge()
+
+# %% jupyter={"source_hidden": true}
+cross_val_score(
+    bayesian_ridge_reg,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring="r2"
+)
+
+# %% [markdown]
+# ### RANSAC (outlier robust regression)
+
+# %% jupyter={"source_hidden": true}
+ransac_reg = linear_model.RANSACRegressor()
+
+# %% jupyter={"source_hidden": true}
+np.median(
+    cross_val_score(
+        ransac_reg,
+        X=imputer.fit_transform(train_x),
+        y=data_y,
+        groups=data_groups,
+        cv=logo,
+        n_jobs=-1,
+        scoring="r2"
+    )
+)
+
+# %% [markdown]
+# ### Support vector regression
+
+# %% jupyter={"source_hidden": true}
+svr = svm.SVR()
+
+# %% jupyter={"source_hidden": true}
+np.median(
+    cross_val_score(
+        svr,
+        X=imputer.fit_transform(train_x),
+        y=data_y,
+        groups=data_groups,
+        cv=logo,
+        n_jobs=-1,
+        scoring="r2"
+    )
+)
+
+# %% jupyter={"source_hidden": true}
+cross_val_score(
+    svr,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring="r2"
+)
+
+# %% [markdown]
+# ### Kernel Ridge regression
+
+# %% jupyter={"source_hidden": true}
+kridge = kernel_ridge.KernelRidge()
+
+# %% jupyter={"source_hidden": true}
+cross_val_score(
+    kridge,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring="r2"
+)
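+
+# %% [markdown]
+# Note: `KernelRidge()` defaults to `kernel="linear"`, so the cell above is
+# effectively ridge regression solved in its dual form. A non-linear variant
+# would be specified explicitly, e.g. (a sketch, not run elsewhere in this
+# notebook):
+#
+# ```python
+# kridge_rbf = kernel_ridge.KernelRidge(kernel="rbf", alpha=1.0)
+# ```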
+
+# %% [markdown]
+# ### Gaussian Process Regression
+
+# %% jupyter={"source_hidden": true}
+gpr = gaussian_process.GaussianProcessRegressor()
+
+# %% jupyter={"source_hidden": true}
+cross_val_score(
+    gpr,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring="r2"
+)
+
+# %% [markdown]
+# ## JCQ control
+
+# %% jupyter={"source_hidden": true}
+model_input = pd.read_csv("../rapids/data/processed/models/population_model/input_JCQ_control.csv")
+
+# %% jupyter={"source_hidden": true}
+model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)
+
+# %% jupyter={"source_hidden": true}
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+#if "pid" in model_input.columns:
+#    index_columns.append("pid")
+model_input.set_index(index_columns, inplace=True)
+
+data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+
+# %% jupyter={"source_hidden": true}
+categorical_feature_colnames = ["gender", "startlanguage"]
+
+# %% jupyter={"source_hidden": true}
+categorical_features = data_x[categorical_feature_colnames].copy()
+
+# %% jupyter={"source_hidden": true}
+mode_categorical_features = categorical_features.mode().iloc[0]
+
+# %% jupyter={"source_hidden": true}
+# fillna with mode
+categorical_features = categorical_features.fillna(mode_categorical_features)
+
+# %% jupyter={"source_hidden": true}
+# one-hot encoding
+categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+if not categorical_features.empty:
+    categorical_features = pd.get_dummies(categorical_features)
+
+# %% jupyter={"source_hidden": true}
+numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
+
+# %% jupyter={"source_hidden": true}
+train_x = pd.concat([numerical_features, categorical_features], axis=1)
+
+# %% jupyter={"source_hidden": true}
+train_x.dtypes
+
+# %% jupyter={"source_hidden": true}
+logo = LeaveOneGroupOut()
+logo.get_n_splits(
+    train_x,
+    data_y,
+    groups=data_groups,
+)
+
+# %% jupyter={"source_hidden": true}
+sum(data_y.isna())
+
+# %% [markdown]
+# ### Linear Regression
+
+# %% jupyter={"source_hidden": true}
+lin_reg_rapids = linear_model.LinearRegression()
+
+# %% jupyter={"source_hidden": true}
+imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
+
+# %% jupyter={"source_hidden": true}
+lin_reg_scores = cross_val_score(
+    lin_reg_rapids,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring="r2"
+)
+np.median(lin_reg_scores)
+
+# %% [markdown]
+# ### Ridge regression
+
+# %% jupyter={"source_hidden": true}
+ridge_reg = linear_model.Ridge(alpha=.5)
+
+# %% tags=[] jupyter={"source_hidden": true}
+cross_val_score(
+    ridge_reg,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring="r2"
+)
+
+# %% [markdown]
+# ### Lasso
+
+# %% jupyter={"source_hidden": true}
+lasso_reg = linear_model.Lasso(alpha=0.1)
+
+# %% jupyter={"source_hidden": true}
+cross_val_score(
+    lasso_reg,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring="r2"
+)
+
+# %% [markdown]
+# ### Bayesian Ridge
+
+# %% jupyter={"source_hidden": true}
+bayesian_ridge_reg = linear_model.BayesianRidge()
+
+# %% jupyter={"source_hidden": true}
+cross_val_score(
+    bayesian_ridge_reg,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring="r2"
+)
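+
+# %% [markdown]
+# Throughout these sections the imputer is fit on the full feature matrix
+# before cross-validation, so fold test rows influence the imputation means.
+# A leak-free variant would impute inside each training fold; a minimal
+# sketch using sklearn's `make_pipeline` (an assumption, not used elsewhere
+# in this notebook):
+#
+# ```python
+# from sklearn.pipeline import make_pipeline
+#
+# pipe = make_pipeline(
+#     SimpleImputer(missing_values=np.nan, strategy="mean"),
+#     linear_model.BayesianRidge(),
+# )
+# cross_val_score(
+#     pipe, X=train_x, y=data_y, groups=data_groups, cv=logo, scoring="r2"
+# )
+# ```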
+
+# %% [markdown]
+# ### RANSAC (outlier robust regression)
+
+# %% jupyter={"source_hidden": true}
+ransac_reg = linear_model.RANSACRegressor()
+
+# %% jupyter={"source_hidden": true}
+np.median(
+    cross_val_score(
+        ransac_reg,
+        X=imputer.fit_transform(train_x),
+        y=data_y,
+        groups=data_groups,
+        cv=logo,
+        n_jobs=-1,
+        scoring="r2"
+    )
+)
+
+# %% [markdown]
+# ### Support vector regression
+
+# %% jupyter={"source_hidden": true}
+svr = svm.SVR()
+
+# %% jupyter={"source_hidden": true}
+np.median(
+    cross_val_score(
+        svr,
+        X=imputer.fit_transform(train_x),
+        y=data_y,
+        groups=data_groups,
+        cv=logo,
+        n_jobs=-1,
+        scoring="r2"
+    )
+)
+
+# %% jupyter={"source_hidden": true}
+cross_val_score(
+    svr,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring="r2"
+)
+
+# %% [markdown] tags=[]
+# ### Kernel Ridge regression
+
+# %% jupyter={"source_hidden": true}
+kridge = kernel_ridge.KernelRidge()
+
+# %% jupyter={"source_hidden": true}
+cross_val_score(
+    kridge,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring="r2"
+)
+
+# %% [markdown]
+# ### Gaussian Process Regression
+
+# %% jupyter={"source_hidden": true}
+gpr = gaussian_process.GaussianProcessRegressor()
+
+# %% jupyter={"source_hidden": true} tags=[]
+cross_val_score(
+    gpr,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring="r2"
+)
+
+# %% jupyter={"source_hidden": true}
diff --git a/exploration/expl_esm_labels.py b/exploration/expl_esm_labels.py
index 487b5e1..a0e706c 100644
--- a/exploration/expl_esm_labels.py
+++ b/exploration/expl_esm_labels.py
@@ -7,7 +7,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
-# jupytext_version: 1.11.2
+# jupytext_version: 1.13.0
# kernelspec:
# display_name: straw2analysis
# language: python
@@ -17,6 +17,7 @@
# %%
import os
import sys
+import datetime

import seaborn as sns

@@ -26,6 +27,7 @@ if nb_dir not in sys.path:
import participants.query_db
from features.esm import *
from features.esm_JCQ import *
+from features.esm_SAM import *

# %%
participants_inactive_usernames = participants.query_db.get_usernames(
@@ -99,6 +101,12 @@ df_esm_PANAS_summary_participant[df_esm_PANAS_summary_participant["std"] < 0.1]
# %% [markdown]
# # Stress appraisal measure

+# %%
+df_SAM_all = extract_stressful_events(df_esm_inactive)
+
+# %%
+df_SAM_all.head()
+
# %%
df_esm_SAM = df_esm_preprocessed[
    (df_esm_preprocessed["questionnaire_id"] >= 87)
diff --git a/statistical_analysis/adherence.py b/statistical_analysis/adherence.py
index 2589fd8..24a4ccb 100644
--- a/statistical_analysis/adherence.py
+++ b/statistical_analysis/adherence.py
@@ -6,7 +6,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
-# jupytext_version: 1.12.0
+# jupytext_version: 1.13.0
# kernelspec:
# display_name: straw2analysis
# language: python
@@ -14,25 +14,7 @@
# ---

# %%
-# %matplotlib inline
-import datetime
-import os
-import sys
-
-import matplotlib.pyplot as plt
-import pandas as pd
-import seaborn as sns
-import statsmodels.api as sm
-import statsmodels.formula.api as smf
-
-nb_dir = os.path.split(os.getcwd())[0]
-if nb_dir not in sys.path:
-    sys.path.append(nb_dir)
-import participants.query_db
-from features.esm import *
-
-# %%
-SAVE_FIGS = True
+SAVE_FIGS = False
FIG_HEIGHT = 5
FIG_ASPECT = 1.7
FIG_COLOUR = "#28827C"