# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %%
# %matplotlib inline
import datetime
import importlib
import os
import sys

import numpy as np
import pandas as pd
import seaborn as sns
import yaml
from pyprojroot import here
from sklearn import linear_model
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score

nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import machine_learning.features_sensor
import machine_learning.labels
import machine_learning.model

# %%
import participants.query_db
from features import esm, helper, proximity

# %% [markdown] tags=[]
# # 1. Get the relevant data

# %%
participants_inactive_usernames = participants.query_db.get_usernames(
    collection_start=datetime.date.fromisoformat("2020-08-01")
)
# Consider only two participants to simplify.
ptcp_2 = participants_inactive_usernames[0:2]

# %% [markdown] jp-MarkdownHeadingCollapsed=true tags=[]
# ## 1.1 Labels

# %%
df_esm = esm.get_esm_data(ptcp_2)
df_esm_preprocessed = esm.preprocess_esm(df_esm)

# %%
df_esm_PANAS = df_esm_preprocessed[
    (df_esm_preprocessed["questionnaire_id"] == 8)
    | (df_esm_preprocessed["questionnaire_id"] == 9)
]
df_esm_PANAS_clean = esm.clean_up_esm(df_esm_PANAS)

# %% [markdown]
# ## 1.2 Sensor data

# %%
df_proximity = proximity.get_proximity_data(ptcp_2)
df_proximity = helper.get_date_from_timestamp(df_proximity)
df_proximity = proximity.recode_proximity(df_proximity)

# %% [markdown]
# ## 1.3 Standardization/personalization

# %% [markdown]
# # 2. Grouping/segmentation

# %%
df_esm_PANAS_daily_means = (
    df_esm_PANAS_clean.groupby(["participant_id", "date_lj", "questionnaire_id"])
    .esm_user_answer_numeric.agg("mean")
    .reset_index()
    .rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"})
)

# %%
df_esm_PANAS_daily_means = (
    df_esm_PANAS_daily_means.pivot(
        index=["participant_id", "date_lj"],
        columns="questionnaire_id",
        values="esm_numeric_mean",
    )
    .reset_index(col_level=1)
    .rename(columns={8.0: "PA", 9.0: "NA"})
    .set_index(["participant_id", "date_lj"])
)

# %%
df_proximity_daily_counts = proximity.count_proximity(df_proximity, ["date_lj"])

# %%
df_proximity_daily_counts

# %% [markdown]
# # 3. Join features (and export to csv?)

# %%
df_full_data_daily_means = df_esm_PANAS_daily_means.join(
    df_proximity_daily_counts
).reset_index()

# %% [markdown]
# # 4. Machine learning model and parameters

# %%
lin_reg_proximity = linear_model.LinearRegression()

# %% [markdown]
# ## 4.1 Validation method

# %%
logo = LeaveOneGroupOut()
logo.get_n_splits(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
    groups=df_full_data_daily_means["participant_id"],
)

# %% [markdown]
# ## 4.2 Fit results (export?)
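# %% [markdown]
# Before looking at the actual scores, a minimal, self-contained illustration of the
# validation scheme configured above: `LeaveOneGroupOut` holds out all rows of one
# group (here, one participant) per fold. The toy arrays below are hypothetical and
# not part of the study data.

# %%
_toy_X = np.array([[1.0], [2.0], [3.0], [4.0]])
_toy_y = np.array([1.0, 2.0, 3.0, 4.0])
_toy_groups = np.array(["a", "a", "b", "b"])  # two hypothetical participants
for _train_idx, _test_idx in LeaveOneGroupOut().split(
    _toy_X, _toy_y, groups=_toy_groups
):
    # Each fold tests on exactly one group's rows.
    print("train:", _train_idx, "test:", _test_idx)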
# %%
cross_val_score(
    lin_reg_proximity,
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
    groups=df_full_data_daily_means["participant_id"],
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)

# %%
lin_reg_proximity.fit(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
)

# %%
lin_reg_proximity.score(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
)

# %% [markdown]
# # Merging these into a pipeline

# %%
from machine_learning import features_sensor, labels, model, pipeline

# %%
importlib.reload(features_sensor)

# %%
with open("../machine_learning/config/minimal_features.yaml", "r") as file:
    sensor_features_params = yaml.safe_load(file)
print(sensor_features_params)

# %%
sensor_features = machine_learning.features_sensor.SensorFeatures(
    **sensor_features_params
)
sensor_features.data_types

# %%
sensor_features.set_participants_label("nokia_0000003")

# %%
sensor_features.data_types = ["proximity", "communication"]
sensor_features.participants_usernames = ptcp_2

# %%
sensor_features.get_sensor_data("proximity")

# %%
sensor_features.set_sensor_data()

# %%
sensor_features.get_sensor_data("proximity")

# %%
sensor_features.calculate_features(cached=False)
features_all_calculated = sensor_features.get_features("all", "all")

# %%
sensor_features.calculate_features(cached=True)
features_all_read = sensor_features.get_features("all", "all")

# %%
features_all_read = features_all_read.reset_index()
features_all_read["date_lj"] = features_all_read["date_lj"].dt.date
features_all_read.set_index(["participant_id", "date_lj"], inplace=True)
# The date_lj column is parsed into a Timestamp when read from CSV,
# whereas it is represented as a plain date when freshly calculated.

# %%
np.isclose(features_all_read, features_all_calculated).all()

# %%
with open("../machine_learning/config/minimal_labels.yaml", "r") as file:
    labels_params = yaml.safe_load(file)

# %%
# Note: this rebinds the name labels (imported above as a module) to a Labels instance.
labels = machine_learning.labels.Labels(**labels_params)
labels.participants_usernames = ptcp_2
labels.set_participants_label("nokia_0000003")
labels.questionnaires

# %%
labels.set_labels()

# %%
labels.get_labels("PANAS")

# %%
labels.aggregate_labels(cached=False)
labels_calculated = labels.get_aggregated_labels()

# %%
labels.aggregate_labels(cached=True)
labels_read = labels.get_aggregated_labels()
labels_read = labels_read.reset_index()
labels_read["date_lj"] = labels_read["date_lj"].dt.date
labels_read.set_index(["participant_id", "date_lj"], inplace=True)
# As with the features, date_lj is a Timestamp when read from CSV
# and a plain date when freshly calculated.
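# %% [markdown]
# The same Timestamp-vs-date mismatch is handled twice above, once for the features
# and once for the labels. A small helper could factor out the normalization; the
# function below is a hypothetical sketch, not part of the machine_learning package.

# %%
def normalize_date_index(df: pd.DataFrame) -> pd.DataFrame:
    """Coerce the date_lj index level to plain dates so that frames read
    from CSV align with freshly calculated ones."""
    df = df.reset_index()
    df["date_lj"] = pd.to_datetime(df["date_lj"]).dt.date
    return df.set_index(["participant_id", "date_lj"])


# Example usage (equivalent to the manual steps above):
# labels_read = normalize_date_index(labels.get_aggregated_labels())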
# %%
np.isclose(labels_read, labels_calculated).all()

# %%
model_validation = machine_learning.model.ModelValidation(
    sensor_features.get_features("all", "all"),
    labels.get_aggregated_labels(),
    group_variable="participant_id",
    cv_name="loso",
)
model_validation.model = linear_model.LinearRegression()
model_validation.set_cv_method()

# %%
model_validation.cross_validate()

# %%
model_validation.groups

# %% [markdown]
# # Use RAPIDS

# %%
with open(here("rapids/config.yaml"), "r") as file:
    rapids_config = yaml.safe_load(file)

# %%
# Print each sensor and provider for which RAPIDS was configured to compute features.
for key in rapids_config.keys():
    if isinstance(rapids_config[key], dict):  # Skip top-level settings that are not sensor sections
        if "PROVIDERS" in rapids_config[key]:  # Retain only sensors that have providers
            if rapids_config[key]["PROVIDERS"]:  # Skip sensors with no implemented providers
                for provider in rapids_config[key]["PROVIDERS"]:
                    if rapids_config[key]["PROVIDERS"][provider]["COMPUTE"]:
                        # Check that the features were actually calculated
                        if "FEATURES" in rapids_config[key]["PROVIDERS"][provider]:
                            print(key)
                            print(provider)
                            print(rapids_config[key]["PROVIDERS"][provider]["FEATURES"])

# %%
features_rapids = pd.read_csv(
    here("rapids/data/processed/features/all_participants/all_sensor_features.csv"),
    parse_dates=["local_segment_start_datetime", "local_segment_end_datetime"],
)

# %%
features_rapids.columns

# %%
features_rapids = features_rapids.assign(
    date_lj=lambda x: x.local_segment_start_datetime.dt.date
)

# %%
# Use a raw string for the regex to avoid an invalid escape sequence warning.
features_rapids["participant_id"] = features_rapids["pid"].str.extract(r"(\d+)")
features_rapids["participant_id"] = pd.to_numeric(features_rapids["participant_id"])
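# %% [markdown]
# With participant_id and date_lj in place, the RAPIDS features could be joined to the
# aggregated labels in the same way as in section 3. The cell below is a sketch under
# the assumption that the numeric participant_id values extracted from pid match those
# used by the Labels class; this has not been verified here, and df_rapids_daily is a
# hypothetical name.

# %%
df_rapids_daily = features_rapids.set_index(["participant_id", "date_lj"]).join(
    labels.get_aggregated_labels()
)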