# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %%
# %matplotlib inline
import datetime
import importlib
import os
import sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import yaml
from pyprojroot import here
from sklearn import linear_model
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import machine_learning.features_sensor
import machine_learning.labels
import machine_learning.model

# %%
import participants.query_db
from features import esm, helper, proximity

# %% [markdown] tags=[]
# # 1. Get the relevant data

# %%
participants_inactive_usernames = participants.query_db.get_usernames(
    collection_start=datetime.date.fromisoformat("2020-08-01")
)
# Consider only two participants to simplify.
ptcp_2 = participants_inactive_usernames[0:2]

# %% [markdown] jp-MarkdownHeadingCollapsed=true tags=[]
# ## 1.1 Labels

# %%
df_esm = esm.get_esm_data(ptcp_2)
df_esm_preprocessed = esm.preprocess_esm(df_esm)

# %%
df_esm_PANAS = df_esm_preprocessed[
    (df_esm_preprocessed["questionnaire_id"] == 8)
    | (df_esm_preprocessed["questionnaire_id"] == 9)
]
df_esm_PANAS_clean = esm.clean_up_esm(df_esm_PANAS)

# %% [markdown]
# ## 1.2 Sensor data

# %%
df_proximity = proximity.get_proximity_data(ptcp_2)
df_proximity = helper.get_date_from_timestamp(df_proximity)
df_proximity = proximity.recode_proximity(df_proximity)

# %% [markdown]
# ## 1.3 Standardization/personalization

# %% [markdown]
# # 2. Grouping/segmentation

# %%
df_esm_PANAS_daily_means = (
    df_esm_PANAS_clean.groupby(["participant_id", "date_lj", "questionnaire_id"])
    .esm_user_answer_numeric.agg("mean")
    .reset_index()
    .rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"})
)

# %%
df_esm_PANAS_daily_means = (
    df_esm_PANAS_daily_means.pivot(
        index=["participant_id", "date_lj"],
        columns="questionnaire_id",
        values="esm_numeric_mean",
    )
    .reset_index(col_level=1)
    .rename(columns={8.0: "PA", 9.0: "NA"})
    .set_index(["participant_id", "date_lj"])
)

# %%
df_proximity_daily_counts = proximity.count_proximity(df_proximity, ["date_lj"])

# %%
df_proximity_daily_counts

# %% [markdown]
# # 3. Join features (and export to csv?)

# %%
df_full_data_daily_means = df_esm_PANAS_daily_means.join(
    df_proximity_daily_counts
).reset_index()

# %% [markdown]
# # 4. Machine learning model and parameters

# %%
lin_reg_proximity = linear_model.LinearRegression()

# %% [markdown]
# ## 4.1 Validation method

# %%
logo = LeaveOneGroupOut()
logo.get_n_splits(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
    groups=df_full_data_daily_means["participant_id"],
)
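# %% [markdown]
# Leave-one-group-out with `participant_id` as the group means every fold holds out all days of
# one participant (leave-one-subject-out). A quick sketch to inspect the folds, reusing the
# variables defined above:

# %%
for train_idx, test_idx in logo.split(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
    groups=df_full_data_daily_means["participant_id"],
):
    # Each test fold should contain exactly one participant.
    print(df_full_data_daily_means["participant_id"].iloc[test_idx].unique())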
# %% [markdown]
# ## 4.2 Fit results (export?)

# %%
cross_val_score(
    lin_reg_proximity,
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
    groups=df_full_data_daily_means["participant_id"],
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)

# %%
lin_reg_proximity.fit(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
)

# %%
lin_reg_proximity.score(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
)

# %% [markdown]
# # Merging these into a pipeline

# %%
from machine_learning import features_sensor, labels, model, pipeline

# %%
importlib.reload(features_sensor)

# %%
with open("../machine_learning/config/minimal_features.yaml", "r") as file:
    sensor_features_params = yaml.safe_load(file)
print(sensor_features_params)

# %%
sensor_features = machine_learning.features_sensor.SensorFeatures(
    **sensor_features_params
)
sensor_features.data_types

# %%
sensor_features.set_participants_label("nokia_0000003")

# %%
sensor_features.data_types = ["proximity", "communication"]
sensor_features.participants_usernames = ptcp_2

# %%
sensor_features.get_sensor_data("proximity")

# %%
sensor_features.set_sensor_data()

# %%
sensor_features.get_sensor_data("proximity")

# %%
sensor_features.calculate_features(cached=False)
features_all_calculated = sensor_features.get_features("all", "all")

# %%
sensor_features.calculate_features(cached=True)
features_all_read = sensor_features.get_features("all", "all")

# %%
features_all_read = features_all_read.reset_index()
features_all_read["date_lj"] = features_all_read["date_lj"].dt.date
features_all_read.set_index(["participant_id", "date_lj"], inplace=True)
# When read from csv, the date_lj column is parsed as a Timestamp;
# when calculated directly, it is represented as a plain date.

# %%
np.isclose(features_all_read, features_all_calculated).all()

# %%
with open("../machine_learning/config/minimal_labels.yaml", "r") as file:
    labels_params = yaml.safe_load(file)

# %%
labels = machine_learning.labels.Labels(**labels_params)
labels.participants_usernames = ptcp_2
labels.set_participants_label("nokia_0000003")
labels.questionnaires

# %%
labels.set_labels()

# %%
labels.get_labels("PANAS")

# %%
labels.aggregate_labels(cached=False)
labels_calculated = labels.get_aggregated_labels()

# %%
labels.aggregate_labels(cached=True)
labels_read = labels.get_aggregated_labels()
labels_read = labels_read.reset_index()
labels_read["date_lj"] = labels_read["date_lj"].dt.date
labels_read.set_index(["participant_id", "date_lj"], inplace=True)
# When read from csv, the date_lj column is parsed as a Timestamp;
# when calculated directly, it is represented as a plain date.
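# %% [markdown]
# This reset_index/dt.date/set_index dance recurs whenever cached features or labels are read
# back from csv. A small helper could encapsulate it; the function below is a sketch, not an
# existing function in the repository.

# %%
def normalize_date_index(df):
    """Convert the date_lj index level from Timestamp (as parsed from csv) to a plain date."""
    df = df.reset_index()
    df["date_lj"] = df["date_lj"].dt.date
    return df.set_index(["participant_id", "date_lj"])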
# %%
np.isclose(labels_read, labels_calculated).all()

# %%
model_validation = machine_learning.model.ModelValidation(
    sensor_features.get_features("all", "all"),
    labels.get_aggregated_labels(),
    group_variable="participant_id",
    cv_name="loso",
)
model_validation.model = linear_model.LinearRegression()
model_validation.set_cv_method()

# %%
model_validation.cross_validate()

# %%
model_validation.groups

# %% [markdown]
# # Use RAPIDS

# %%
with open(here("rapids/config.yaml"), "r") as file:
    rapids_config = yaml.safe_load(file)

# %%
for key in rapids_config.keys():
    if isinstance(rapids_config[key], dict):  # Skip simple top-level settings
        if "PROVIDERS" in rapids_config[key]:  # Retain features (those that have providers)
            if rapids_config[key]["PROVIDERS"]:  # Skip non-implemented features
                for provider in rapids_config[key]["PROVIDERS"]:
                    if rapids_config[key]["PROVIDERS"][provider]["COMPUTE"]:
                        # Check that the features were actually calculated
                        if "FEATURES" in rapids_config[key]["PROVIDERS"][provider]:
                            print(key)
                            print(provider)
                            print(rapids_config[key]["PROVIDERS"][provider]["FEATURES"])

# %%
features_rapids = pd.read_csv(
    here("rapids/data/processed/features/all_participants/all_sensor_features.csv"),
    parse_dates=["local_segment_start_datetime", "local_segment_end_datetime"],
)

# %%
features_rapids.columns

# %%
features_rapids = features_rapids.assign(
    date_lj=lambda x: x.local_segment_start_datetime.dt.date
)

# %%
features_rapids["participant_id"] = features_rapids["pid"].str.extract(r"(\d+)")
features_rapids["participant_id"] = pd.to_numeric(features_rapids["participant_id"])
features_rapids.set_index(["participant_id", "date_lj"], inplace=True)

# %%
with open("../machine_learning/config/minimal_labels.yaml", "r") as file:
    labels_params = yaml.safe_load(file)

# %%
labels = machine_learning.labels.Labels(**labels_params)
labels.set_participants_label("all")

# %%
labels.aggregate_labels(cached=True)
labels_read = labels.get_aggregated_labels()
labels_read = labels_read.reset_index()
labels_read["date_lj"] = labels_read["date_lj"].dt.date
labels_read.set_index(["participant_id", "date_lj"], inplace=True)
# When read from csv, the date_lj column is parsed as a Timestamp;
# when calculated directly, it is represented as a plain date.

# %%
features_rapids.shape

# %%
labels_read.shape

# %%
features_labels = features_rapids.join(labels_read, how="inner").reset_index()

# %%
features_labels.shape

# %%
features_labels.columns

# %%
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

# %%
feature_columns = features_labels.columns[6:-3]
label_column = "NA"
group_column = "pid"

# %%
lin_reg_rapids = linear_model.LinearRegression()
logo = LeaveOneGroupOut()
logo.get_n_splits(
    features_labels[feature_columns],
    features_labels[label_column],
    groups=features_labels[group_column],
)

# %%
cross_val_score(
    lin_reg_rapids,
    X=imputer.fit_transform(features_labels[feature_columns]),
    y=features_labels[label_column],
    groups=features_labels[group_column],
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)

# %%
sns.set(rc={"figure.figsize": (16, 8)})
sns.heatmap(features_labels[feature_columns].isna(), cbar=False)
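# %% [markdown]
# Note that the cross-validation above fits the imputer on the full dataset before splitting, so
# fold means leak between train and test participants. A leakage-free alternative (a sketch, not
# part of the current pipeline) would wrap imputation and regression in an sklearn Pipeline, so
# the means are estimated on each training fold only.

# %%
from sklearn.pipeline import make_pipeline

imputed_lin_reg = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy="mean"),
    linear_model.LinearRegression(),
)
cross_val_score(
    imputed_lin_reg,
    X=features_labels[feature_columns],
    y=features_labels[label_column],
    groups=features_labels[group_column],
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)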
# %% [markdown] tags=[]
# ```yaml
# ALL_CLEANING_INDIVIDUAL:
#   PROVIDERS:
#     RAPIDS:
#       COMPUTE: True
#       IMPUTE_SELECTED_EVENT_FEATURES: # Fill NAs with 0 only for event-based features, see table below
#         COMPUTE: True
#         MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33 # Any feature value in a time segment instance with phone data yield > [MIN_DATA_YIELDED_MINUTES_TO_IMPUTE] will be replaced with a zero.
#       COLS_NAN_THRESHOLD: 0.3 # Discard columns with missing value ratios higher than [COLS_NAN_THRESHOLD]. Set to 1 to disable
#       COLS_VAR_THRESHOLD: True # Set to True to discard columns with zero variance
#       ROWS_NAN_THRESHOLD: 1 # Discard rows with missing value ratios higher than [ROWS_NAN_THRESHOLD]. Set to 1 to disable
#       DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
#       DATA_YIELD_RATIO_THRESHOLD: 0.3 # Discard rows with ratiovalidyieldedhours or ratiovalidyieldedminutes feature less than [DATA_YIELD_RATIO_THRESHOLD]. The feature name is determined by the [DATA_YIELD_FEATURE] parameter. Set to 0 to disable
#       DROP_HIGHLY_CORRELATED_FEATURES:
#         COMPUTE: False
#         MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
#         CORR_THRESHOLD: 0.95
#       SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R
# ```

# %%
features_rapids_cleaned = pd.read_csv(
    here("rapids/data/processed/features/all_participants/all_sensor_features_cleaned_rapids.csv"),
    parse_dates=["local_segment_start_datetime", "local_segment_end_datetime"],
)
features_rapids_cleaned = features_rapids_cleaned.assign(
    date_lj=lambda x: x.local_segment_start_datetime.dt.date
)
features_rapids_cleaned["participant_id"] = features_rapids_cleaned["pid"].str.extract(r"(\d+)")
features_rapids_cleaned["participant_id"] = pd.to_numeric(features_rapids_cleaned["participant_id"])
features_rapids_cleaned.set_index(["participant_id", "date_lj"], inplace=True)

# %%
features_cleaned_labels = features_rapids_cleaned.join(labels_read, how="inner").reset_index()
feature_clean_columns = features_cleaned_labels.columns[6:-3]

# %%
print(feature_columns.shape)
print(feature_clean_columns.shape)

# %%
sns.set(rc={"figure.figsize": (16, 8)})
sns.heatmap(features_cleaned_labels[feature_clean_columns].isna(), cbar=False)

# %%
lin_reg_rapids_clean = linear_model.LinearRegression()
logo = LeaveOneGroupOut()
logo.get_n_splits(
    features_cleaned_labels[feature_clean_columns],
    features_cleaned_labels[label_column],
    groups=features_cleaned_labels[group_column],
)

# %%
features_clean_imputed = imputer.fit_transform(features_cleaned_labels[feature_clean_columns])

# %%
cross_val_score(
    lin_reg_rapids_clean,
    X=features_clean_imputed,
    y=features_cleaned_labels[label_column],
    groups=features_cleaned_labels[group_column],
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)

# %%
lin_reg_full = linear_model.LinearRegression()
lin_reg_full.fit(features_clean_imputed, features_cleaned_labels[label_column])

# %%
NA_pred = lin_reg_full.predict(features_clean_imputed)

# %%
# The coefficients
print("Coefficients: \n", lin_reg_full.coef_)
# The mean squared error
print(
    "Mean squared error: %.2f"
    % mean_squared_error(features_cleaned_labels[label_column], NA_pred)
)
# The coefficient of determination: 1 is perfect prediction
print(
    "Coefficient of determination: %.2f"
    % r2_score(features_cleaned_labels[label_column], NA_pred)
)

# %%
feature_clean_columns[np.argmax(lin_reg_full.coef_)]

# %% [markdown]
# Ratio between stationary time and total location-sensed time. A lat/long coordinate pair is
# labeled as stationary if its speed (distance/time) to the next coordinate pair is less than
# 1 km/h. A higher value represents a more stationary routine. (A minimal sketch of this labeling
# rule follows after the plot below.)

# %%
plt.scatter(
    features_clean_imputed[:, np.argmax(lin_reg_full.coef_)],
    features_cleaned_labels[label_column],
    color="black",
)
plt.scatter(
    features_clean_imputed[:, np.argmax(lin_reg_full.coef_)],
    NA_pred,
    color="red",
    linewidth=3,
)
plt.xticks()
plt.yticks()
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()
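# %% [markdown]
# A minimal sketch of the stationarity rule described above, assuming haversine distances between
# consecutive GPS fixes and millisecond timestamps; the column names (`latitude`, `longitude`,
# `timestamp`) are illustrative and not taken from the RAPIDS output.

# %%
def label_stationary(df_location, speed_threshold_kmh=1.0):
    """Label each coordinate pair as stationary if the speed to the next pair is below the threshold."""
    lat = np.radians(df_location["latitude"].to_numpy())
    lon = np.radians(df_location["longitude"].to_numpy())
    # Haversine distance (km) between consecutive fixes
    dlat = np.diff(lat)
    dlon = np.diff(lon)
    a = np.sin(dlat / 2) ** 2 + np.cos(lat[:-1]) * np.cos(lat[1:]) * np.sin(dlon / 2) ** 2
    distance_km = 2 * 6371 * np.arcsin(np.sqrt(a))
    # Elapsed time (h) between consecutive fixes
    time_h = np.diff(df_location["timestamp"].to_numpy()) / (1000 * 60 * 60)
    speed_kmh = distance_km / time_h
    # The last fix has no successor, so it remains unlabeled (NaN)
    return np.append(speed_kmh < speed_threshold_kmh, np.nan)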