# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---
# %%
# %matplotlib inline
import datetime
import importlib
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yaml
from pyprojroot import here
from sklearn import linear_model
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score

nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import machine_learning.features_sensor
import machine_learning.labels
import machine_learning.model

# %%
import participants.query_db
from features import esm, helper, proximity
# %% [markdown] tags=[]
# # 1. Get the relevant data
# %%
participants_inactive_usernames = participants.query_db.get_usernames(
    collection_start=datetime.date.fromisoformat("2020-08-01")
)
# Consider only two participants to simplify.
ptcp_2 = participants_inactive_usernames[0:2]
# %% [markdown] jp-MarkdownHeadingCollapsed=true tags=[]
# ## 1.1 Labels
# %%
df_esm = esm.get_esm_data(ptcp_2)
df_esm_preprocessed = esm.preprocess_esm(df_esm)
# %%
df_esm_PANAS = df_esm_preprocessed[
    (df_esm_preprocessed["questionnaire_id"] == 8)
    | (df_esm_preprocessed["questionnaire_id"] == 9)
]
df_esm_PANAS_clean = esm.clean_up_esm(df_esm_PANAS)
# %% [markdown]
# ## 1.2 Sensor data
# %%
df_proximity = proximity.get_proximity_data(ptcp_2)
df_proximity = helper.get_date_from_timestamp(df_proximity)
df_proximity = proximity.recode_proximity(df_proximity)
# %% [markdown]
# ## 1.3 Standardization/personalization
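# %% [markdown]
# Standardization is not implemented in this notebook yet. As a minimal sketch
# (the helper below is hypothetical, not part of the features package),
# personalization could z-score each numeric column within each participant, so
# that values express deviations from that participant's own baseline.
# %%
def standardize_within_participant(df: pd.DataFrame) -> pd.DataFrame:
    # Z-score all numeric columns per participant; assumes a participant_id column.
    numeric_cols = df.select_dtypes("number").columns.drop(
        "participant_id", errors="ignore"
    )
    standardized = df.copy()
    standardized[numeric_cols] = df.groupby("participant_id")[numeric_cols].transform(
        lambda col: (col - col.mean()) / col.std()
    )
    return standardized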
# %% [markdown]
# # 2. Grouping/segmentation
# %%
df_esm_PANAS_daily_means = (
    df_esm_PANAS_clean.groupby(["participant_id", "date_lj", "questionnaire_id"])
    .esm_user_answer_numeric.agg("mean")
    .reset_index()
    .rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"})
)
# %%
df_esm_PANAS_daily_means = (
    df_esm_PANAS_daily_means.pivot(
        index=["participant_id", "date_lj"],
        columns="questionnaire_id",
        values="esm_numeric_mean",
    )
    .reset_index(col_level=1)
    .rename(columns={8.0: "PA", 9.0: "NA"})
    .set_index(["participant_id", "date_lj"])
)
# %%
df_proximity_daily_counts = proximity.count_proximity(df_proximity, ["date_lj"])
# %%
df_proximity_daily_counts
# %% [markdown]
# # 3. Join features (and export to csv?)
# %%
df_full_data_daily_means = df_esm_PANAS_daily_means.join(
    df_proximity_daily_counts
).reset_index()
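# %% [markdown]
# The csv export asked about in the heading above is left open; as a minimal
# sketch, a snapshot could be written out at this point (the path is only
# illustrative, and the line is kept commented out to avoid accidental writes):
# %%
# df_full_data_daily_means.to_csv(here("data/daily_means_PANAS_proximity.csv"), index=False)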
# %% [markdown]
# # 4. Machine learning model and parameters
# %%
lin_reg_proximity = linear_model.LinearRegression()
# %% [markdown]
# ## 4.1 Validation method
# %%
logo = LeaveOneGroupOut()
# With participants as groups, this is leave-one-subject-out cross-validation.
logo.get_n_splits(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
    groups=df_full_data_daily_means["participant_id"],
)
# %% [markdown]
# ## 4.2 Fit results (export?)
# %%
cross_val_score(
    lin_reg_proximity,
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
    groups=df_full_data_daily_means["participant_id"],
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %%
lin_reg_proximity.fit(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
)
# %%
# In-sample R^2: the model is scored on the same data it was fit on.
lin_reg_proximity.score(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
)
# %% [markdown]
# # Merging these into a pipeline
# %%
from machine_learning import features_sensor, labels, model, pipeline

# %%
importlib.reload(features_sensor)

# %%
with open("../machine_learning/config/minimal_features.yaml", "r") as file:
    sensor_features_params = yaml.safe_load(file)
print(sensor_features_params)
# %%
sensor_features = machine_learning.features_sensor.SensorFeatures(
    **sensor_features_params
)
sensor_features.data_types
# %%
sensor_features.set_participants_label("nokia_0000003")
# %%
sensor_features.data_types = ["proximity", "communication"]
sensor_features.participants_usernames = ptcp_2

# %%
sensor_features.get_sensor_data("proximity")
# %%
sensor_features.set_sensor_data()
# %%
sensor_features.get_sensor_data("proximity")
# %%
sensor_features.calculate_features(cached=False)
features_all_calculated = sensor_features.get_features("all", "all")
# %%
sensor_features.calculate_features(cached=True)
features_all_read = sensor_features.get_features("all", "all")
# %%
features_all_read = features_all_read.reset_index()
features_all_read["date_lj"] = features_all_read["date_lj"].dt.date
features_all_read.set_index(["participant_id", "date_lj"], inplace=True)
# When read from csv, date_lj is parsed as a datetime and represented as a
# Timestamp; when calculated directly, it is represented as a date.
# Convert it here so that the two frames can be compared.

# %%
np.isclose(features_all_read, features_all_calculated).all()
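# %% [markdown]
# The same Timestamp-to-date conversion recurs for the labels below; a small
# helper (hypothetical, not part of the machine_learning package) could wrap it:
# %%
def normalize_date_index(df: pd.DataFrame) -> pd.DataFrame:
    # Convert a csv-parsed Timestamp date_lj back to datetime.date
    # and (re)build the (participant_id, date_lj) index.
    df = df.reset_index()
    df["date_lj"] = df["date_lj"].dt.date
    return df.set_index(["participant_id", "date_lj"])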
# %%
with open("../machine_learning/config/minimal_labels.yaml", "r") as file:
    labels_params = yaml.safe_load(file)
# %%
labels = machine_learning.labels.Labels(**labels_params)
labels.participants_usernames = ptcp_2
labels.set_participants_label("nokia_0000003")
labels.questionnaires
# %%
labels.set_labels()
# %%
labels.get_labels("PANAS")
# %%
labels.aggregate_labels(cached=False)
labels_calculated = labels.get_aggregated_labels()
# %%
labels.aggregate_labels(cached=True)
labels_read = labels.get_aggregated_labels()
labels_read = labels_read.reset_index()
labels_read["date_lj"] = labels_read["date_lj"].dt.date
labels_read.set_index(["participant_id", "date_lj"], inplace=True)
# As with the features, date_lj is parsed as a Timestamp when read from csv,
# so convert it to a date to match the calculated labels.
# %%
np.isclose(labels_read, labels_calculated).all()
# %%
model_validation = machine_learning.model.ModelValidation(
    sensor_features.get_features("all", "all"),
    labels.get_aggregated_labels(),
    group_variable="participant_id",
    cv_name="loso",
)
model_validation.model = linear_model.LinearRegression()
model_validation.set_cv_method()
# %%
model_validation.cross_validate()
# %%
model_validation.groups
# %% [markdown]
# # Use RAPIDS
# %%
with open(here("rapids/config.yaml"), "r") as file:
    rapids_config = yaml.safe_load(file)
# %%
for key in rapids_config.keys():
    if isinstance(rapids_config[key], dict):  # Skip top-level scalar configs
        if "PROVIDERS" in rapids_config[key]:  # Retain sensors (which have providers)
            if rapids_config[key]["PROVIDERS"]:  # Skip sensors without implemented providers
                for provider in rapids_config[key]["PROVIDERS"]:
                    # Check that the features were actually computed
                    if rapids_config[key]["PROVIDERS"][provider]["COMPUTE"]:
                        if "FEATURES" in rapids_config[key]["PROVIDERS"][provider]:
                            print(key)
                            print(provider)
                            print(rapids_config[key]["PROVIDERS"][provider]["FEATURES"])
# %%
features_rapids = pd.read_csv(
    here("rapids/data/processed/features/all_participants/all_sensor_features.csv"),
    parse_dates=["local_segment_start_datetime", "local_segment_end_datetime"],
)
# %%
features_rapids.columns
# %%
features_rapids = features_rapids.assign(
    date_lj=lambda x: x.local_segment_start_datetime.dt.date
)
# %%
features_rapids["participant_id"] = features_rapids["pid"].str.extract(r"(\d+)")
features_rapids["participant_id"] = pd.to_numeric(features_rapids["participant_id"])
features_rapids.set_index(["participant_id", "date_lj"], inplace=True)
# %%
with open("../machine_learning/config/minimal_labels.yaml", "r") as file:
    labels_params = yaml.safe_load(file)
# %%
labels = machine_learning.labels.Labels(**labels_params)
labels.set_participants_label("all")
# %%
labels.aggregate_labels(cached=True)
labels_read = labels.get_aggregated_labels()
labels_read = labels_read.reset_index()
labels_read["date_lj"] = labels_read["date_lj"].dt.date
labels_read.set_index(["participant_id", "date_lj"], inplace=True)
# Again, date_lj is parsed as a Timestamp when read from csv; convert it to a
# date so that it matches the features' index.
# %%
features_rapids.shape
# %%
labels_read.shape
# %%
features_labels = features_rapids.join(labels_read, how="inner").reset_index()
# %%
features_labels.shape
# %%
features_labels.columns
# %%
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# %%
# Select the sensor feature columns; the slice skips the leading metadata
# columns and the trailing label columns of the joined dataframe.
feature_columns = features_labels.columns[6:-3]
label_column = "NA"
group_column = "pid"
# %%
lin_reg_rapids = linear_model.LinearRegression()
logo = LeaveOneGroupOut()
logo.get_n_splits(
    features_labels[feature_columns],
    features_labels[label_column],
    groups=features_labels[group_column],
)
# %%
cross_val_score(
    lin_reg_rapids,
    X=imputer.fit_transform(features_labels[feature_columns]),
    y=features_labels[label_column],
    groups=features_labels[group_column],
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %%
sns.set(rc={"figure.figsize": (16, 8)})
sns.heatmap(features_labels[feature_columns].isna(), cbar=False)
# %% [markdown] tags=[]
# ```yaml
# ALL_CLEANING_INDIVIDUAL:
#   PROVIDERS:
#     RAPIDS:
#       COMPUTE: True
#       IMPUTE_SELECTED_EVENT_FEATURES: # Fill NAs with 0 only for event-based features, see table below
#         COMPUTE: True
#         MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33 # Any feature value in a time segment instance with phone data yield > [MIN_DATA_YIELDED_MINUTES_TO_IMPUTE] will be replaced with a zero.
#       COLS_NAN_THRESHOLD: 0.3 # Discard columns with missing value ratios higher than [COLS_NAN_THRESHOLD]. Set to 1 to disable
#       COLS_VAR_THRESHOLD: True # Set to True to discard columns with zero variance
#       ROWS_NAN_THRESHOLD: 1 # Discard rows with missing value ratios higher than [ROWS_NAN_THRESHOLD]. Set to 1 to disable
#       DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
#       DATA_YIELD_RATIO_THRESHOLD: 0.3 # Discard rows with ratiovalidyieldedhours or ratiovalidyieldedminutes feature less than [DATA_YIELD_RATIO_THRESHOLD]. The feature name is determined by the [DATA_YIELD_FEATURE] parameter. Set to 0 to disable
#       DROP_HIGHLY_CORRELATED_FEATURES:
#         COMPUTE: False
#         MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
#         CORR_THRESHOLD: 0.95
#       SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R
# ```
# %%
features_rapids_cleaned = pd.read_csv(
    here(
        "rapids/data/processed/features/all_participants/all_sensor_features_cleaned_rapids.csv"
    ),
    parse_dates=["local_segment_start_datetime", "local_segment_end_datetime"],
)
features_rapids_cleaned = features_rapids_cleaned.assign(
    date_lj=lambda x: x.local_segment_start_datetime.dt.date
)
features_rapids_cleaned["participant_id"] = features_rapids_cleaned["pid"].str.extract(
    r"(\d+)"
)
features_rapids_cleaned["participant_id"] = pd.to_numeric(
    features_rapids_cleaned["participant_id"]
)
features_rapids_cleaned.set_index(["participant_id", "date_lj"], inplace=True)
# %%
features_cleaned_labels = features_rapids_cleaned.join(
    labels_read, how="inner"
).reset_index()
feature_clean_columns = features_cleaned_labels.columns[6:-3]
# %%
print(feature_columns.shape)
print(feature_clean_columns.shape)
# %%
sns.set(rc={"figure.figsize": (16, 8)})
sns.heatmap(features_cleaned_labels[feature_clean_columns].isna(), cbar=False)
# %%
lin_reg_rapids_clean = linear_model.LinearRegression()
logo = LeaveOneGroupOut()
logo.get_n_splits(
    features_cleaned_labels[feature_clean_columns],
    features_cleaned_labels[label_column],
    groups=features_cleaned_labels[group_column],
)
# %%
features_clean_imputed = imputer.fit_transform(
    features_cleaned_labels[feature_clean_columns]
)
# %%
cross_val_score(
    lin_reg_rapids_clean,
    X=features_clean_imputed,
    y=features_cleaned_labels[label_column],
    groups=features_cleaned_labels[group_column],
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %%
lin_reg_full = linear_model.LinearRegression()
lin_reg_full.fit(features_clean_imputed, features_cleaned_labels[label_column])
# %%
NA_pred = lin_reg_full.predict(features_clean_imputed)
# %%
# The coefficients
print("Coefficients: \n", lin_reg_full.coef_)
# The mean squared error (in-sample: computed on the same data used for fitting)
print(
    "Mean squared error: %.2f"
    % mean_squared_error(features_cleaned_labels[label_column], NA_pred)
)
# The coefficient of determination: 1 is perfect prediction
print(
    "Coefficient of determination: %.2f"
    % r2_score(features_cleaned_labels[label_column], NA_pred)
)
# %%
feature_clean_columns[np.argmax(lin_reg_full.coef_)]
# %% [markdown]
# Ratio between stationary time and total location-sensed time. A lat/long
# coordinate pair is labeled stationary if its speed (distance/time) to the next
# coordinate pair is less than 1 km/h. A higher value represents a more
# stationary routine.
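# %% [markdown]
# As a toy illustration of that definition (made-up coordinates, not the RAPIDS
# implementation), the ratio can be computed by thresholding the speed between
# consecutive location fixes at 1 km/h:
# %%
locations = pd.DataFrame(
    {
        "latitude": [46.05, 46.05, 46.06, 46.10],
        "longitude": [14.50, 14.50, 14.51, 14.55],
        "timestamp": pd.date_range("2020-08-01 08:00", periods=4, freq="30min"),
    }
)
# Approximate distances in km between consecutive fixes (equirectangular approximation).
lat_km = locations["latitude"].diff() * 110.574
lon_km = (
    locations["longitude"].diff() * 111.320 * np.cos(np.radians(locations["latitude"]))
)
distance_km = np.sqrt(lat_km**2 + lon_km**2)
hours = locations["timestamp"].diff().dt.total_seconds() / 3600
speed_km_h = distance_km / hours
# Proportion of transitions slower than 1 km/h, i.e. labeled stationary.
speed_km_h.dropna().lt(1).mean()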
# %%
plt.scatter(
    features_clean_imputed[:, np.argmax(lin_reg_full.coef_)],
    features_cleaned_labels[label_column],
    color="black",
)
plt.scatter(
    features_clean_imputed[:, np.argmax(lin_reg_full.coef_)],
    NA_pred,
    color="red",
    linewidth=3,
)
plt.xticks()
plt.yticks()
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()