# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---
# %% jupyter={"source_hidden": true}
# %matplotlib inline
import datetime
import importlib
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yaml
from pyprojroot import here
from sklearn import linear_model, svm, kernel_ridge, gaussian_process
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
import machine_learning.features_sensor
import machine_learning.labels
import machine_learning.model
# %% jupyter={"source_hidden": true}
import participants.query_db
from features import esm, helper, proximity
# %% [markdown] tags=[]
# # 1. Get the relevant data
# %% jupyter={"source_hidden": true}
participants_inactive_usernames = participants.query_db.get_usernames(
    collection_start=datetime.date.fromisoformat("2020-08-01")
)
# Consider only two participants to simplify.
ptcp_2 = participants_inactive_usernames[0:2]
# %% [markdown] jp-MarkdownHeadingCollapsed=true tags=[]
# ## 1.1 Labels
# %% jupyter={"source_hidden": true}
df_esm = esm.get_esm_data(ptcp_2)
df_esm_preprocessed = esm.preprocess_esm(df_esm)
# %% jupyter={"source_hidden": true}
df_esm_PANAS = df_esm_preprocessed[
    (df_esm_preprocessed["questionnaire_id"] == 8)
    | (df_esm_preprocessed["questionnaire_id"] == 9)
]
df_esm_PANAS_clean = esm.clean_up_esm(df_esm_PANAS)
# %% [markdown]
# ## 1.2 Sensor data
# %% jupyter={"source_hidden": true}
df_proximity = proximity.get_proximity_data(ptcp_2)
df_proximity = helper.get_date_from_timestamp(df_proximity)
df_proximity = proximity.recode_proximity(df_proximity)
# %% [markdown]
# ## 1.3 Standardization/personalization
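#
# As a minimal sketch of what per-participant standardization could look like (an assumed
# approach, not the original implementation): z-score each feature within participant, so
# that models compare deviations from a person's own baseline rather than absolute levels.
# %% jupyter={"source_hidden": true}
# Hypothetical illustration on synthetic data; the column name is a placeholder.
df_example = pd.DataFrame(
    {
        "participant_id": [1, 1, 1, 2, 2, 2],
        "freq_prox_near": [1.0, 2.0, 3.0, 10.0, 20.0, 30.0],
    }
)
df_example["freq_prox_near_std"] = df_example.groupby("participant_id")[
    "freq_prox_near"
].transform(lambda x: (x - x.mean()) / x.std())
df_example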
# %% [markdown]
# # 2. Grouping/segmentation
# %% jupyter={"source_hidden": true}
df_esm_PANAS_daily_means = (
    df_esm_PANAS_clean.groupby(["participant_id", "date_lj", "questionnaire_id"])
    .esm_user_answer_numeric.agg("mean")
    .reset_index()
    .rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"})
)
# %% jupyter={"source_hidden": true}
df_esm_PANAS_daily_means = (
    df_esm_PANAS_daily_means.pivot(
        index=["participant_id", "date_lj"],
        columns="questionnaire_id",
        values="esm_numeric_mean",
    )
    .reset_index(col_level=1)
    .rename(columns={8.0: "PA", 9.0: "NA"})
    .set_index(["participant_id", "date_lj"])
)
# %% jupyter={"source_hidden": true}
df_proximity_daily_counts = proximity.count_proximity(df_proximity, ["date_lj"])
# %% jupyter={"source_hidden": true}
df_proximity_daily_counts
# %% [markdown]
# # 3. Join features (and export to csv?)
# %% jupyter={"source_hidden": true}
df_full_data_daily_means = df_esm_PANAS_daily_means.join(
    df_proximity_daily_counts
).reset_index()
# %% [markdown]
# # 4. Machine learning model and parameters
# %% jupyter={"source_hidden": true}
lin_reg_proximity = linear_model.LinearRegression()
# %% [markdown]
# ## 4.1 Validation method
# %% jupyter={"source_hidden": true}
logo = LeaveOneGroupOut()
logo.get_n_splits(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
    groups=df_full_data_daily_means["participant_id"],
)
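# %% [markdown]
# Leave-one-group-out CV holds out all of one participant's rows per fold, so the number of
# splits equals the number of participants. A toy illustration of the splitter's behaviour
# (synthetic data, not the study data):
# %% jupyter={"source_hidden": true}
# Each fold's test set contains exactly one group (participant).
toy_X = np.arange(6).reshape(6, 1)
toy_groups = np.array([1, 1, 2, 2, 3, 3])
for train_index, test_index in LeaveOneGroupOut().split(toy_X, groups=toy_groups):
    print("train:", train_index, "test:", test_index)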
# %% [markdown]
# ## 4.2 Fit results (export?)
# %% jupyter={"source_hidden": true}
cross_val_score(
    lin_reg_proximity,
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
    groups=df_full_data_daily_means["participant_id"],
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% jupyter={"source_hidden": true}
lin_reg_proximity.fit(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
)
# %% jupyter={"source_hidden": true}
lin_reg_proximity.score(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
)
# %% [markdown]
# # Merging these into a pipeline
# %% jupyter={"source_hidden": true}
from machine_learning import features_sensor, labels, model, pipeline
# %% jupyter={"source_hidden": true}
importlib.reload(features_sensor)
# %% jupyter={"source_hidden": true}
with open("../machine_learning/config/minimal_features.yaml", "r") as file:
    sensor_features_params = yaml.safe_load(file)
print(sensor_features_params)
# %% jupyter={"source_hidden": true}
sensor_features = machine_learning.features_sensor.SensorFeatures(
    **sensor_features_params
)
sensor_features.data_types
# %% jupyter={"source_hidden": true}
sensor_features.set_participants_label("nokia_0000003")
# %% jupyter={"source_hidden": true}
sensor_features.data_types = ["proximity", "communication"]
sensor_features.participants_usernames = ptcp_2
# %% jupyter={"source_hidden": true}
sensor_features.get_sensor_data("proximity")
# %% jupyter={"source_hidden": true}
sensor_features.set_sensor_data()
# %% jupyter={"source_hidden": true}
sensor_features.get_sensor_data("proximity")
# %% jupyter={"source_hidden": true}
sensor_features.calculate_features(cached=False)
features_all_calculated = sensor_features.get_features("all", "all")
# %% jupyter={"source_hidden": true}
sensor_features.calculate_features(cached=True)
features_all_read = sensor_features.get_features("all", "all")
# %% jupyter={"source_hidden": true}
features_all_read = features_all_read.reset_index()
features_all_read["date_lj"] = features_all_read["date_lj"].dt.date
features_all_read.set_index(["participant_id", "date_lj"], inplace=True)
# When read from csv, the date_lj column is parsed as a pandas Timestamp,
# but the directly calculated features represent it as a datetime.date,
# so convert it here before comparing the two dataframes.
# %% jupyter={"source_hidden": true}
np.isclose(features_all_read, features_all_calculated).all()
# %% jupyter={"source_hidden": true}
with open("../machine_learning/config/minimal_labels.yaml", "r") as file:
    labels_params = yaml.safe_load(file)
# %% jupyter={"source_hidden": true}
labels = machine_learning.labels.Labels(**labels_params)
labels.participants_usernames = ptcp_2
labels.set_participants_label("nokia_0000003")
labels.questionnaires
# %% jupyter={"source_hidden": true}
labels.set_labels()
# %% jupyter={"source_hidden": true}
labels.get_labels("PANAS")
# %% jupyter={"source_hidden": true}
labels.aggregate_labels(cached=False)
labels_calculated = labels.get_aggregated_labels()
# %% jupyter={"source_hidden": true}
labels.aggregate_labels(cached=True)
labels_read = labels.get_aggregated_labels()
labels_read = labels_read.reset_index()
labels_read["date_lj"] = labels_read["date_lj"].dt.date
labels_read.set_index(["participant_id", "date_lj"], inplace=True)
# When read from csv, the date_lj column is parsed as a pandas Timestamp,
# but the directly calculated labels represent it as a datetime.date,
# so convert it here before comparing the two dataframes.
# %% jupyter={"source_hidden": true}
np.isclose(labels_read, labels_calculated).all()
# %% jupyter={"source_hidden": true}
model_validation = machine_learning.model.ModelValidation(
    sensor_features.get_features("all", "all"),
    labels.get_aggregated_labels(),
    group_variable="participant_id",
    cv_name="loso",
)
model_validation.model = linear_model.LinearRegression()
model_validation.set_cv_method()
# %% jupyter={"source_hidden": true}
model_validation.cross_validate()
# %% jupyter={"source_hidden": true}
model_validation.groups
# %% [markdown]
# # Use RAPIDS
# %% jupyter={"source_hidden": true}
with open(here("rapids/config.yaml"), "r") as file:
    rapids_config = yaml.safe_load(file)
# %% jupyter={"source_hidden": true}
for key in rapids_config.keys():
    if isinstance(rapids_config[key], dict):  # Skip top-level scalar settings
        if "PROVIDERS" in rapids_config[key]:  # Retain only sensors that have providers
            if rapids_config[key]["PROVIDERS"]:  # Skip sensors with no implemented providers
                for provider in rapids_config[key]["PROVIDERS"]:
                    # Consider only providers whose features were actually computed
                    if rapids_config[key]["PROVIDERS"][provider]["COMPUTE"]:
                        if "FEATURES" in rapids_config[key]["PROVIDERS"][provider]:
                            print(key)
                            print(provider)
                            print(rapids_config[key]["PROVIDERS"][provider]["FEATURES"])
# %% jupyter={"source_hidden": true}
features_rapids = pd.read_csv(
    here("rapids/data/processed/features/all_participants/all_sensor_features.csv"),
    parse_dates=["local_segment_start_datetime", "local_segment_end_datetime"],
)
# %% jupyter={"source_hidden": true}
features_rapids.columns
# %% jupyter={"source_hidden": true}
features_rapids = features_rapids.assign(
    date_lj=lambda x: x.local_segment_start_datetime.dt.date
)
# %% jupyter={"source_hidden": true}
features_rapids["participant_id"] = features_rapids["pid"].str.extract(r"(\d+)")
features_rapids["participant_id"] = pd.to_numeric(features_rapids["participant_id"])
features_rapids.set_index(["participant_id", "date_lj"], inplace=True)
# %% jupyter={"source_hidden": true}
with open("../machine_learning/config/minimal_labels.yaml", "r") as file:
    labels_params = yaml.safe_load(file)
# %% jupyter={"source_hidden": true}
labels = machine_learning.labels.Labels(**labels_params)
labels.set_participants_label("all")
# %% jupyter={"source_hidden": true}
labels.aggregate_labels(cached=True)
labels_read = labels.get_aggregated_labels()
labels_read = labels_read.reset_index()
labels_read["date_lj"] = labels_read["date_lj"].dt.date
labels_read.set_index(["participant_id", "date_lj"], inplace=True)
# When read from csv, the date_lj column is parsed as a pandas Timestamp,
# but the RAPIDS features use datetime.date, so convert it before joining.
# %% jupyter={"source_hidden": true}
features_rapids.shape
# %% jupyter={"source_hidden": true}
labels_read.shape
# %% jupyter={"source_hidden": true}
features_labels = features_rapids.join(labels_read, how="inner").reset_index()
# %% jupyter={"source_hidden": true}
features_labels.shape
# %% jupyter={"source_hidden": true}
features_labels.columns
# %% jupyter={"source_hidden": true}
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# %% jupyter={"source_hidden": true}
feature_columns = features_labels.columns[6:-3]
label_column = "NA"
group_column = "pid"
# %% jupyter={"source_hidden": true}
lin_reg_rapids = linear_model.LinearRegression()
logo = LeaveOneGroupOut()
logo.get_n_splits(
    features_labels[feature_columns],
    features_labels[label_column],
    groups=features_labels[group_column],
)
# %% jupyter={"source_hidden": true}
cross_val_score(
    lin_reg_rapids,
    X=imputer.fit_transform(features_labels[feature_columns]),
    y=features_labels[label_column],
    groups=features_labels[group_column],
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% tags=[]
sns.set(rc={"figure.figsize": (16, 8)})
sns.heatmap(features_labels[feature_columns].isna(), cbar=False)
# %% [markdown] tags=[]
# ```yaml
# ALL_CLEANING_INDIVIDUAL:
#   PROVIDERS:
#     RAPIDS:
#       COMPUTE: True
#       IMPUTE_SELECTED_EVENT_FEATURES: # Fill NAs with 0 only for event-based features, see table below
#         COMPUTE: True
#         MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33 # Any feature value in a time segment instance with phone data yield > [MIN_DATA_YIELDED_MINUTES_TO_IMPUTE] will be replaced with a zero.
#       COLS_NAN_THRESHOLD: 0.3 # Discard columns with missing value ratios higher than [COLS_NAN_THRESHOLD]. Set to 1 to disable
#       COLS_VAR_THRESHOLD: True # Set to True to discard columns with zero variance
#       ROWS_NAN_THRESHOLD: 1 # Discard rows with missing value ratios higher than [ROWS_NAN_THRESHOLD]. Set to 1 to disable
#       DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
#       DATA_YIELD_RATIO_THRESHOLD: 0.3 # Discard rows with ratiovalidyieldedhours or ratiovalidyieldedminutes feature less than [DATA_YIELD_RATIO_THRESHOLD]. The feature name is determined by [DATA_YIELD_FEATURE] parameter. Set to 0 to disable
#       DROP_HIGHLY_CORRELATED_FEATURES:
#         COMPUTE: False
#         MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
#         CORR_THRESHOLD: 0.95
#       SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R
# ```
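# %% [markdown]
# As a rough pandas illustration of the two NaN thresholds above (a sketch under assumed
# semantics, not the RAPIDS implementation): columns with more than 30 % missing values are
# discarded, while a row threshold of 1 keeps all rows.
# %% jupyter={"source_hidden": true}
# Hypothetical toy data; column names are placeholders.
df_toy = pd.DataFrame({"f1": [1.0, np.nan, np.nan, np.nan], "f2": [1.0, 2.0, 3.0, np.nan]})
cols_nan_threshold = 0.3
rows_nan_threshold = 1
df_toy = df_toy.loc[:, df_toy.isna().mean() <= cols_nan_threshold]  # drops f1 (75 % NaN)
df_toy = df_toy.loc[df_toy.isna().mean(axis=1) <= rows_nan_threshold]  # keeps all rows
df_toy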
# %% jupyter={"source_hidden": true}
features_rapids_cleaned = pd.read_csv(
    here(
        "rapids/data/processed/features/all_participants/all_sensor_features_cleaned_rapids.csv"
    ),
    parse_dates=["local_segment_start_datetime", "local_segment_end_datetime"],
)
features_rapids_cleaned = features_rapids_cleaned.assign(
    date_lj=lambda x: x.local_segment_start_datetime.dt.date
)
features_rapids_cleaned["participant_id"] = features_rapids_cleaned["pid"].str.extract(
    r"(\d+)"
)
features_rapids_cleaned["participant_id"] = pd.to_numeric(
    features_rapids_cleaned["participant_id"]
)
features_rapids_cleaned.set_index(["participant_id", "date_lj"], inplace=True)
# %% jupyter={"source_hidden": true}
features_cleaned_labels = features_rapids_cleaned.join(
    labels_read, how="inner"
).reset_index()
feature_clean_columns = features_cleaned_labels.columns[6:-3]
# %% jupyter={"source_hidden": true}
print(feature_columns.shape)
print(feature_clean_columns.shape)
# %% tags=[]
sns.set(rc={"figure.figsize": (16, 8)})
sns.heatmap(features_cleaned_labels[feature_clean_columns].isna(), cbar=False)
# %% jupyter={"source_hidden": true}
lin_reg_rapids_clean = linear_model.LinearRegression()
logo = LeaveOneGroupOut()
logo.get_n_splits(
    features_cleaned_labels[feature_clean_columns],
    features_cleaned_labels[label_column],
    groups=features_cleaned_labels[group_column],
)
# %% jupyter={"source_hidden": true}
features_clean_imputed = imputer.fit_transform(
    features_cleaned_labels[feature_clean_columns]
)
# %% jupyter={"source_hidden": true}
cross_val_score(
    lin_reg_rapids_clean,
    X=features_clean_imputed,
    y=features_cleaned_labels[label_column],
    groups=features_cleaned_labels[group_column],
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% jupyter={"source_hidden": true}
lin_reg_full = linear_model.LinearRegression()
lin_reg_full.fit(features_clean_imputed, features_cleaned_labels[label_column])
# %% jupyter={"source_hidden": true}
NA_pred = lin_reg_full.predict(features_clean_imputed)
# %% jupyter={"source_hidden": true}
# The coefficients
print("Coefficients: \n", lin_reg_full.coef_)
# The mean squared error
print(
    "Mean squared error: %.2f"
    % mean_squared_error(features_cleaned_labels[label_column], NA_pred)
)
# The coefficient of determination: 1 is perfect prediction
print(
    "Coefficient of determination: %.2f"
    % r2_score(features_cleaned_labels[label_column], NA_pred)
)
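# %% [markdown]
# For reference, the coefficient of determination reported above is
# $R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2}$:
# 1 means perfect prediction and 0 is no better than always predicting the mean.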
# %% jupyter={"source_hidden": true}
feature_clean_columns[np.argmax(lin_reg_full.coef_)]
# %% [markdown]
# Ratio between stationary time and total location sensed time. A lat/long coordinate pair is labeled as stationary if its speed (distance/time) to the next coordinate pair is less than 1km/hr. A higher value represents a more stationary routine.
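# %% [markdown]
# As a hedged illustration of how such a ratio could be computed (synthetic coordinates and
# simplified logic, not the RAPIDS implementation):
# %% jupyter={"source_hidden": true}
# Toy example: label a coordinate pair as stationary if its speed to the next pair is < 1 km/h.
coords = pd.DataFrame(
    {
        "lat": [46.05, 46.05, 46.06, 46.10],
        "lon": [14.50, 14.50, 14.51, 14.55],
        "time": pd.to_datetime(
            ["2020-08-01 10:00", "2020-08-01 11:00", "2020-08-01 12:00", "2020-08-01 12:30"]
        ),
    }
)
lat1, lat2 = np.radians(coords.lat[:-1].values), np.radians(coords.lat[1:].values)
dlon = np.radians(coords.lon[1:].values) - np.radians(coords.lon[:-1].values)
a = np.sin((lat2 - lat1) / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
dist_km = 2 * 6371 * np.arcsin(np.sqrt(a))  # haversine distance between consecutive pairs
hours = coords.time.diff().dt.total_seconds().values[1:] / 3600
stationary = (dist_km / hours) < 1
print("stationary time ratio:", hours[stationary].sum() / hours.sum())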
# %% jupyter={"source_hidden": true}
plt.scatter(
    features_clean_imputed[:, np.argmax(lin_reg_full.coef_)],
    features_cleaned_labels[label_column],
    color="black",
)
plt.scatter(
    features_clean_imputed[:, np.argmax(lin_reg_full.coef_)],
    NA_pred,
    color="red",
    linewidth=3,
)
plt.xticks()
plt.yticks()
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()
# %% [markdown]
# # RAPIDS models
# %% [markdown]
# ## PANAS negative affect
# %% jupyter={"source_hidden": true}
model_input = pd.read_csv(
    "../rapids/data/processed/models/population_model/input_PANAS_NA.csv"
)
# %% jupyter={"source_hidden": true}
model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)
# %% jupyter={"source_hidden": true}
index_columns = [
    "local_segment",
    "local_segment_label",
    "local_segment_start_datetime",
    "local_segment_end_datetime",
]
# if "pid" in model_input.columns:
#     index_columns.append("pid")
model_input.set_index(index_columns, inplace=True)
data_x, data_y, data_groups = (
    model_input.drop(["target", "pid"], axis=1),
    model_input["target"],
    model_input["pid"],
)
# %% jupyter={"source_hidden": true}
categorical_feature_colnames = ["gender", "startlanguage"]
# %% jupyter={"source_hidden": true}
categorical_features = data_x[categorical_feature_colnames].copy()
# %% jupyter={"source_hidden": true}
mode_categorical_features = categorical_features.mode().iloc[0]
# %% jupyter={"source_hidden": true}
# fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features)
# %% jupyter={"source_hidden": true}
# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)
# %% jupyter={"source_hidden": true}
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
# %% jupyter={"source_hidden": true}
train_x = pd.concat([numerical_features, categorical_features], axis=1)
# %% jupyter={"source_hidden": true}
train_x.dtypes
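# %% [markdown]
# The mode imputation and one-hot encoding above are repeated verbatim for every target
# below. As a sketch (an assumed refactoring, not part of the original analysis), the same
# preprocessing could be bundled into a scikit-learn ColumnTransformer and chained with any
# of the models evaluated below:
# %% jupyter={"source_hidden": true}
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

# Hypothetical equivalent of the manual preprocessing steps above.
preprocessor = ColumnTransformer(
    [
        (
            "categorical",
            make_pipeline(
                SimpleImputer(strategy="most_frequent"),
                OneHotEncoder(handle_unknown="ignore"),
            ),
            categorical_feature_colnames,
        ),
        (
            "numerical",
            SimpleImputer(strategy="mean"),
            numerical_features.columns.tolist(),
        ),
    ]
)
# Example usage (not run here):
# pipe = make_pipeline(preprocessor, linear_model.LinearRegression())
# cross_val_score(pipe, X=data_x, y=data_y, groups=data_groups, cv=logo, scoring="r2")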
# %% jupyter={"source_hidden": true}
logo = LeaveOneGroupOut()
logo.get_n_splits(
    train_x,
    data_y,
    groups=data_groups,
)
# %% jupyter={"source_hidden": true}
sum(data_y.isna())
# %% [markdown]
# ### Linear Regression
# %% jupyter={"source_hidden": true}
lin_reg_rapids = linear_model.LinearRegression()
# %% jupyter={"source_hidden": true}
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# %% jupyter={"source_hidden": true}
lin_reg_scores = cross_val_score(
    lin_reg_rapids,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
np.median(lin_reg_scores)
# %% [markdown]
# ### Ridge regression
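#
# Ridge regression adds an L2 penalty to ordinary least squares, minimizing
# $\|y - Xw\|_2^2 + \alpha \|w\|_2^2$; `alpha` below sets the penalty strength.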
# %% jupyter={"source_hidden": true}
ridge_reg = linear_model.Ridge(alpha=0.5)
# %% tags=[] jupyter={"source_hidden": true}
cross_val_score(
    ridge_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% [markdown]
# ### Lasso
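#
# Lasso uses an L1 penalty instead, minimizing
# $\frac{1}{2 n_{\text{samples}}} \|y - Xw\|_2^2 + \alpha \|w\|_1$,
# which drives some coefficients exactly to zero.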
# %% jupyter={"source_hidden": true}
lasso_reg = linear_model.Lasso(alpha=0.1)
# %% jupyter={"source_hidden": true}
cross_val_score(
    lasso_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% [markdown]
# ### Bayesian Ridge
# %% jupyter={"source_hidden": true}
bayesian_ridge_reg = linear_model.BayesianRidge()
# %% jupyter={"source_hidden": true}
cross_val_score(
    bayesian_ridge_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% [markdown]
# ### RANSAC (outlier robust regression)
# %% jupyter={"source_hidden": true}
ransac_reg = linear_model.RANSACRegressor()
# %% jupyter={"source_hidden": true}
np.median(
    cross_val_score(
        ransac_reg,
        X=imputer.fit_transform(train_x),
        y=data_y,
        groups=data_groups,
        cv=logo,
        n_jobs=-1,
        scoring="r2",
    )
)
# %% [markdown]
# ### Support vector regression
# %% jupyter={"source_hidden": true}
svr = svm.SVR()
# %% jupyter={"source_hidden": true}
np.median(
    cross_val_score(
        svr,
        X=imputer.fit_transform(train_x),
        y=data_y,
        groups=data_groups,
        cv=logo,
        n_jobs=-1,
        scoring="r2",
    )
)
# %% jupyter={"source_hidden": true}
cross_val_score(
    svr,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% [markdown]
# ### Kernel Ridge regression
# %% jupyter={"source_hidden": true}
kridge = kernel_ridge.KernelRidge()
# %% jupyter={"source_hidden": true}
cross_val_score(
    kridge,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% [markdown]
# ### Gaussian Process Regression
# %% jupyter={"source_hidden": true}
gpr = gaussian_process.GaussianProcessRegressor()
# %% jupyter={"source_hidden": true}
cross_val_score(
    gpr,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% [markdown]
# ## PANAS positive affect
# %% jupyter={"source_hidden": true}
model_input = pd.read_csv(
    "../rapids/data/processed/models/population_model/input_PANAS_PA.csv"
)
# %% jupyter={"source_hidden": true}
model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)
# %% jupyter={"source_hidden": true}
index_columns = [
    "local_segment",
    "local_segment_label",
    "local_segment_start_datetime",
    "local_segment_end_datetime",
]
# if "pid" in model_input.columns:
#     index_columns.append("pid")
model_input.set_index(index_columns, inplace=True)
data_x, data_y, data_groups = (
    model_input.drop(["target", "pid"], axis=1),
    model_input["target"],
    model_input["pid"],
)
# %% jupyter={"source_hidden": true}
categorical_feature_colnames = ["gender", "startlanguage"]
# %% jupyter={"source_hidden": true}
categorical_features = data_x[categorical_feature_colnames].copy()
# %% jupyter={"source_hidden": true}
mode_categorical_features = categorical_features.mode().iloc[0]
# %% jupyter={"source_hidden": true}
# fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features)
# %% jupyter={"source_hidden": true}
# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)
# %% jupyter={"source_hidden": true}
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
# %% jupyter={"source_hidden": true}
train_x = pd.concat([numerical_features, categorical_features], axis=1)
# %% jupyter={"source_hidden": true}
train_x.dtypes
# %% jupyter={"source_hidden": true}
logo = LeaveOneGroupOut()
logo.get_n_splits(
    train_x,
    data_y,
    groups=data_groups,
)
# %% jupyter={"source_hidden": true}
sum(data_y.isna())
# %% [markdown]
# ### Linear Regression
# %% jupyter={"source_hidden": true}
lin_reg_rapids = linear_model.LinearRegression()
# %% jupyter={"source_hidden": true}
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# %% jupyter={"source_hidden": true}
lin_reg_scores = cross_val_score(
    lin_reg_rapids,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
np.median(lin_reg_scores)
# %% [markdown]
# ### Ridge regression
# %% jupyter={"source_hidden": true}
ridge_reg = linear_model.Ridge(alpha=0.5)
# %% tags=[] jupyter={"source_hidden": true}
cross_val_score(
    ridge_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% [markdown]
# ### Lasso
# %% jupyter={"source_hidden": true}
lasso_reg = linear_model.Lasso(alpha=0.1)
# %% jupyter={"source_hidden": true}
cross_val_score(
    lasso_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% [markdown]
# ### Bayesian Ridge
# %% jupyter={"source_hidden": true}
bayesian_ridge_reg = linear_model.BayesianRidge()
# %% jupyter={"source_hidden": true}
cross_val_score(
    bayesian_ridge_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% [markdown]
# ### RANSAC (outlier robust regression)
# %% jupyter={"source_hidden": true}
ransac_reg = linear_model.RANSACRegressor()
# %% jupyter={"source_hidden": true}
np.median(
    cross_val_score(
        ransac_reg,
        X=imputer.fit_transform(train_x),
        y=data_y,
        groups=data_groups,
        cv=logo,
        n_jobs=-1,
        scoring="r2",
    )
)
# %% [markdown]
# ### Support vector regression
# %% jupyter={"source_hidden": true}
svr = svm.SVR()
# %% jupyter={"source_hidden": true}
np.median(
    cross_val_score(
        svr,
        X=imputer.fit_transform(train_x),
        y=data_y,
        groups=data_groups,
        cv=logo,
        n_jobs=-1,
        scoring="r2",
    )
)
# %% jupyter={"source_hidden": true}
cross_val_score(
    svr,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% [markdown]
# ### Kernel Ridge regression
# %% jupyter={"source_hidden": true}
kridge = kernel_ridge.KernelRidge()
# %% jupyter={"source_hidden": true}
cross_val_score(
    kridge,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% [markdown]
# ### Gaussian Process Regression
# %% jupyter={"source_hidden": true}
gpr = gaussian_process.GaussianProcessRegressor()
# %% jupyter={"source_hidden": true}
cross_val_score(
    gpr,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% [markdown]
# ## JCQ demand
# %% jupyter={"source_hidden": true}
model_input = pd.read_csv(
    "../rapids/data/processed/models/population_model/input_JCQ_demand.csv"
)
# %% jupyter={"source_hidden": true}
model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)
# %% jupyter={"source_hidden": true}
index_columns = [
    "local_segment",
    "local_segment_label",
    "local_segment_start_datetime",
    "local_segment_end_datetime",
]
# if "pid" in model_input.columns:
#     index_columns.append("pid")
model_input.set_index(index_columns, inplace=True)
data_x, data_y, data_groups = (
    model_input.drop(["target", "pid"], axis=1),
    model_input["target"],
    model_input["pid"],
)
# %% jupyter={"source_hidden": true}
categorical_feature_colnames = ["gender", "startlanguage"]
# %% jupyter={"source_hidden": true}
categorical_features = data_x[categorical_feature_colnames].copy()
# %% jupyter={"source_hidden": true}
mode_categorical_features = categorical_features.mode().iloc[0]
# %% jupyter={"source_hidden": true}
# fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features)
# %% jupyter={"source_hidden": true}
# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)
# %% jupyter={"source_hidden": true}
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
# %% jupyter={"source_hidden": true}
train_x = pd.concat([numerical_features, categorical_features], axis=1)
# %% jupyter={"source_hidden": true}
train_x.dtypes
# %% jupyter={"source_hidden": true}
logo = LeaveOneGroupOut()
logo.get_n_splits(
    train_x,
    data_y,
    groups=data_groups,
)
# %% jupyter={"source_hidden": true}
sum(data_y.isna())
# %% [markdown]
# ### Linear Regression
# %% jupyter={"source_hidden": true}
lin_reg_rapids = linear_model.LinearRegression()
# %% jupyter={"source_hidden": true}
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# %% jupyter={"source_hidden": true}
lin_reg_scores = cross_val_score(
    lin_reg_rapids,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
np.median(lin_reg_scores)
# %% jupyter={"source_hidden": true}
lin_reg_scores
# %% [markdown]
# ### Ridge regression
# %% jupyter={"source_hidden": true}
ridge_reg = linear_model.Ridge(alpha=0.5)
# %% tags=[] jupyter={"source_hidden": true}
cross_val_score(
    ridge_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% [markdown]
# ### Lasso
# %% jupyter={"source_hidden": true}
lasso_reg = linear_model.Lasso(alpha=0.1)
# %% jupyter={"source_hidden": true}
cross_val_score(
    lasso_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% [markdown]
# ### Bayesian Ridge
# %% jupyter={"source_hidden": true}
bayesian_ridge_reg = linear_model.BayesianRidge()
# %% jupyter={"source_hidden": true}
cross_val_score(
    bayesian_ridge_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% [markdown]
# ### RANSAC (outlier robust regression)
# %% jupyter={"source_hidden": true}
ransac_reg = linear_model.RANSACRegressor()
# %% jupyter={"source_hidden": true}
np.median(
    cross_val_score(
        ransac_reg,
        X=imputer.fit_transform(train_x),
        y=data_y,
        groups=data_groups,
        cv=logo,
        n_jobs=-1,
        scoring="r2",
    )
)
# %% [markdown]
# ### Support vector regression
# %% jupyter={"source_hidden": true}
svr = svm.SVR()
# %% jupyter={"source_hidden": true}
np.median(
    cross_val_score(
        svr,
        X=imputer.fit_transform(train_x),
        y=data_y,
        groups=data_groups,
        cv=logo,
        n_jobs=-1,
        scoring="r2",
    )
)
# %% jupyter={"source_hidden": true}
cross_val_score(
    svr,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% [markdown]
# ### Kernel Ridge regression
# %% jupyter={"source_hidden": true}
kridge = kernel_ridge.KernelRidge()
# %% jupyter={"source_hidden": true}
cross_val_score(
    kridge,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% [markdown]
# ### Gaussian Process Regression
# %% jupyter={"source_hidden": true}
gpr = gaussian_process.GaussianProcessRegressor()
# %% jupyter={"source_hidden": true}
cross_val_score(
    gpr,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% [markdown]
# ## JCQ control
# %% jupyter={"source_hidden": true}
model_input = pd.read_csv(
    "../rapids/data/processed/models/population_model/input_JCQ_control.csv"
)
# %% jupyter={"source_hidden": true}
model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)
# %% jupyter={"source_hidden": true}
index_columns = [
    "local_segment",
    "local_segment_label",
    "local_segment_start_datetime",
    "local_segment_end_datetime",
]
# if "pid" in model_input.columns:
#     index_columns.append("pid")
model_input.set_index(index_columns, inplace=True)
data_x, data_y, data_groups = (
    model_input.drop(["target", "pid"], axis=1),
    model_input["target"],
    model_input["pid"],
)
# %% jupyter={"source_hidden": true}
categorical_feature_colnames = ["gender", "startlanguage"]
# %% jupyter={"source_hidden": true}
categorical_features = data_x[categorical_feature_colnames].copy()
# %% jupyter={"source_hidden": true}
mode_categorical_features = categorical_features.mode().iloc[0]
# %% jupyter={"source_hidden": true}
# fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features)
# %% jupyter={"source_hidden": true}
# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)
# %% jupyter={"source_hidden": true}
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
# %% jupyter={"source_hidden": true}
train_x = pd.concat([numerical_features, categorical_features], axis=1)
# %% jupyter={"source_hidden": true}
train_x.dtypes
# %% jupyter={"source_hidden": true}
logo = LeaveOneGroupOut()
logo.get_n_splits(
    train_x,
    data_y,
    groups=data_groups,
)
# %% jupyter={"source_hidden": true}
sum(data_y.isna())
# %% [markdown]
# ### Linear Regression
# %% jupyter={"source_hidden": true}
lin_reg_rapids = linear_model.LinearRegression()
# %% jupyter={"source_hidden": true}
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# %% jupyter={"source_hidden": true}
lin_reg_scores = cross_val_score(
    lin_reg_rapids,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
np.median(lin_reg_scores)
# %% [markdown]
# ### Ridge regression
# %% jupyter={"source_hidden": true}
ridge_reg = linear_model.Ridge(alpha=0.5)
# %% tags=[] jupyter={"source_hidden": true}
cross_val_score(
    ridge_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% [markdown]
# ### Lasso
# %% jupyter={"source_hidden": true}
lasso_reg = linear_model.Lasso(alpha=0.1)
# %% jupyter={"source_hidden": true}
cross_val_score(
    lasso_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% [markdown]
# ### Bayesian Ridge
# %% jupyter={"source_hidden": true}
bayesian_ridge_reg = linear_model.BayesianRidge()
# %% jupyter={"source_hidden": true}
cross_val_score(
    bayesian_ridge_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% [markdown]
# ### RANSAC (outlier robust regression)
# %% jupyter={"source_hidden": true}
ransac_reg = linear_model.RANSACRegressor()
# %% jupyter={"source_hidden": true}
np.median(
    cross_val_score(
        ransac_reg,
        X=imputer.fit_transform(train_x),
        y=data_y,
        groups=data_groups,
        cv=logo,
        n_jobs=-1,
        scoring="r2",
    )
)
# %% [markdown]
# ### Support vector regression
# %% jupyter={"source_hidden": true}
svr = svm.SVR()
# %% jupyter={"source_hidden": true}
np.median(
    cross_val_score(
        svr,
        X=imputer.fit_transform(train_x),
        y=data_y,
        groups=data_groups,
        cv=logo,
        n_jobs=-1,
        scoring="r2",
    )
)
# %% jupyter={"source_hidden": true}
cross_val_score(
    svr,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% [markdown] tags=[]
# ### Kernel Ridge regression
# %% jupyter={"source_hidden": true}
kridge = kernel_ridge.KernelRidge()
# %% jupyter={"source_hidden": true}
cross_val_score(
    kridge,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% [markdown]
# ### Gaussian Process Regression
# %% jupyter={"source_hidden": true}
gpr = gaussian_process.GaussianProcessRegressor()
# %% jupyter={"source_hidden": true} tags=[]
cross_val_score(
    gpr,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)
# %% jupyter={"source_hidden": true}