diff --git a/config/environment.yml b/config/environment.yml index a64d1e1..d0a0923 100644 --- a/config/environment.yml +++ b/config/environment.yml @@ -15,6 +15,7 @@ dependencies: - psycopg2 - python-dotenv - pytz + - pyprojroot - pyyaml - seaborn - scikit-learn diff --git a/exploration/ex_ml_pipeline.py b/exploration/ex_ml_pipeline.py index 45c687d..077c142 100644 --- a/exploration/ex_ml_pipeline.py +++ b/exploration/ex_ml_pipeline.py @@ -169,6 +169,9 @@ print(sensor_features_params) sensor_features = pipeline.SensorFeatures(**sensor_features_params) sensor_features.data_types +# %% +sensor_features.set_participants_label("nokia_0000003") + # %% sensor_features.data_types = ["proximity", "communication"] sensor_features.participants_usernames = ptcp_2 diff --git a/features/proximity.py b/features/proximity.py index 36c9569..dcf4cc3 100644 --- a/features/proximity.py +++ b/features/proximity.py @@ -7,7 +7,7 @@ from setup import db_engine, session FILL_NA_PROXIMITY = { "freq_prox_near": 0, - "prop_prox_near": 1/2 # Of the form of a / (a + b). + "prop_prox_near": 1 / 2, # Of the form of a / (a + b). } FEATURES_PROXIMITY = list(FILL_NA_PROXIMITY.keys()) diff --git a/machine_learning/config/minimal_features.yaml b/machine_learning/config/minimal_features.yaml index 027dd29..a015607 100644 --- a/machine_learning/config/minimal_features.yaml +++ b/machine_learning/config/minimal_features.yaml @@ -1,4 +1,4 @@ -grouping_variable: [date_lj] +grouping_variable: date_lj features: proximity: all diff --git a/machine_learning/pipeline.py b/machine_learning/pipeline.py index f1ecb44..eb7e992 100644 --- a/machine_learning/pipeline.py +++ b/machine_learning/pipeline.py @@ -1,9 +1,12 @@ import datetime +import warnings from collections.abc import Collection +from pathlib import Path import numpy as np import pandas as pd import yaml +from pyprojroot import here from sklearn import linear_model from sklearn.model_selection import LeaveOneGroupOut, cross_val_score @@ -11,22 +14,32 @@ import participants.query_db from features import communication, esm, helper, proximity from machine_learning import QUESTIONNAIRE_IDS, QUESTIONNAIRE_IDS_RENAME +WARNING_PARTICIPANTS_LABEL = ( + "Before calculating features, please set participants label using self.set_participants_label() " + "to be used as a filename prefix when exporting data. " + "The filename will be of the form: %participants_label_%grouping_variable_%data_type.csv" +) + class SensorFeatures: def __init__( self, - grouping_variable: list, + grouping_variable: str, features: dict, participants_usernames: Collection = None, ): - self.grouping_variable = grouping_variable + + self.grouping_variable_name = grouping_variable + self.grouping_variable = [grouping_variable] self.data_types = features.keys() + self.participants_label: str = "" if participants_usernames is None: participants_usernames = participants.query_db.get_usernames( collection_start=datetime.date.fromisoformat("2020-08-01") ) + self.participants_label = "all" self.participants_usernames = participants_usernames self.df_features_all = pd.DataFrame() @@ -37,6 +50,10 @@ class SensorFeatures: self.df_calls = pd.DataFrame() self.df_sms = pd.DataFrame() self.df_calls_sms = pd.DataFrame() + + self.folder = None + self.filename_prefix = "" + self.construct_export_path() print("SensorFeatures initialized.") def set_sensor_data(self): @@ -67,6 +84,8 @@ class SensorFeatures: def calculate_features(self): print("Calculating features ...") + if not self.participants_label: + raise ValueError(WARNING_PARTICIPANTS_LABEL) if "proximity" in self.data_types: self.df_proximity_counts = proximity.count_proximity( self.df_proximity, self.grouping_variable @@ -75,6 +94,9 @@ class SensorFeatures: self.df_features_all, self.df_proximity_counts ) print("Calculated proximity features.") + to_csv_with_settings( + self.df_proximity, self.folder, self.filename_prefix, data_type="prox" + ) if "communication" in self.data_types: self.df_calls_sms = communication.calls_sms_features( @@ -86,16 +108,15 @@ class SensorFeatures: self.df_features_all, self.df_calls_sms ) print("Calculated communication features.") + to_csv_with_settings( + self.df_calls_sms, self.folder, self.filename_prefix, data_type="comm" + ) self.df_features_all.fillna( - value=proximity.FILL_NA_PROXIMITY, - inplace=True, - downcast="infer", + value=proximity.FILL_NA_PROXIMITY, inplace=True, downcast="infer", ) self.df_features_all.fillna( - value=communication.FILL_NA_CALLS_SMS_ALL, - inplace=True, - downcast="infer", + value=communication.FILL_NA_CALLS_SMS_ALL, inplace=True, downcast="infer", ) def get_features(self, data_type, feature_names) -> pd.DataFrame: @@ -112,6 +133,18 @@ class SensorFeatures: else: raise KeyError("This data type has not been implemented.") + def construct_export_path(self): + if not self.participants_label: + warnings.warn(WARNING_PARTICIPANTS_LABEL, UserWarning) + self.folder = here("machine_learning/intermediate_results/features", warn=True) + self.filename_prefix = ( + self.participants_label + "_" + self.grouping_variable_name + ) + + def set_participants_label(self, label: str): + self.participants_label = label + self.construct_export_path() + class Labels: def __init__( @@ -252,111 +285,20 @@ def safe_outer_merge_on_index(left, right): ) -class MachineLearningPipeline: - def __init__( - self, - labels_questionnaire, - labels_scale, - data_types, - participants_usernames=None, - feature_names=None, - grouping_variable=None, - ): - if participants_usernames is None: - participants_usernames = participants.query_db.get_usernames( - collection_start=datetime.date.fromisoformat("2020-08-01") - ) - self.participants_usernames = participants_usernames - self.labels_questionnaire = labels_questionnaire - self.data_types = data_types - - if feature_names is None: - self.feature_names = [] - self.df_features = pd.DataFrame() - self.labels_scale = labels_scale - self.df_labels = pd.DataFrame() - self.grouping_variable = grouping_variable - self.df_groups = pd.DataFrame() - - self.model = None - self.validation_method = None - - self.df_esm = pd.DataFrame() - self.df_esm_preprocessed = pd.DataFrame() - self.df_esm_interest = pd.DataFrame() - self.df_esm_clean = pd.DataFrame() - - self.df_full_data_daily_means = pd.DataFrame() - self.df_esm_daily_means = pd.DataFrame() - self.df_proximity_daily_counts = pd.DataFrame() - - # def get_labels(self): - # self.df_esm = esm.get_esm_data(self.participants_usernames) - # self.df_esm_preprocessed = esm.preprocess_esm(self.df_esm) - # if self.labels_questionnaire == "PANAS": - # self.df_esm_interest = self.df_esm_preprocessed[ - # ( - # self.df_esm_preprocessed["questionnaire_id"] - # == QUESTIONNAIRE_IDS.get("PANAS").get("PA") - # ) - # | ( - # self.df_esm_preprocessed["questionnaire_id"] - # == QUESTIONNAIRE_IDS.get("PANAS").get("NA") - # ) - # ] - # self.df_esm_clean = esm.clean_up_esm(self.df_esm_interest) - - # def aggregate_daily(self): - # self.df_esm_daily_means = ( - # self.df_esm_clean.groupby(["participant_id", "date_lj", "questionnaire_id"]) - # .esm_user_answer_numeric.agg("mean") - # .reset_index() - # .rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"}) - # ) - # self.df_esm_daily_means = ( - # self.df_esm_daily_means.pivot( - # index=["participant_id", "date_lj"], - # columns="questionnaire_id", - # values="esm_numeric_mean", - # ) - # .reset_index(col_level=1) - # .rename(columns=QUESTIONNAIRE_IDS_RENAME) - # .set_index(["participant_id", "date_lj"]) - # ) - # self.df_full_data_daily_means = self.df_esm_daily_means.copy() - # if "proximity" in self.data_types: - # self.df_proximity_daily_counts = proximity.count_proximity( - # self.df_proximity, ["participant_id", "date_lj"] - # ) - # self.df_full_data_daily_means = self.df_full_data_daily_means.join( - # self.df_proximity_daily_counts - # ) - - def assign_columns(self): - self.df_features = self.df_full_data_daily_means[self.feature_names] - self.df_labels = self.df_full_data_daily_means[self.labels_scale] - if self.grouping_variable: - self.df_groups = self.df_full_data_daily_means[self.grouping_variable] - else: - self.df_groups = None - - def validate_model(self): - if self.model is None: - raise AttributeError( - "Please, specify a machine learning model first, by setting the .model attribute." - ) - if self.validation_method is None: - raise AttributeError( - "Please, specify a cross validation method first, by setting the .validation_method attribute." - ) - cross_val_score( - estimator=self.model, - X=self.df_features, - y=self.df_labels, - groups=self.df_groups, - cv=self.validation_method, - n_jobs=-1, - ) +def to_csv_with_settings( + df: pd.DataFrame, folder: Path, filename_prefix: str, data_type: str +) -> None: + export_filename = filename_prefix + "_" + data_type + ".csv" + full_path = folder / export_filename + df.to_csv( + path_or_buf=full_path, + sep=",", + na_rep="NA", + header=True, + index=False, + encoding="utf-8", + ) + print("Exported the dataframe to " + str(full_path)) if __name__ == "__main__":