Save calculated features to csv files.

rapids
junos 2021-08-23 16:36:26 +02:00
parent 0152fbe4ac
commit c1bb4ddf0f
5 changed files with 61 additions and 115 deletions

View File

@@ -15,6 +15,7 @@ dependencies:
 - psycopg2
 - python-dotenv
 - pytz
+- pyprojroot
 - pyyaml
 - seaborn
 - scikit-learn
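The new pyprojroot dependency is what the export code below uses to build paths relative to the repository root rather than the current working directory. A minimal sketch of the call made in this commit (the printed path is only illustrative):

from pyprojroot import here

# here() walks up from the current location to the project root (detected by
# markers such as .git) and joins the given relative path, returning a pathlib.Path.
# With warn=True it warns if the resulting path does not exist yet.
features_dir = here("machine_learning/intermediate_results/features", warn=True)
print(features_dir)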

View File

@@ -169,6 +169,9 @@ print(sensor_features_params)
 sensor_features = pipeline.SensorFeatures(**sensor_features_params)
 sensor_features.data_types

+# %%
+sensor_features.set_participants_label("nokia_0000003")
+
 # %%
 sensor_features.data_types = ["proximity", "communication"]
 sensor_features.participants_usernames = ptcp_2
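With the label set above and the grouping variable changed to date_lj (see the config diff below), export file names follow the %participants_label_%grouping_variable_%data_type.csv pattern described by the new WARNING_PARTICIPANTS_LABEL constant. A small illustration:

# Illustration of the export naming scheme introduced in this commit.
participants_label = "nokia_0000003"
grouping_variable = "date_lj"  # from the updated features config
for data_type in ("prox", "comm"):
    print(participants_label + "_" + grouping_variable + "_" + data_type + ".csv")
# nokia_0000003_date_lj_prox.csv
# nokia_0000003_date_lj_comm.csv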

View File

@@ -7,7 +7,7 @@ from setup import db_engine, session

 FILL_NA_PROXIMITY = {
     "freq_prox_near": 0,
-    "prop_prox_near": 1/2  # Of the form of a / (a + b).
+    "prop_prox_near": 1 / 2,  # Of the form of a / (a + b).
 }
 FEATURES_PROXIMITY = list(FILL_NA_PROXIMITY.keys())
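Besides the formatting cleanup (trailing comma, spaces around the division), this hunk documents the intent of the fill values: a missing prop_prox_near is a proportion of the form a / (a + b), so 1/2 is the uninformative midpoint, while a missing count defaults to 0. A short sketch of how these defaults are applied, mirroring the fillna calls later in this commit (the example frame is hypothetical):

import pandas as pd

FILL_NA_PROXIMITY = {
    "freq_prox_near": 0,
    "prop_prox_near": 1 / 2,  # a / (a + b); 1/2 marks "no information"
}

# Hypothetical frame with one participant-day that has no proximity rows.
df = pd.DataFrame({"freq_prox_near": [3, None], "prop_prox_near": [0.8, None]})
df.fillna(value=FILL_NA_PROXIMITY, downcast="infer", inplace=True)
print(df)  # the missing row becomes freq_prox_near=0, prop_prox_near=0.5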

View File

@@ -1,4 +1,4 @@
-grouping_variable: [date_lj]
+grouping_variable: date_lj
 features:
   proximity:
     all
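grouping_variable is now a plain string rather than a one-element list; the updated SensorFeatures.__init__ below wraps it back into a list internally while keeping the bare name for the filename prefix. A hedged sketch of loading such a config (the file name here is hypothetical; the constructor call mirrors the exploration snippet above):

import yaml

# Hypothetical path; the actual config file name is not shown in this diff.
with open("machine_learning/config/features_sensor.yaml") as file:
    sensor_features_params = yaml.safe_load(file)
print(sensor_features_params)
# e.g. {'grouping_variable': 'date_lj', 'features': {'proximity': 'all'}}

# As in the exploration script above:
# sensor_features = pipeline.SensorFeatures(**sensor_features_params)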

View File

@@ -1,9 +1,12 @@
 import datetime
+import warnings
 from collections.abc import Collection
+from pathlib import Path

 import numpy as np
 import pandas as pd
 import yaml
+from pyprojroot import here
 from sklearn import linear_model
 from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
@@ -11,22 +14,32 @@ import participants.query_db
 from features import communication, esm, helper, proximity
 from machine_learning import QUESTIONNAIRE_IDS, QUESTIONNAIRE_IDS_RENAME

+WARNING_PARTICIPANTS_LABEL = (
+    "Before calculating features, please set participants label using self.set_participants_label() "
+    "to be used as a filename prefix when exporting data. "
+    "The filename will be of the form: %participants_label_%grouping_variable_%data_type.csv"
+)
+

 class SensorFeatures:
     def __init__(
         self,
-        grouping_variable: list,
+        grouping_variable: str,
         features: dict,
         participants_usernames: Collection = None,
     ):
-        self.grouping_variable = grouping_variable
+        self.grouping_variable_name = grouping_variable
+        self.grouping_variable = [grouping_variable]
+
         self.data_types = features.keys()

+        self.participants_label: str = ""
         if participants_usernames is None:
             participants_usernames = participants.query_db.get_usernames(
                 collection_start=datetime.date.fromisoformat("2020-08-01")
             )
+            self.participants_label = "all"
         self.participants_usernames = participants_usernames

         self.df_features_all = pd.DataFrame()
@@ -37,6 +50,10 @@ class SensorFeatures:
         self.df_calls = pd.DataFrame()
         self.df_sms = pd.DataFrame()
         self.df_calls_sms = pd.DataFrame()
+        self.folder = None
+        self.filename_prefix = ""
+        self.construct_export_path()
+
         print("SensorFeatures initialized.")

     def set_sensor_data(self):
@@ -67,6 +84,8 @@
     def calculate_features(self):
         print("Calculating features ...")
+        if not self.participants_label:
+            raise ValueError(WARNING_PARTICIPANTS_LABEL)
         if "proximity" in self.data_types:
             self.df_proximity_counts = proximity.count_proximity(
                 self.df_proximity, self.grouping_variable
@@ -75,6 +94,9 @@
                 self.df_features_all, self.df_proximity_counts
             )
             print("Calculated proximity features.")
+            to_csv_with_settings(
+                self.df_proximity, self.folder, self.filename_prefix, data_type="prox"
+            )

         if "communication" in self.data_types:
             self.df_calls_sms = communication.calls_sms_features(
@@ -86,16 +108,15 @@
                 self.df_features_all, self.df_calls_sms
             )
             print("Calculated communication features.")
+            to_csv_with_settings(
+                self.df_calls_sms, self.folder, self.filename_prefix, data_type="comm"
+            )

         self.df_features_all.fillna(
-            value=proximity.FILL_NA_PROXIMITY,
-            inplace=True,
-            downcast="infer",
+            value=proximity.FILL_NA_PROXIMITY, inplace=True, downcast="infer",
         )
         self.df_features_all.fillna(
-            value=communication.FILL_NA_CALLS_SMS_ALL,
-            inplace=True,
-            downcast="infer",
+            value=communication.FILL_NA_CALLS_SMS_ALL, inplace=True, downcast="infer",
         )

     def get_features(self, data_type, feature_names) -> pd.DataFrame:
@@ -112,6 +133,18 @@
         else:
             raise KeyError("This data type has not been implemented.")

+    def construct_export_path(self):
+        if not self.participants_label:
+            warnings.warn(WARNING_PARTICIPANTS_LABEL, UserWarning)
+        self.folder = here("machine_learning/intermediate_results/features", warn=True)
+        self.filename_prefix = (
+            self.participants_label + "_" + self.grouping_variable_name
+        )
+
+    def set_participants_label(self, label: str):
+        self.participants_label = label
+        self.construct_export_path()
+

 class Labels:
     def __init__(
@@ -252,111 +285,20 @@ def safe_outer_merge_on_index(left, right):
         )


-class MachineLearningPipeline:
-    def __init__(
-        self,
-        labels_questionnaire,
-        labels_scale,
-        data_types,
-        participants_usernames=None,
-        feature_names=None,
-        grouping_variable=None,
-    ):
-        if participants_usernames is None:
-            participants_usernames = participants.query_db.get_usernames(
-                collection_start=datetime.date.fromisoformat("2020-08-01")
-            )
-        self.participants_usernames = participants_usernames
-        self.labels_questionnaire = labels_questionnaire
-        self.data_types = data_types
-        if feature_names is None:
-            self.feature_names = []
-        self.df_features = pd.DataFrame()
-        self.labels_scale = labels_scale
-        self.df_labels = pd.DataFrame()
-        self.grouping_variable = grouping_variable
-        self.df_groups = pd.DataFrame()
-
-        self.model = None
-        self.validation_method = None
-
-        self.df_esm = pd.DataFrame()
-        self.df_esm_preprocessed = pd.DataFrame()
-        self.df_esm_interest = pd.DataFrame()
-        self.df_esm_clean = pd.DataFrame()
-
-        self.df_full_data_daily_means = pd.DataFrame()
-        self.df_esm_daily_means = pd.DataFrame()
-        self.df_proximity_daily_counts = pd.DataFrame()
-
-    # def get_labels(self):
-    #     self.df_esm = esm.get_esm_data(self.participants_usernames)
-    #     self.df_esm_preprocessed = esm.preprocess_esm(self.df_esm)
-    #     if self.labels_questionnaire == "PANAS":
-    #         self.df_esm_interest = self.df_esm_preprocessed[
-    #             (
-    #                 self.df_esm_preprocessed["questionnaire_id"]
-    #                 == QUESTIONNAIRE_IDS.get("PANAS").get("PA")
-    #             )
-    #             | (
-    #                 self.df_esm_preprocessed["questionnaire_id"]
-    #                 == QUESTIONNAIRE_IDS.get("PANAS").get("NA")
-    #             )
-    #         ]
-    #     self.df_esm_clean = esm.clean_up_esm(self.df_esm_interest)
-
-    # def aggregate_daily(self):
-    #     self.df_esm_daily_means = (
-    #         self.df_esm_clean.groupby(["participant_id", "date_lj", "questionnaire_id"])
-    #         .esm_user_answer_numeric.agg("mean")
-    #         .reset_index()
-    #         .rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"})
-    #     )
-    #     self.df_esm_daily_means = (
-    #         self.df_esm_daily_means.pivot(
-    #             index=["participant_id", "date_lj"],
-    #             columns="questionnaire_id",
-    #             values="esm_numeric_mean",
-    #         )
-    #         .reset_index(col_level=1)
-    #         .rename(columns=QUESTIONNAIRE_IDS_RENAME)
-    #         .set_index(["participant_id", "date_lj"])
-    #     )
-    #     self.df_full_data_daily_means = self.df_esm_daily_means.copy()
-
-    #     if "proximity" in self.data_types:
-    #         self.df_proximity_daily_counts = proximity.count_proximity(
-    #             self.df_proximity, ["participant_id", "date_lj"]
-    #         )
-    #         self.df_full_data_daily_means = self.df_full_data_daily_means.join(
-    #             self.df_proximity_daily_counts
-    #         )
-
-    def assign_columns(self):
-        self.df_features = self.df_full_data_daily_means[self.feature_names]
-        self.df_labels = self.df_full_data_daily_means[self.labels_scale]
-        if self.grouping_variable:
-            self.df_groups = self.df_full_data_daily_means[self.grouping_variable]
-        else:
-            self.df_groups = None
-
-    def validate_model(self):
-        if self.model is None:
-            raise AttributeError(
-                "Please, specify a machine learning model first, by setting the .model attribute."
-            )
-        if self.validation_method is None:
-            raise AttributeError(
-                "Please, specify a cross validation method first, by setting the .validation_method attribute."
-            )
-        cross_val_score(
-            estimator=self.model,
-            X=self.df_features,
-            y=self.df_labels,
-            groups=self.df_groups,
-            cv=self.validation_method,
-            n_jobs=-1,
-        )
+def to_csv_with_settings(
+    df: pd.DataFrame, folder: Path, filename_prefix: str, data_type: str
+) -> None:
+    export_filename = filename_prefix + "_" + data_type + ".csv"
+    full_path = folder / export_filename
+    df.to_csv(
+        path_or_buf=full_path,
+        sep=",",
+        na_rep="NA",
+        header=True,
+        index=False,
+        encoding="utf-8",
+    )
+    print("Exported the dataframe to " + str(full_path))


 if __name__ == "__main__":
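Taken together, these changes mean features are now exported to CSV as part of calculate_features(). A sketch of the intended flow, assuming SensorFeatures has been imported from the module changed above (imported as pipeline in the exploration script) and that both data types appear in the features config; everything else follows the new API shown in this diff:

# Illustrative only: config values mirror the YAML hunk above.
sensor_features = SensorFeatures(
    grouping_variable="date_lj",
    features={"proximity": "all", "communication": "all"},
    participants_usernames=["nokia_0000003"],
)
sensor_features.set_participants_label("nokia_0000003")  # required, otherwise calculate_features() raises ValueError
sensor_features.set_sensor_data()
sensor_features.calculate_features()
# Exports, relative to the project root:
#   machine_learning/intermediate_results/features/nokia_0000003_date_lj_prox.csv
#   machine_learning/intermediate_results/features/nokia_0000003_date_lj_comm.csv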