diff --git a/machine_learning/pipeline.py b/machine_learning/pipeline.py
index 5b57f53..572c7a5 100644
--- a/machine_learning/pipeline.py
+++ b/machine_learning/pipeline.py
@@ -8,6 +8,73 @@
 from features import esm, helper, proximity
 from machine_learning import QUESTIONNAIRE_IDS, QUESTIONNAIRE_IDS_RENAME
 
+class SensorFeatures:
+    """Load sensor data and derive features grouped by a chosen variable.
+
+    Only the "proximity" data type is handled so far.
+    """
+
+    def __init__(
+        self,
+        grouping_variable,
+        data_types,
+        feature_names=None,
+        participants_usernames=None,
+    ):
+        """Store the configuration and create empty result frames.
+
+        Args:
+            grouping_variable: Column name used, together with
+                "participant_id", as grouping key in calculate_features().
+            data_types: Collection of data type names to process.
+            feature_names: Optional feature names; defaults to an empty list.
+            participants_usernames: Optional usernames; when None they are
+                fetched via participants.query_db.get_usernames().
+        """
+        self.data_types = data_types
+        self.grouping_variable = grouping_variable
+
+        # Always set the attribute, not only when the argument is omitted;
+        # otherwise caller-supplied names are dropped and the attribute is missing.
+        self.feature_names = [] if feature_names is None else feature_names
+
+        if participants_usernames is None:
+            participants_usernames = participants.query_db.get_usernames(
+                collection_start=datetime.date.fromisoformat("2020-08-01")
+            )
+        self.participants_usernames = participants_usernames
+
+        self.df_proximity = pd.DataFrame()
+        self.df_proximity_counts = pd.DataFrame()
+
+    def set_sensor_data(self):
+        """Load proximity data for the stored usernames, add dates and recode it."""
+        if "proximity" in self.data_types:
+            self.df_proximity = proximity.get_proximity_data(
+                self.participants_usernames
+            )
+            self.df_proximity = helper.get_date_from_timestamp(self.df_proximity)
+            self.df_proximity = proximity.recode_proximity(self.df_proximity)
+
+    def get_sensor_data(self, data_type) -> pd.DataFrame:
+        """Return the proximity frame; data_type is currently ignored."""
+        # TODO implement the getter (Check if it has been set.)
+        return self.df_proximity
+
+    def calculate_features(self):
+        """Fill df_proximity_counts via proximity.count_proximity if requested."""
+        if "proximity" in self.data_types:
+            self.df_proximity_counts = proximity.count_proximity(
+                self.df_proximity, ["participant_id", self.grouping_variable]
+            )
+        # TODO Think about joining dataframes.
+
+    def get_features(self, data_type) -> pd.DataFrame:
+        """Return the proximity counts frame; data_type is currently ignored."""
+        # TODO implement the getter (Check if it has been set.)
+        return self.df_proximity_counts
+
+
 class MachineLearningPipeline:
     def __init__(
         self,
@@ -42,8 +109,6 @@ class MachineLearningPipeline:
         self.df_esm_interest = pd.DataFrame()
         self.df_esm_clean = pd.DataFrame()
 
-        self.df_proximity = pd.DataFrame()
-
         self.df_full_data_daily_means = pd.DataFrame()
         self.df_esm_daily_means = pd.DataFrame()
         self.df_proximity_daily_counts = pd.DataFrame()
@@ -64,39 +129,31 @@ class MachineLearningPipeline:
         ]
         self.df_esm_clean = esm.clean_up_esm(self.df_esm_interest)
 
-    def get_sensor_data(self):
-        if "proximity" in self.data_types:
-            self.df_proximity = proximity.get_proximity_data(
-                self.participants_usernames
-            )
-            self.df_proximity = helper.get_date_from_timestamp(self.df_proximity)
-            self.df_proximity = proximity.recode_proximity(self.df_proximity)
-
-    def aggregate_daily(self):
-        self.df_esm_daily_means = (
-            self.df_esm_clean.groupby(["participant_id", "date_lj", "questionnaire_id"])
-            .esm_user_answer_numeric.agg("mean")
-            .reset_index()
-            .rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"})
-        )
-        self.df_esm_daily_means = (
-            self.df_esm_daily_means.pivot(
-                index=["participant_id", "date_lj"],
-                columns="questionnaire_id",
-                values="esm_numeric_mean",
-            )
-            .reset_index(col_level=1)
-            .rename(columns=QUESTIONNAIRE_IDS_RENAME)
-            .set_index(["participant_id", "date_lj"])
-        )
-        self.df_full_data_daily_means = self.df_esm_daily_means.copy()
-        if "proximity" in self.data_types:
-            self.df_proximity_daily_counts = proximity.count_proximity(
-                self.df_proximity, ["participant_id", "date_lj"]
-            )
-            self.df_full_data_daily_means = self.df_full_data_daily_means.join(
-                self.df_proximity_daily_counts
-            )
+    # def aggregate_daily(self):
+    #     self.df_esm_daily_means = (
+    #         self.df_esm_clean.groupby(["participant_id", "date_lj", "questionnaire_id"])
+    #         .esm_user_answer_numeric.agg("mean")
+    #         .reset_index()
+    #         .rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"})
+    #     )
+    #     self.df_esm_daily_means = (
+    #         self.df_esm_daily_means.pivot(
+    #             index=["participant_id", "date_lj"],
+    #             columns="questionnaire_id",
+    #             values="esm_numeric_mean",
+    #         )
+    #         .reset_index(col_level=1)
+    #         .rename(columns=QUESTIONNAIRE_IDS_RENAME)
+    #         .set_index(["participant_id", "date_lj"])
+    #     )
+    #     self.df_full_data_daily_means = self.df_esm_daily_means.copy()
+    #     if "proximity" in self.data_types:
+    #         self.df_proximity_daily_counts = proximity.count_proximity(
+    #             self.df_proximity, ["participant_id", "date_lj"]
+    #         )
+    #         self.df_full_data_daily_means = self.df_full_data_daily_means.join(
+    #             self.df_proximity_daily_counts
+    #         )
 
     def assign_columns(self):
         self.df_features = self.df_full_data_daily_means[self.feature_names]