Enable reading features from CSV files.

rapids
junos 2021-09-14 17:42:34 +02:00
parent af9e81fe40
commit 28699a0fdf
2 changed files with 102 additions and 33 deletions
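
In short, calculate_features() now takes a cached flag: it first tries to read previously exported features from CSV and only recomputes (and re-exports) them on a FileNotFoundError, with the merge into df_features_all happening in finally either way. A minimal standalone sketch of the same idiom, using hypothetical names (compute_features, get_or_compute, freq_prox) that are not part of this commit:

from pathlib import Path

import pandas as pd


def compute_features() -> pd.DataFrame:
    # Stand-in for an expensive step such as proximity.count_proximity().
    index = pd.Index(["p01", "p02"], name="participant_id")
    return pd.DataFrame({"freq_prox": [2, 5]}, index=index)


def get_or_compute(folder: Path, prefix: str, data_type: str, cached: bool = True) -> pd.DataFrame:
    full_path = folder / (prefix + "_" + data_type + ".csv")
    try:
        if not cached:  # Do not use the file, even if it exists.
            raise FileNotFoundError
        df = pd.read_csv(full_path, index_col="participant_id", na_values="NA")
        print("Read features from the file.")
    except FileNotFoundError:
        # Recalculate the features and cache them for the next call.
        df = compute_features()
        df.to_csv(full_path, na_rep="NA", index=True)
        print("Calculated features.")
    return df


df_calculated = get_or_compute(Path("."), "demo", "prox", cached=False)  # Calculates.
df_read = get_or_compute(Path("."), "demo", "prox")  # Reads the cached file.

Raising FileNotFoundError to signal "skip the cache" routes both the missing-file case and the cached=False case through a single except branch, which is why the commit uses try/except/finally rather than an if/else.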

View File

@@ -20,6 +20,8 @@ import importlib
 import os
 import sys
+import numpy as np
+import pandas as pd
 import seaborn as sns
 import yaml
 from sklearn import linear_model
@@ -37,7 +39,7 @@ import machine_learning.model
 import participants.query_db
 from features import esm, helper, proximity

-# %% [markdown]
+# %% [markdown] tags=[]
 # # 1. Get the relevant data

 # %%
@@ -47,7 +49,7 @@ participants_inactive_usernames = participants.query_db.get_usernames(
 # Consider only two participants to simplify.
 ptcp_2 = participants_inactive_usernames[0:2]

-# %% [markdown]
+# %% [markdown] jp-MarkdownHeadingCollapsed=true tags=[]
 # ## 1.1 Labels

 # %%
@@ -98,7 +100,7 @@ df_esm_PANAS_daily_means = (
 # %%
 df_proximity_daily_counts = proximity.count_proximity(
-    df_proximity, ["participant_id", "date_lj"]
+    df_proximity, ["date_lj"]
 )

 # %%
@@ -159,10 +161,10 @@ lin_reg_proximity.score(
 # # Merging these into a pipeline

 # %%
-from machine_learning import pipeline
+from machine_learning import features_sensor, labels, model, pipeline

 # %%
-importlib.reload(pipeline)
+importlib.reload(features_sensor)

 # %%
 with open("../machine_learning/config/minimal_features.yaml", "r") as file:
@@ -192,10 +194,22 @@ sensor_features.set_sensor_data()
 sensor_features.get_sensor_data("proximity")

 # %%
-sensor_features.calculate_features()
+sensor_features.calculate_features(cached=False)
+features_all_calculated = sensor_features.get_features("all", "all")

 # %%
-sensor_features.get_features("all", "all")
+sensor_features.calculate_features(cached=True)
+features_all_read = sensor_features.get_features("all", "all")
+
+# %%
+features_all_read = features_all_read.reset_index()
+features_all_read["date_lj"] = features_all_read["date_lj"].dt.date
+features_all_read.set_index(["participant_id", "date_lj"], inplace=True)
+# date_lj column is parsed as a date and represented as Timestamp, when read from csv.
+# When calculated, it is represented as date.
+
+# %%
+np.isclose(features_all_read, features_all_calculated).all()

 # %%
 with open("../machine_learning/config/minimal_labels.yaml", "r") as file:

View File

@@ -128,35 +128,69 @@ class SensorFeatures:
         else:
             raise KeyError("This data type has not been implemented.")

-    def calculate_features(self):
+    def calculate_features(self, cached=True):
         print("Calculating features ...")
         if not self.participants_label:
             raise ValueError(WARNING_PARTICIPANTS_LABEL)
+        self.df_features_all = pd.DataFrame()

         if "proximity" in self.data_types:
-            self.df_proximity_counts = proximity.count_proximity(
-                self.df_proximity, self.grouping_variable
-            )
-            self.df_features_all = safe_outer_merge_on_index(
-                self.df_features_all, self.df_proximity_counts
-            )
-            print("Calculated proximity features.")
-            to_csv_with_settings(
-                self.df_proximity, self.folder, self.filename_prefix, data_type="prox"
-            )
+            try:
+                if not cached:  # Do not use the file, even if it exists.
+                    raise FileNotFoundError
+                self.df_proximity_counts = read_csv_with_settings(
+                    self.folder,
+                    self.filename_prefix,
+                    data_type="prox",
+                    grouping_variable=self.grouping_variable,
+                )
+                print("Read proximity features from the file.")
+            except FileNotFoundError:
+                # We need to recalculate the features in this case.
+                self.df_proximity_counts = proximity.count_proximity(
+                    self.df_proximity, self.grouping_variable
+                )
+                print("Calculated proximity features.")
+                to_csv_with_settings(
+                    self.df_proximity_counts,
+                    self.folder,
+                    self.filename_prefix,
+                    data_type="prox",
+                )
+            finally:
+                self.df_features_all = safe_outer_merge_on_index(
+                    self.df_features_all, self.df_proximity_counts
+                )

         if "communication" in self.data_types:
-            self.df_calls_sms = communication.calls_sms_features(
-                df_calls=self.df_calls,
-                df_sms=self.df_sms,
-                group_by=self.grouping_variable,
-            )
-            self.df_features_all = safe_outer_merge_on_index(
-                self.df_features_all, self.df_calls_sms
-            )
-            print("Calculated communication features.")
-            to_csv_with_settings(
-                self.df_calls_sms, self.folder, self.filename_prefix, data_type="comm"
-            )
+            try:
+                if not cached:  # Do not use the file, even if it exists.
+                    raise FileNotFoundError
+                self.df_calls_sms = read_csv_with_settings(
+                    self.folder,
+                    self.filename_prefix,
+                    data_type="comm",
+                    grouping_variable=self.grouping_variable,
+                )
+                print("Read communication features from the file.")
+            except FileNotFoundError:
+                # We need to recalculate the features in this case.
+                self.df_calls_sms = communication.calls_sms_features(
+                    df_calls=self.df_calls,
+                    df_sms=self.df_sms,
+                    group_by=self.grouping_variable,
+                )
+                print("Calculated communication features.")
+                to_csv_with_settings(
+                    self.df_calls_sms,
+                    self.folder,
+                    self.filename_prefix,
+                    data_type="comm",
+                )
+            finally:
+                self.df_features_all = safe_outer_merge_on_index(
+                    self.df_features_all, self.df_calls_sms
+                )

         self.df_features_all.fillna(
             value=proximity.FILL_NA_PROXIMITY, inplace=True, downcast="infer",
@@ -211,14 +245,35 @@ def safe_outer_merge_on_index(left, right):
 def to_csv_with_settings(
     df: pd.DataFrame, folder: Path, filename_prefix: str, data_type: str
 ) -> None:
-    export_filename = filename_prefix + "_" + data_type + ".csv"
-    full_path = folder / export_filename
+    full_path = construct_full_path(folder, filename_prefix, data_type)
     df.to_csv(
         path_or_buf=full_path,
         sep=",",
         na_rep="NA",
         header=True,
-        index=False,
+        index=True,
         encoding="utf-8",
     )
     print("Exported the dataframe to " + str(full_path))
+
+
+def read_csv_with_settings(
+    folder: Path, filename_prefix: str, data_type: str, grouping_variable: list
+) -> pd.DataFrame:
+    full_path = construct_full_path(folder, filename_prefix, data_type)
+    return pd.read_csv(
+        filepath_or_buffer=full_path,
+        sep=",",
+        header=0,
+        na_values="NA",
+        encoding="utf-8",
+        index_col=(["participant_id"] + grouping_variable),
+        parse_dates=True,
+        infer_datetime_format=True,
+    )
+
+
+def construct_full_path(folder: Path, filename_prefix: str, data_type: str) -> Path:
+    export_filename = filename_prefix + "_" + data_type + ".csv"
+    full_path = folder / export_filename
+    return full_path