diff --git a/exploration/ex_ml_pipeline.py b/exploration/ex_ml_pipeline.py index 3f5c85f..cd3c293 100644 --- a/exploration/ex_ml_pipeline.py +++ b/exploration/ex_ml_pipeline.py @@ -61,6 +61,9 @@ df_proximity = proximity.get_proximity_data(ptcp_2) df_proximity = helper.get_date_from_timestamp(df_proximity) df_proximity = proximity.recode_proximity(df_proximity) +# %% [markdown] +# ## 1.3 Standardization/personalization + # %% [markdown] # # 2. Grouping/segmentation @@ -71,3 +74,19 @@ df_esm_PANAS_daily_means = ( .reset_index() .rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"}) ) + +# %% +df_proximity_daily_counts = proximity.count_proximity( + df_proximity, ["participant_id", "date_lj"] +) + +# %% +df_proximity_daily_counts + +# %% [markdown] +# # 3. Join features (and export to csv?) + +# %% [markdown] +# # 4. Machine learning model and parameters + +# %% diff --git a/features/proximity.py b/features/proximity.py index c50eda4..a1e4f9a 100644 --- a/features/proximity.py +++ b/features/proximity.py @@ -55,7 +55,9 @@ def recode_proximity(df_proximity: pd.DataFrame) -> pd.DataFrame: return df_proximity -def count_proximity(df_proximity: pd.DataFrame) -> pd.DataFrame: +def count_proximity( + df_proximity: pd.DataFrame, group_by: Collection = ["participant_id"] +) -> pd.DataFrame: """ The function counts how many times a "near" value occurs in proximity and calculates the proportion of this counts to all proximity values (i.e. relative count). @@ -64,6 +66,9 @@ def count_proximity(df_proximity: pd.DataFrame) -> pd.DataFrame: ---------- df_proximity: pd.DataFrame A dataframe of proximity data. + group_by: Collection + A list of strings, specifying by which parameters to group. + By default, the features are calculated per participant, but could be "date_lj" etc. Returns ------- @@ -73,7 +78,7 @@ def count_proximity(df_proximity: pd.DataFrame) -> pd.DataFrame: if "bool_prox_near" not in df_proximity: df_proximity = recode_proximity(df_proximity) df_proximity["bool_prox_far"] = ~df_proximity["bool_prox_near"] - df_proximity_features = df_proximity.groupby("participant_id").sum()[ + df_proximity_features = df_proximity.groupby(group_by).sum()[ ["bool_prox_near", "bool_prox_far"] ] df_proximity_features = df_proximity_features.assign(