From 4db8810d08ef7a673609939f09612e5fd641af1c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcel=20Martin=C5=A1ek?=
Date: Fri, 31 Mar 2023 13:08:15 +0000
Subject: [PATCH] corrected esm_features index column

---
 config.yaml                                | 31 ++++----
 .../phone_esm/straw/esm_activities.py      | 10 ++-
 src/features/phone_esm/straw/main.py       | 38 ++++++----
 src/features/phone_esm/straw/temp_help.py  | 70 +++++++++++++++++++
 4 files changed, 118 insertions(+), 31 deletions(-)
 create mode 100644 src/features/phone_esm/straw/temp_help.py

diff --git a/config.yaml b/config.yaml
index e9d522a5..827f962b 100644
--- a/config.yaml
+++ b/config.yaml
@@ -219,21 +219,22 @@ PHONE_CONVERSATION: # TODO Adapt for speech
 
 # See https://www.rapids.science/latest/features/phone-data-yield/
 PHONE_DATA_YIELD:
-  SENSORS: [#PHONE_ACCELEROMETER,
-            PHONE_ACTIVITY_RECOGNITION,
-            PHONE_APPLICATIONS_FOREGROUND,
-            PHONE_APPLICATIONS_NOTIFICATIONS,
-            PHONE_BATTERY,
-            PHONE_BLUETOOTH,
-            PHONE_CALLS,
-            PHONE_LIGHT,
-            PHONE_LOCATIONS,
-            PHONE_MESSAGES,
-            PHONE_SCREEN,
-            PHONE_WIFI_VISIBLE]
+  SENSORS: [ #PHONE_ACCELEROMETER,
+            #PHONE_ACTIVITY_RECOGNITION,
+            #PHONE_APPLICATIONS_FOREGROUND,
+            #PHONE_APPLICATIONS_NOTIFICATIONS,
+            #PHONE_BATTERY,
+            PHONE_BLUETOOTH #,
+            #PHONE_CALLS,
+            #PHONE_LIGHT,
+            #PHONE_LOCATIONS,
+            #PHONE_MESSAGES,
+            #PHONE_SCREEN,
+            #PHONE_WIFI_VISIBLE
+            ]
   PROVIDERS:
     RAPIDS:
-      COMPUTE: False
+      COMPUTE: True
       FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours]
       MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1, minimum percentage of valid minutes in an hour to be considered valid.
       SRC_SCRIPT: src/features/phone_data_yield/rapids/main.R
@@ -662,9 +663,9 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
 ALL_CLEANING_INDIVIDUAL:
   PROVIDERS:
     RAPIDS:
-      COMPUTE: False
+      COMPUTE: True
       IMPUTE_SELECTED_EVENT_FEATURES:
-        COMPUTE: False
+        COMPUTE: True
         MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
       COLS_NAN_THRESHOLD: 1 # set to 1 to disable
       COLS_VAR_THRESHOLD: True
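
A note on the SENSORS hunk above: commenting out entries inside a YAML flow sequence is easy to break, and the trailing "#," after PHONE_BLUETOOTH is what keeps the now one-element list valid. A minimal sketch (assuming PyYAML is available; the fragment is an abbreviated copy of the hunk) to check the parse:

    # Verify that the commented-out flow list still parses and that only
    # PHONE_BLUETOOTH remains active.
    import yaml

    fragment = """
    PHONE_DATA_YIELD:
      SENSORS: [ #PHONE_ACCELEROMETER,
                #PHONE_BATTERY,
                PHONE_BLUETOOTH #,
                #PHONE_WIFI_VISIBLE
                ]
    """

    parsed = yaml.safe_load(fragment)
    assert parsed["PHONE_DATA_YIELD"]["SENSORS"] == ["PHONE_BLUETOOTH"]
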
diff --git a/src/features/phone_esm/straw/esm_activities.py b/src/features/phone_esm/straw/esm_activities.py
index 38f166bd..16b6f72e 100644
--- a/src/features/phone_esm/straw/esm_activities.py
+++ b/src/features/phone_esm/straw/esm_activities.py
@@ -198,7 +198,7 @@ def correct_activity_qids(df:pd.DataFrame)->pd.DataFrame:
 
-def process_answers_aggregation(df:pd.DataFrame)-> pd.DataFrame:
+def process_answers_aggregation(df:pd.core.groupby.generic.DataFrameGroupBy)-> pd.core.groupby.generic.DataFrameGroupBy:
     """
     Function to process answer sequences for LTM question chains.
     It checks the chain of subquestion answers and extracts the following attributes:
        > n_others: Number of other people interacted with in the last 10 minutes
            - -1: Number is positive but unknown exactly
@@ -219,6 +219,8 @@ def process_answers_aggregation(df:pd.DataFrame)-> pd.DataFrame:
     Returns:
         pd.DataFrame: the group's rows with the extracted n_others, inperson and formal attributes joined as new columns
     """
+
+    #print("=======================\nAPPLY START:\ndf=",df.columns,df.local_segment)
 
     properties = {"n_others":[], "inperson":[], "formal":[]}
 
@@ -251,8 +253,10 @@ def process_answers_aggregation(df:pd.DataFrame)-> pd.DataFrame:
 
         properties["formal"].append(formal)
 
-    #df = df.join(pd.DataFrame(properties,index=df.index))
-    return pd.DataFrame(properties,index=df.index)
+    df = df.join(pd.DataFrame(properties,index=df.index))
+    #print("APPLY END:\ndf=",df[["n_others","inperson","formal"]])
+
+    return df
 
 
 
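
The switch from returning pd.DataFrame(properties, index=df.index) to returning df.join(...) changes what groupby(...).apply(process_answers_aggregation) yields at the call site in main.py: the joined result keeps each group's original columns, including local_segment, next to the extracted attributes. A toy sketch of the difference, with hypothetical data and a stand-in aggregation (not the real ESM answer-chain logic):

    import pandas as pd

    df = pd.DataFrame({"local_segment": ["seg_a", "seg_a", "seg_b"],
                       "esm_user_answer": ["2", "Yes", "0"]})

    def new_cols_only(group):
        # Old behaviour: the original columns are dropped from the result.
        return pd.DataFrame({"n_others": [1] * len(group)}, index=group.index)

    def joined(group):
        # New behaviour: original columns survive next to the new attributes.
        return group.join(pd.DataFrame({"n_others": [1] * len(group)}, index=group.index))

    print(df.groupby("local_segment").apply(new_cols_only).columns.tolist())
    # ['n_others']
    print(df.groupby("local_segment").apply(joined).columns.tolist())
    # ['local_segment', 'esm_user_answer', 'n_others']
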
diff --git a/src/features/phone_esm/straw/main.py b/src/features/phone_esm/straw/main.py
index 6729137a..943958ae 100644
--- a/src/features/phone_esm/straw/main.py
+++ b/src/features/phone_esm/straw/main.py
@@ -54,39 +54,39 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_seg
     features_to_compute = list(set(requested_features) & set(base_features_names))
     esm_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
     if not esm_data.empty:
-        # print(esm_data.head())
-        # print(time_segment)
         esm_data = filter_data_by_segment(esm_data, time_segment)
         if not esm_data.empty:
             esm_features = pd.DataFrame()
             for scale in requested_scales:
                 questionnaire_id = QUESTIONNAIRE_IDS[scale]
                 mask = esm_data["questionnaire_id"] == questionnaire_id
+                #print(esm_data.loc[mask].head())
+                #print(time_segment)
                 if not mask.any():
                     temp = sensor_data_files["sensor_data"]
-                    warnings.warn(f"Warning........... No relevant questions for scale {scale} in {temp}",RuntimeWarning)
+                    warnings.warn(f"Warning........... No relevant questions for scale {scale} in {temp}-{time_segment}",RuntimeWarning)
                     continue
                 #TODO: calculation of LTM features
                 if scale=="activities":
-                    requested_subset = [req[len("activities_"):] for req in requested_features if req.startswith("activities")]
+                    requested_subset = [req for req in requested_features if req.startswith("activities")]
                     if not bool(requested_subset):
                         continue
                     # ltm_features = esm_activities_LTM_features(esm_data.loc[mask])
                     # print(esm_data["esm_json"].values)
                     # print(mask)
                     # print(esm_data.loc[mask])
-                    # print(ltm_features)
                     # #ltm_features = ltm_features[ltm_features["correct_ids"==44]]
-                    print(esm_data["local_segment"])
-                    if(type(esm_data["local_segment"].values[0]) != str):
-                        raise Exception("wrong dtype of local_segment")
+                    #print(esm_data.loc[mask]["local_segment"])
                     ltm_features = esm_data.loc[mask].groupby(["local_segment"]).apply(process_answers_aggregation)
-                    print(ltm_features)
-                    esm_features[["activities_"+req for req in requested_subset]] = ltm_features[requested_subset]
+                    #print("PRINTING ltm_features:\n",ltm_features)
+                    ltm_features.rename(columns={"n_others":"activities_n_others","inperson":"activities_inperson","formal":"activities_formal"},inplace=True)
+                    esm_features[requested_subset] = ltm_features.groupby("local_segment").first()[requested_subset]
+                    #print(esm_features.columns)
+                    #print("PRINTING esm_features after rename:\n",ltm_features)
                     #FIXME: it might be an issue that I'm calculating for the whole time segment and not grouping by "local_segment"
-                    continue
-
-                esm_features[scale + "_mean"] = esm_data.loc[mask].groupby(["local_segment"])["esm_user_score"].mean()
+                    #print("~~~~~~~~~~~~~~~~~~~~~~~~===============================~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n LTM FEATURES STORED... AFTER RETURN:\n",ltm_features,esm_features[["activities_"+req for req in requested_subset]])
+                if("mean" in features_to_compute):
+                    esm_features[scale + "_mean"] = esm_data.loc[mask].groupby(["local_segment"])["esm_user_score"].mean() #TODO Create the column esm_user_score in esm_clean. Currently, this is only done when reversing.
 
     esm_features = esm_features.reset_index()
@@ -94,3 +94,15 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_seg
     esm_features.rename(columns={'index': 'local_segment'}, inplace=True)
 
     return esm_features
+
+def test_main():
+    import temp_help
+    provider = {
+        "FEATURES":["mean","activities_n_others","activities_inperson","activities_formal"],
+        "SCALES":['activities']
+    }
+    sensor_data_files = {"sensor_data":"data/interim/p069/phone_esm_clean.csv"}
+    s_feat = straw_features(sensor_data_files,"straw_event_stress_event_p069_110",provider,temp_help.filter_data_by_segment)
+    print(s_feat)
+
+#test_main()
\ No newline at end of file
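
The corrected indexing relies on ltm_features carrying one row per answered question while esm_features needs one row per local_segment; renaming to the activities_* feature names and then taking groupby("local_segment").first() collapses the duplicates. A sketch with stand-in values for what process_answers_aggregation would produce:

    import pandas as pd

    # Stand-in for groupby("local_segment").apply(process_answers_aggregation):
    # several answer rows per segment, each already carrying the attributes.
    ltm_features = pd.DataFrame({"local_segment": ["seg_a", "seg_a", "seg_b"],
                                 "n_others": [2, 2, 0],
                                 "inperson": [1, 1, 0],
                                 "formal": [0, 0, 0]})

    ltm_features.rename(columns={"n_others": "activities_n_others",
                                 "inperson": "activities_inperson",
                                 "formal": "activities_formal"}, inplace=True)

    requested_subset = ["activities_n_others", "activities_inperson", "activities_formal"]

    # One row per local_segment, ready to be assigned into esm_features.
    print(ltm_features.groupby("local_segment").first()[requested_subset])
    #                activities_n_others  activities_inperson  activities_formal
    # local_segment
    # seg_a                            2                    1                  0
    # seg_b                            0                    0                  0
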
diff --git a/src/features/phone_esm/straw/temp_help.py b/src/features/phone_esm/straw/temp_help.py
new file mode 100644
index 00000000..bc7b1ebc
--- /dev/null
+++ b/src/features/phone_esm/straw/temp_help.py
@@ -0,0 +1,70 @@
+"""This file is TEMPORARY and intended for testing main.py
+"""
+
+def filter_data_by_segment(data, time_segment):
+    data.dropna(subset=["assigned_segments"], inplace=True)
+    if(data.shape[0] == 0): # data is empty
+        data["local_segment"] = data["timestamps_segment"] = None
+        return data
+
+    datetime_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
+    timestamps_regex = "[0-9]{13}"
+    segment_regex = "\[({}#{},{};{},{})\]".format(time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex)
+    data["local_segment"] = data["assigned_segments"].str.extract(segment_regex, expand=True)
+    data = data.drop(columns=["assigned_segments"])
+    data = data.dropna(subset = ["local_segment"])
+    if(data.shape[0] == 0): # there are no rows belonging to time_segment after dropping NA
+        data["timestamps_segment"] = None
+    else:
+        data[["local_segment","timestamps_segment"]] = data["local_segment"].str.split(pat =";",n=1, expand=True)
+
+    # chunk episodes
+    if (not data.empty) and ("start_timestamp" in data.columns) and ("end_timestamp" in data.columns):
+        data = chunk_episodes(data)
+
+    return data
+
+def chunk_episodes(sensor_episodes):
+    import copy
+    import pandas as pd
+
+    # Deduplicate episodes
+    # Drop rows where segments of start_timestamp and end_timestamp are the same
+    sensor_episodes = sensor_episodes.drop_duplicates(subset=["start_timestamp", "end_timestamp", "local_segment"], keep="first")
+
+    # Delete useless columns
+    for drop_col in ["local_date_time", "local_date", "local_time", "local_hour", "local_minute"]:
+        del sensor_episodes[drop_col]
+
+    # Avoid SettingWithCopyWarning
+    sensor_episodes = sensor_episodes.copy()
+
+    # Unix timestamp for current segment in milliseconds
+    sensor_episodes[["segment_start_timestamp", "segment_end_timestamp"]] = sensor_episodes["timestamps_segment"].str.split(",", expand=True).astype(int)
+
+    # Compute chunked timestamp
+    sensor_episodes["chunked_start_timestamp"] = sensor_episodes[["start_timestamp", "segment_start_timestamp"]].max(axis=1)
+    sensor_episodes["chunked_end_timestamp"] = sensor_episodes[["end_timestamp", "segment_end_timestamp"]].min(axis=1)
+
+    # Compute duration: intersection of current row and segment
+    sensor_episodes["duration"] = (sensor_episodes["chunked_end_timestamp"] - sensor_episodes["chunked_start_timestamp"]) / (1000 * 60)
+
+    # Merge episodes
+    cols_for_groupby = [col for col in sensor_episodes.columns if col not in ["timestamps_segment", "timestamp", "assigned_segments", "start_datetime", "end_datetime", "start_timestamp", "end_timestamp", "duration", "chunked_start_timestamp", "chunked_end_timestamp"]]
+
+    sensor_episodes_grouped = sensor_episodes.groupby(by=cols_for_groupby, sort=False, dropna=False)
+    merged_sensor_episodes = sensor_episodes_grouped[["duration"]].sum()
+
+    merged_sensor_episodes["start_timestamp"] = sensor_episodes_grouped["chunked_start_timestamp"].first()
+    merged_sensor_episodes["end_timestamp"] = sensor_episodes_grouped["chunked_end_timestamp"].last()
+
+    merged_sensor_episodes.reset_index(inplace=True)
+
+    # Compute datetime
+    merged_sensor_episodes["local_start_date_time"] = pd.to_datetime(merged_sensor_episodes["start_timestamp"], unit="ms", utc=True)
+    merged_sensor_episodes["local_start_date_time"] = pd.concat([data["local_start_date_time"].dt.tz_convert(tz) for tz, data in merged_sensor_episodes.groupby("local_timezone")]).apply(lambda x: x.tz_localize(None).replace(microsecond=0))
+
+    merged_sensor_episodes["local_end_date_time"] = pd.to_datetime(merged_sensor_episodes["end_timestamp"], unit="ms", utc=True)
+    merged_sensor_episodes["local_end_date_time"] = pd.concat([data["local_end_date_time"].dt.tz_convert(tz) for tz, data in merged_sensor_episodes.groupby("local_timezone")]).apply(lambda x: x.tz_localize(None).replace(microsecond=0))
+
+    return merged_sensor_episodes
\ No newline at end of file
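
For context, the segment_regex built in filter_data_by_segment expects assigned_segments entries of the form [label#start_datetime,end_datetime;start_ts,end_ts], with the timestamps as 13-digit Unix milliseconds. A small demonstration with a made-up row (the label matches the segment used in test_main):

    import pandas as pd

    time_segment = "straw_event_stress_event_p069_110"
    datetime_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
    timestamps_regex = "[0-9]{13}"
    segment_regex = "\[({}#{},{};{},{})\]".format(time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex)

    # Hypothetical assigned_segments value for one sensor row.
    data = pd.DataFrame({"assigned_segments": [
        "[straw_event_stress_event_p069_110#2021-05-21 09:00:00,2021-05-21 10:00:00;1621587600000,1621591200000]"]})

    local_segment = data["assigned_segments"].str.extract(segment_regex, expand=True)
    print(local_segment[0].str.split(pat=";", n=1, expand=True))
    # column 0: "straw_event_stress_event_p069_110#2021-05-21 09:00:00,2021-05-21 10:00:00"
    # column 1: "1621587600000,1621591200000" (the timestamps_segment consumed by chunk_episodes)
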