corrected esm_features index column

sociality-task
Marcel Martinšek 2023-03-31 13:08:15 +00:00
parent e7bb9d6702
commit 4db8810d08
4 changed files with 118 additions and 31 deletions

View File

@@ -219,21 +219,22 @@ PHONE_CONVERSATION: # TODO Adapt for speech
# See https://www.rapids.science/latest/features/phone-data-yield/
PHONE_DATA_YIELD:
SENSORS: [#PHONE_ACCELEROMETER,
PHONE_ACTIVITY_RECOGNITION,
PHONE_APPLICATIONS_FOREGROUND,
PHONE_APPLICATIONS_NOTIFICATIONS,
PHONE_BATTERY,
PHONE_BLUETOOTH,
PHONE_CALLS,
PHONE_LIGHT,
PHONE_LOCATIONS,
PHONE_MESSAGES,
PHONE_SCREEN,
PHONE_WIFI_VISIBLE]
SENSORS: [ #PHONE_ACCELEROMETER,
#PHONE_ACTIVITY_RECOGNITION,
#PHONE_APPLICATIONS_FOREGROUND,
#PHONE_APPLICATIONS_NOTIFICATIONS,
#PHONE_BATTERY,
PHONE_BLUETOOTH #,
#PHONE_CALLS,
#PHONE_LIGHT,
#PHONE_LOCATIONS,
#PHONE_MESSAGES,
#PHONE_SCREEN,
#PHONE_WIFI_VISIBLE
]
PROVIDERS:
RAPIDS:
COMPUTE: False
COMPUTE: True
FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours]
MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1; minimum fraction of valid minutes for an hour to be considered valid.
SRC_SCRIPT: src/features/phone_data_yield/rapids/main.R
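Note: flipping COMPUTE to True enables the two yield features listed above. As a minimal sketch of what they measure, assuming a boolean minute_has_data series covering one time segment (the function name and input shape are illustrative, not the provider's actual internals):

import pandas as pd

def yield_ratios(minute_has_data: pd.Series, threshold: float = 0.5) -> dict:
    # minute_has_data: boolean, indexed by minute-resolution timestamps of one segment
    ratiovalidyieldedminutes = minute_has_data.mean()
    # An hour counts as valid when at least `threshold` of its minutes yielded data
    per_hour = minute_has_data.groupby(minute_has_data.index.floor("h")).mean()
    ratiovalidyieldedhours = (per_hour >= threshold).mean()
    return {"ratiovalidyieldedminutes": ratiovalidyieldedminutes,
            "ratiovalidyieldedhours": ratiovalidyieldedhours}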
@@ -662,9 +663,9 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
ALL_CLEANING_INDIVIDUAL:
PROVIDERS:
RAPIDS:
COMPUTE: False
COMPUTE: True
IMPUTE_SELECTED_EVENT_FEATURES:
COMPUTE: False
COMPUTE: True
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
COLS_NAN_THRESHOLD: 1 # set to 1 to disable
COLS_VAR_THRESHOLD: True

View File

@@ -198,7 +198,7 @@ def correct_activity_qids(df:pd.DataFrame)->pd.DataFrame:
def process_answers_aggregation(df:pd.DataFrame)-> pd.DataFrame:
def process_answers_aggregation(df:pd.core.groupby.generic.DataFrameGroupBy)-> pd.core.groupby.generic.DataFrameGroupBy:
""" Function to process answer sequences for LTM question chains. It checks the chain of subquestion answers and extracts the following attributes:
> n_others: Number of other people interacted with in the last 10 minutes
- -1: the number is positive but not exactly known
@@ -219,6 +219,8 @@ def process_answers_aggregation(df:pd.DataFrame)-> pd.DataFrame:
Returns:
pd.DataFrame: the input group with n_others, inperson and formal columns appended
"""
#print("=======================\nAPPLY START:\ndf=",df.columns,df.local_segment)
properties = {"n_others":[],
"inperson":[],
"formal":[]}
@@ -251,8 +253,10 @@ def process_answers_aggregation(df:pd.DataFrame)-> pd.DataFrame:
properties["formal"].append(formal)
#df = df.join(pd.DataFrame(properties,index=df.index))
return pd.DataFrame(properties,index=df.index)
df = df.join(pd.DataFrame(properties,index=df.index))
#print("APPLY END:\ndf=",df[["n_others","inperson","formal"]])
return df
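Note: returning df.join(pd.DataFrame(properties, index=df.index)) instead of the bare properties frame keeps every original column in the apply output; aligning on df.index attaches one derived value to each original row. A toy sketch of the pattern (data and column values made up):

import pandas as pd

def add_props(group: pd.DataFrame) -> pd.DataFrame:
    # One derived value per row; index alignment keeps values matched to the group's rows
    props = {"n_others": [len(group)] * len(group)}
    return group.join(pd.DataFrame(props, index=group.index))

answers = pd.DataFrame({"local_segment": ["s1", "s1", "s2"], "answer": [1, 2, 3]})
out = answers.groupby("local_segment").apply(add_props)

With the pandas behavior this pipeline relies on, local_segment survives as a column of the result, which is what lets main.py group ltm_features by it again downstream.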

View File

@@ -54,39 +54,39 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_seg
features_to_compute = list(set(requested_features) & set(base_features_names))
esm_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
if not esm_data.empty:
# print(esm_data.head())
# print(time_segment)
esm_data = filter_data_by_segment(esm_data, time_segment)
if not esm_data.empty:
esm_features = pd.DataFrame()
for scale in requested_scales:
questionnaire_id = QUESTIONNAIRE_IDS[scale]
mask = esm_data["questionnaire_id"] == questionnaire_id
#print(esm_data.loc[mask].head())
#print(time_segment)
if not mask.any():
temp = sensor_data_files["sensor_data"]
warnings.warn(f"Warning........... No relevant questions for scale {scale} in {temp}",RuntimeWarning)
warnings.warn(f"Warning........... No relevant questions for scale {scale} in {temp}-{time_segment}",RuntimeWarning)
continue
#TODO: calculation of LTM features
if scale=="activities":
requested_subset = [req[len("activities_"):] for req in requested_features if req.startswith("activities")]
requested_subset = [req for req in requested_features if req.startswith("activities")]
if not bool(requested_subset):
continue
# ltm_features = esm_activities_LTM_features(esm_data.loc[mask])
# print(esm_data["esm_json"].values)
# print(mask)
# print(esm_data.loc[mask])
# print(ltm_features)
# #ltm_features = ltm_features[ltm_features["correct_ids"==44]]
print(esm_data["local_segment"])
if(type(esm_data["local_segment"].values[0]) != str):
raise Exception("wrong dtype of local_segment")
#print(esm_data.loc[mask]["local_segment"])
ltm_features = esm_data.loc[mask].groupby(["local_segment"]).apply(process_answers_aggregation)
print(ltm_features)
esm_features[["activities_"+req for req in requested_subset]] = ltm_features[requested_subset]
#print("PRINTING ltm_features:\n",ltm_features)
ltm_features.rename(columns={"n_others":"activities_n_others","inperson":"activities_inperson","formal":"activities_formal"},inplace=True)
esm_features[requested_subset] = ltm_features.groupby("local_segment").first()[requested_subset]
#print(esm_features.columns)
#print("PRINTING esm_features after rename:\n",ltm_features)
#FIXME: it might be an issue that im calculating for whole time segment and not grouping by "local segment"
continue
esm_features[scale + "_mean"] = esm_data.loc[mask].groupby(["local_segment"])["esm_user_score"].mean()
#print("~~~~~~~~~~~~~~~~~~~~~~~~===============================~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n LTM FEATURES STORED... AFTER RETURN:\n",ltm_features,esm_features[["activities_"+req for req in requested_subset]])
if("mean" in features_to_compute):
esm_features[scale + "_mean"] = esm_data.loc[mask].groupby(["local_segment"])["esm_user_score"].mean()
#TODO Create the column esm_user_score in esm_clean. Currently, this is only done when reversing.
esm_features = esm_features.reset_index()
@@ -94,3 +94,15 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_seg
esm_features.rename(columns={'index': 'local_segment'}, inplace=True)
return esm_features
def test_main():
import temp_help
provider = {
"FEATURES":["mean","activities_n_others","activities_inperson","activities_formal"],
"SCALES":['activities']
}
sensor_data_files = {"sensor_data":"data/interim/p069/phone_esm_clean.csv"}
s_feat = straw_features(sensor_data_files,"straw_event_stress_event_p069_110",provider,temp_help.filter_data_by_segment)
print(s_feat)
#test_main()
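Note on the titular fix: esm_features is indexed by local_segment (the index the .mean() groupby produces), so the activities columns must also be collapsed to one row per local_segment before assignment, otherwise the indexes cannot align. A toy illustration of the alignment (values invented):

import pandas as pd

ltm_features = pd.DataFrame({"local_segment": ["s1", "s1", "s2"],
                             "activities_n_others": [2, 2, -1]})
esm_features = pd.DataFrame(index=["s1", "s2"])
# .first() yields one row per segment, indexed by local_segment,
# so the column assignment below aligns with esm_features' index
esm_features[["activities_n_others"]] = ltm_features.groupby("local_segment").first()[["activities_n_others"]]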

View File

@@ -0,0 +1,70 @@
"""This file is TEMPORARY and intended for testing main.py
"""
def filter_data_by_segment(data, time_segment):
data.dropna(subset=["assigned_segments"], inplace=True)
if(data.shape[0] == 0): # data is empty
data["local_segment"] = data["timestamps_segment"] = None
return data
# Raw strings so regex escapes like \- and \[ are not parsed as (invalid) Python string escapes
datetime_regex = r"[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
timestamps_regex = r"[0-9]{13}"
segment_regex = r"\[({}#{},{};{},{})\]".format(time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex)
data["local_segment"] = data["assigned_segments"].str.extract(segment_regex, expand=True)
data = data.drop(columns=["assigned_segments"])
data = data.dropna(subset = ["local_segment"])
if(data.shape[0] == 0): # there are no rows belonging to time_segment after dropping na
data["timestamps_segment"] = None
else:
data[["local_segment","timestamps_segment"]] = data["local_segment"].str.split(pat =";",n=1, expand=True)
# chunk episodes
if (not data.empty) and ("start_timestamp" in data.columns) and ("end_timestamp" in data.columns):
data = chunk_episodes(data)
return data
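A hypothetical row showing the assigned_segments format that segment_regex expects, [name#start_datetime,end_datetime;start_ts,end_ts] (the timestamps are invented for illustration):

import pandas as pd

row = pd.DataFrame({
    "assigned_segments": ["[straw_event_stress_event_p069_110#2023-03-01 10:00:00,2023-03-01 10:30:00;1677664800000,1677666600000]"],
    "esm_user_score": [3],
})
out = filter_data_by_segment(row, "straw_event_stress_event_p069_110")
# out["local_segment"]      -> "straw_event_stress_event_p069_110#2023-03-01 10:00:00,2023-03-01 10:30:00"
# out["timestamps_segment"] -> "1677664800000,1677666600000"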
def chunk_episodes(sensor_episodes):
import copy
import pandas as pd
# Deduplicate episodes
# Drop rows where segments of start_timestamp and end_timestamp are the same
sensor_episodes = sensor_episodes.drop_duplicates(subset=["start_timestamp", "end_timestamp", "local_segment"], keep="first")
# Delete useless columns
for drop_col in ["local_date_time", "local_date", "local_time", "local_hour", "local_minute"]:
del sensor_episodes[drop_col]
# Avoid SettingWithCopyWarning
sensor_episodes = sensor_episodes.copy()
# Unix timestamp for current segment in milliseconds
sensor_episodes[["segment_start_timestamp", "segment_end_timestamp"]] = sensor_episodes["timestamps_segment"].str.split(",", expand=True).astype(int)
# Compute chunked timestamp
sensor_episodes["chunked_start_timestamp"] = sensor_episodes[["start_timestamp", "segment_start_timestamp"]].max(axis=1)
sensor_episodes["chunked_end_timestamp"] = sensor_episodes[["end_timestamp", "segment_end_timestamp"]].min(axis=1)
# Compute duration: intersection of current row and segment
sensor_episodes["duration"] = (sensor_episodes["chunked_end_timestamp"] - sensor_episodes["chunked_start_timestamp"]) / (1000 * 60)
# Merge episodes
cols_for_groupby = [col for col in sensor_episodes.columns if col not in ["timestamps_segment", "timestamp", "assigned_segments", "start_datetime", "end_datetime", "start_timestamp", "end_timestamp", "duration", "chunked_start_timestamp", "chunked_end_timestamp"]]
sensor_episodes_grouped = sensor_episodes.groupby(by=cols_for_groupby, sort=False, dropna=False)
merged_sensor_episodes = sensor_episodes_grouped[["duration"]].sum()
merged_sensor_episodes["start_timestamp"] = sensor_episodes_grouped["chunked_start_timestamp"].first()
merged_sensor_episodes["end_timestamp"] = sensor_episodes_grouped["chunked_end_timestamp"].last()
merged_sensor_episodes.reset_index(inplace=True)
# Compute datetime
merged_sensor_episodes["local_start_date_time"] = pd.to_datetime(merged_sensor_episodes["start_timestamp"], unit="ms", utc=True)
merged_sensor_episodes["local_start_date_time"] = pd.concat([data["local_start_date_time"].dt.tz_convert(tz) for tz, data in merged_sensor_episodes.groupby("local_timezone")]).apply(lambda x: x.tz_localize(None).replace(microsecond=0))
merged_sensor_episodes["local_end_date_time"] = pd.to_datetime(merged_sensor_episodes["end_timestamp"], unit="ms", utc=True)
merged_sensor_episodes["local_end_date_time"] = pd.concat([data["local_end_date_time"].dt.tz_convert(tz) for tz, data in merged_sensor_episodes.groupby("local_timezone")]).apply(lambda x: x.tz_localize(None).replace(microsecond=0))
return merged_sensor_episodes
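To make the clipping arithmetic concrete: each episode is intersected with its segment window, and the overlap becomes the duration in minutes (hypothetical millisecond timestamps):

# episode: 09:59:00-10:05:00 UTC; segment: 10:00:00-10:30:00 UTC
chunked_start = max(1677664740000, 1677664800000)  # episode starts early -> clipped to segment start
chunked_end = min(1677665100000, 1677666600000)    # episode ends first -> episode end wins
duration = (chunked_end - chunked_start) / (1000 * 60)  # 300000 ms -> 5.0 minutes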