From 4db8810d08ef7a673609939f09612e5fd641af1c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcel=20Martin=C5=A1ek?=
Date: Fri, 31 Mar 2023 13:08:15 +0000
Subject: [PATCH] corrected esm_features index column

---
 config.yaml                                | 31 ++++----
 .../phone_esm/straw/esm_activities.py      | 10 ++-
 src/features/phone_esm/straw/main.py       | 38 ++++++----
 src/features/phone_esm/straw/temp_help.py  | 70 +++++++++++++++++++
 4 files changed, 118 insertions(+), 31 deletions(-)
 create mode 100644 src/features/phone_esm/straw/temp_help.py

diff --git a/config.yaml b/config.yaml
index e9d522a5..827f962b 100644
--- a/config.yaml
+++ b/config.yaml
@@ -219,21 +219,22 @@ PHONE_CONVERSATION: # TODO Adapt for speech
 
 # See https://www.rapids.science/latest/features/phone-data-yield/
 PHONE_DATA_YIELD:
-  SENSORS: [#PHONE_ACCELEROMETER,
-            PHONE_ACTIVITY_RECOGNITION,
-            PHONE_APPLICATIONS_FOREGROUND,
-            PHONE_APPLICATIONS_NOTIFICATIONS,
-            PHONE_BATTERY,
-            PHONE_BLUETOOTH,
-            PHONE_CALLS,
-            PHONE_LIGHT,
-            PHONE_LOCATIONS,
-            PHONE_MESSAGES,
-            PHONE_SCREEN,
-            PHONE_WIFI_VISIBLE]
+  SENSORS: [ #PHONE_ACCELEROMETER,
+            #PHONE_ACTIVITY_RECOGNITION,
+            #PHONE_APPLICATIONS_FOREGROUND,
+            #PHONE_APPLICATIONS_NOTIFICATIONS,
+            #PHONE_BATTERY,
+            PHONE_BLUETOOTH #,
+            #PHONE_CALLS,
+            #PHONE_LIGHT,
+            #PHONE_LOCATIONS,
+            #PHONE_MESSAGES,
+            #PHONE_SCREEN,
+            #PHONE_WIFI_VISIBLE
+            ]
   PROVIDERS:
     RAPIDS:
-      COMPUTE: False
+      COMPUTE: True
       FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours]
       MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1, minimum percentage of valid minutes in an hour to be considered valid.
       SRC_SCRIPT: src/features/phone_data_yield/rapids/main.R
@@ -662,9 +663,9 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
 ALL_CLEANING_INDIVIDUAL:
   PROVIDERS:
     RAPIDS:
-      COMPUTE: False
+      COMPUTE: True
       IMPUTE_SELECTED_EVENT_FEATURES:
-        COMPUTE: False
+        COMPUTE: True
         MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
       COLS_NAN_THRESHOLD: 1 # set to 1 to disable
       COLS_VAR_THRESHOLD: True
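
A note on the SENSORS hunk above: commenting out entries inside a YAML flow sequence is easy to break, and the trailing "#," after PHONE_BLUETOOTH is what keeps the now one-element list valid. A minimal sketch (assuming PyYAML is available; the fragment is an abbreviated copy of the hunk) to check the parse:

    # Verify that the commented-out flow list still parses and that only
    # PHONE_BLUETOOTH remains active.
    import yaml

    fragment = """
    PHONE_DATA_YIELD:
      SENSORS: [ #PHONE_ACCELEROMETER,
                #PHONE_BATTERY,
                PHONE_BLUETOOTH #,
                #PHONE_WIFI_VISIBLE
                ]
    """

    parsed = yaml.safe_load(fragment)
    assert parsed["PHONE_DATA_YIELD"]["SENSORS"] == ["PHONE_BLUETOOTH"]
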
diff --git a/src/features/phone_esm/straw/esm_activities.py b/src/features/phone_esm/straw/esm_activities.py
index 38f166bd..16b6f72e 100644
--- a/src/features/phone_esm/straw/esm_activities.py
+++ b/src/features/phone_esm/straw/esm_activities.py
@@ -198,7 +198,7 @@ def correct_activity_qids(df:pd.DataFrame)->pd.DataFrame:
 
-def process_answers_aggregation(df:pd.DataFrame)-> pd.DataFrame:
+def process_answers_aggregation(df:pd.core.groupby.generic.DataFrameGroupBy)-> pd.core.groupby.generic.DataFrameGroupBy:
     """
     Function to process answer sequences for LTM question chains.
     It checks the chain of subquestion answers and extracts the following attributes:
        > n_others: Number of other people interacted with in the last 10 minutes
            - -1: Number is positive but unknown exactly
@@ -219,6 +219,8 @@ def process_answers_aggregation(df:pd.DataFrame)-> pd.DataFrame:
     Returns:
         pd.DataFrame: the group's rows with the extracted n_others, inperson and formal attributes joined as new columns
     """
+
+    #print("=======================\nAPPLY START:\ndf=",df.columns,df.local_segment)
 
     properties = {"n_others":[], "inperson":[], "formal":[]}
 
@@ -251,8 +253,10 @@ def process_answers_aggregation(df:pd.DataFrame)-> pd.DataFrame:
 
         properties["formal"].append(formal)
 
-    #df = df.join(pd.DataFrame(properties,index=df.index))
-    return pd.DataFrame(properties,index=df.index)
+    df = df.join(pd.DataFrame(properties,index=df.index))
+    #print("APPLY END:\ndf=",df[["n_others","inperson","formal"]])
+
+    return df
 
 
 
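
The switch from returning pd.DataFrame(properties, index=df.index) to returning df.join(...) changes what groupby(...).apply(process_answers_aggregation) yields at the call site in main.py: the joined result keeps each group's original columns, including local_segment, next to the extracted attributes. A toy sketch of the difference, with hypothetical data and a stand-in aggregation (not the real ESM answer-chain logic):

    import pandas as pd

    df = pd.DataFrame({"local_segment": ["seg_a", "seg_a", "seg_b"],
                       "esm_user_answer": ["2", "Yes", "0"]})

    def new_cols_only(group):
        # Old behaviour: the original columns are dropped from the result.
        return pd.DataFrame({"n_others": [1] * len(group)}, index=group.index)

    def joined(group):
        # New behaviour: original columns survive next to the new attributes.
        return group.join(pd.DataFrame({"n_others": [1] * len(group)}, index=group.index))

    print(df.groupby("local_segment").apply(new_cols_only).columns.tolist())
    # ['n_others']
    print(df.groupby("local_segment").apply(joined).columns.tolist())
    # ['local_segment', 'esm_user_answer', 'n_others']
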
diff --git a/src/features/phone_esm/straw/main.py b/src/features/phone_esm/straw/main.py
index 6729137a..943958ae 100644
--- a/src/features/phone_esm/straw/main.py
+++ b/src/features/phone_esm/straw/main.py
@@ -54,39 +54,39 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_seg
     features_to_compute = list(set(requested_features) & set(base_features_names))
     esm_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
     if not esm_data.empty:
-        # print(esm_data.head())
-        # print(time_segment)
         esm_data = filter_data_by_segment(esm_data, time_segment)
         if not esm_data.empty:
             esm_features = pd.DataFrame()
             for scale in requested_scales:
                 questionnaire_id = QUESTIONNAIRE_IDS[scale]
                 mask = esm_data["questionnaire_id"] == questionnaire_id
+                #print(esm_data.loc[mask].head())
+                #print(time_segment)
                 if not mask.any():
                     temp = sensor_data_files["sensor_data"]
-                    warnings.warn(f"Warning........... No relevant questions for scale {scale} in {temp}",RuntimeWarning)
+                    warnings.warn(f"Warning........... No relevant questions for scale {scale} in {temp}-{time_segment}",RuntimeWarning)
                     continue
                 #TODO: calculation of LTM features
                 if scale=="activities":
-                    requested_subset = [req[len("activities_"):] for req in requested_features if req.startswith("activities")]
+                    requested_subset = [req for req in requested_features if req.startswith("activities")]
                     if not bool(requested_subset):
                         continue
                     # ltm_features = esm_activities_LTM_features(esm_data.loc[mask])
                     # print(esm_data["esm_json"].values)
                     # print(mask)
                     # print(esm_data.loc[mask])
-                    # print(ltm_features)
                     # #ltm_features = ltm_features[ltm_features["correct_ids"==44]]
-                    print(esm_data["local_segment"])
-                    if(type(esm_data["local_segment"].values[0]) != str):
-                        raise Exception("wrong dtype of local_segment")
+                    #print(esm_data.loc[mask]["local_segment"])
                     ltm_features = esm_data.loc[mask].groupby(["local_segment"]).apply(process_answers_aggregation)
-                    print(ltm_features)
-                    esm_features[["activities_"+req for req in requested_subset]] = ltm_features[requested_subset]
+                    #print("PRINTING ltm_features:\n",ltm_features)
+                    ltm_features.rename(columns={"n_others":"activities_n_others","inperson":"activities_inperson","formal":"activities_formal"},inplace=True)
+                    esm_features[requested_subset] = ltm_features.groupby("local_segment").first()[requested_subset]
+                    #print(esm_features.columns)
+                    #print("PRINTING esm_features after rename:\n",ltm_features)
                     #FIXME: it might be an issue that I'm calculating for the whole time segment and not grouping by "local_segment"
-                    continue
-
-                esm_features[scale + "_mean"] = esm_data.loc[mask].groupby(["local_segment"])["esm_user_score"].mean()
+                    #print("~~~~~~~~~~~~~~~~~~~~~~~~===============================~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n LTM FEATURES STORED... AFTER RETURN:\n",ltm_features,esm_features[["activities_"+req for req in requested_subset]])
+                if("mean" in features_to_compute):
+                    esm_features[scale + "_mean"] = esm_data.loc[mask].groupby(["local_segment"])["esm_user_score"].mean() #TODO Create the column esm_user_score in esm_clean. Currently, this is only done when reversing.
 
     esm_features = esm_features.reset_index()
@@ -94,3 +94,15 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_seg
     esm_features.rename(columns={'index': 'local_segment'}, inplace=True)
 
     return esm_features
+
+def test_main():
+    import temp_help
+    provider = {
+        "FEATURES":["mean","activities_n_others","activities_inperson","activities_formal"],
+        "SCALES":['activities']
+    }
+    sensor_data_files = {"sensor_data":"data/interim/p069/phone_esm_clean.csv"}
+    s_feat = straw_features(sensor_data_files,"straw_event_stress_event_p069_110",provider,temp_help.filter_data_by_segment)
+    print(s_feat)
+
+#test_main()
\ No newline at end of file
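
The corrected indexing relies on ltm_features carrying one row per answered question while esm_features needs one row per local_segment; renaming to the activities_* feature names and then taking groupby("local_segment").first() collapses the duplicates. A sketch with stand-in values for what process_answers_aggregation would produce:

    import pandas as pd

    # Stand-in for groupby("local_segment").apply(process_answers_aggregation):
    # several answer rows per segment, each already carrying the attributes.
    ltm_features = pd.DataFrame({"local_segment": ["seg_a", "seg_a", "seg_b"],
                                 "n_others": [2, 2, 0],
                                 "inperson": [1, 1, 0],
                                 "formal": [0, 0, 0]})

    ltm_features.rename(columns={"n_others": "activities_n_others",
                                 "inperson": "activities_inperson",
                                 "formal": "activities_formal"}, inplace=True)

    requested_subset = ["activities_n_others", "activities_inperson", "activities_formal"]

    # One row per local_segment, ready to be assigned into esm_features.
    print(ltm_features.groupby("local_segment").first()[requested_subset])
    #                activities_n_others  activities_inperson  activities_formal
    # local_segment
    # seg_a                            2                    1                  0
    # seg_b                            0                    0                  0
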
diff --git a/src/features/phone_esm/straw/temp_help.py b/src/features/phone_esm/straw/temp_help.py
new file mode 100644
index 00000000..bc7b1ebc
--- /dev/null
+++ b/src/features/phone_esm/straw/temp_help.py
@@ -0,0 +1,70 @@
+"""This file is TEMPORARY and intended for testing main.py
+"""
+
+def filter_data_by_segment(data, time_segment):
+    data.dropna(subset=["assigned_segments"], inplace=True)
+    if(data.shape[0] == 0): # data is empty
+        data["local_segment"] = data["timestamps_segment"] = None
+        return data
+
+    datetime_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
+    timestamps_regex = "[0-9]{13}"
+    segment_regex = "\[({}#{},{};{},{})\]".format(time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex)
+    data["local_segment"] = data["assigned_segments"].str.extract(segment_regex, expand=True)
+    data = data.drop(columns=["assigned_segments"])
+    data = data.dropna(subset = ["local_segment"])
+    if(data.shape[0] == 0): # there are no rows belonging to time_segment after dropping NA
+        data["timestamps_segment"] = None
+    else:
+        data[["local_segment","timestamps_segment"]] = data["local_segment"].str.split(pat =";",n=1, expand=True)
+
+    # chunk episodes
+    if (not data.empty) and ("start_timestamp" in data.columns) and ("end_timestamp" in data.columns):
+        data = chunk_episodes(data)
+
+    return data
+
+def chunk_episodes(sensor_episodes):
+    import copy
+    import pandas as pd
+
+    # Deduplicate episodes
+    # Drop rows where segments of start_timestamp and end_timestamp are the same
+    sensor_episodes = sensor_episodes.drop_duplicates(subset=["start_timestamp", "end_timestamp", "local_segment"], keep="first")
+
+    # Delete useless columns
+    for drop_col in ["local_date_time", "local_date", "local_time", "local_hour", "local_minute"]:
+        del sensor_episodes[drop_col]
+
+    # Avoid SettingWithCopyWarning
+    sensor_episodes = sensor_episodes.copy()
+
+    # Unix timestamp for current segment in milliseconds
+    sensor_episodes[["segment_start_timestamp", "segment_end_timestamp"]] = sensor_episodes["timestamps_segment"].str.split(",", expand=True).astype(int)
+
+    # Compute chunked timestamp
+    sensor_episodes["chunked_start_timestamp"] = sensor_episodes[["start_timestamp", "segment_start_timestamp"]].max(axis=1)
+    sensor_episodes["chunked_end_timestamp"] = sensor_episodes[["end_timestamp", "segment_end_timestamp"]].min(axis=1)
+
+    # Compute duration: intersection of current row and segment
+    sensor_episodes["duration"] = (sensor_episodes["chunked_end_timestamp"] - sensor_episodes["chunked_start_timestamp"]) / (1000 * 60)
+
+    # Merge episodes
+    cols_for_groupby = [col for col in sensor_episodes.columns if col not in ["timestamps_segment", "timestamp", "assigned_segments", "start_datetime", "end_datetime", "start_timestamp", "end_timestamp", "duration", "chunked_start_timestamp", "chunked_end_timestamp"]]
+
+    sensor_episodes_grouped = sensor_episodes.groupby(by=cols_for_groupby, sort=False, dropna=False)
+    merged_sensor_episodes = sensor_episodes_grouped[["duration"]].sum()
+
+    merged_sensor_episodes["start_timestamp"] = sensor_episodes_grouped["chunked_start_timestamp"].first()
+    merged_sensor_episodes["end_timestamp"] = sensor_episodes_grouped["chunked_end_timestamp"].last()
+
+    merged_sensor_episodes.reset_index(inplace=True)
+
+    # Compute datetime
+    merged_sensor_episodes["local_start_date_time"] = pd.to_datetime(merged_sensor_episodes["start_timestamp"], unit="ms", utc=True)
+    merged_sensor_episodes["local_start_date_time"] = pd.concat([data["local_start_date_time"].dt.tz_convert(tz) for tz, data in merged_sensor_episodes.groupby("local_timezone")]).apply(lambda x: x.tz_localize(None).replace(microsecond=0))
+
+    merged_sensor_episodes["local_end_date_time"] = pd.to_datetime(merged_sensor_episodes["end_timestamp"], unit="ms", utc=True)
+    merged_sensor_episodes["local_end_date_time"] = pd.concat([data["local_end_date_time"].dt.tz_convert(tz) for tz, data in merged_sensor_episodes.groupby("local_timezone")]).apply(lambda x: x.tz_localize(None).replace(microsecond=0))
+
+    return merged_sensor_episodes
\ No newline at end of file
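
For context, the segment_regex built in filter_data_by_segment expects assigned_segments entries of the form [label#start_datetime,end_datetime;start_ts,end_ts], with the timestamps as 13-digit Unix milliseconds. A small demonstration with a made-up row (the label matches the segment used in test_main):

    import pandas as pd

    time_segment = "straw_event_stress_event_p069_110"
    datetime_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
    timestamps_regex = "[0-9]{13}"
    segment_regex = "\[({}#{},{};{},{})\]".format(time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex)

    # Hypothetical assigned_segments value for one sensor row.
    data = pd.DataFrame({"assigned_segments": [
        "[straw_event_stress_event_p069_110#2021-05-21 09:00:00,2021-05-21 10:00:00;1621587600000,1621591200000]"]})

    local_segment = data["assigned_segments"].str.extract(segment_regex, expand=True)
    print(local_segment[0].str.split(pat=";", n=1, expand=True))
    # column 0: "straw_event_stress_event_p069_110#2021-05-21 09:00:00,2021-05-21 10:00:00"
    # column 1: "1621587600000,1621591200000" (the timestamps_segment consumed by chunk_episodes)
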