corrected esm_features index column

sociality-task
Marcel Martinšek 2023-03-31 13:08:15 +00:00
parent e7bb9d6702
commit 4db8810d08
4 changed files with 118 additions and 31 deletions

View File

@@ -219,21 +219,22 @@ PHONE_CONVERSATION: # TODO Adapt for speech
# See https://www.rapids.science/latest/features/phone-data-yield/
PHONE_DATA_YIELD:
SENSORS: [#PHONE_ACCELEROMETER,
PHONE_ACTIVITY_RECOGNITION,
PHONE_APPLICATIONS_FOREGROUND,
PHONE_APPLICATIONS_NOTIFICATIONS,
PHONE_BATTERY,
PHONE_BLUETOOTH,
PHONE_CALLS,
PHONE_LIGHT,
PHONE_LOCATIONS,
PHONE_MESSAGES,
PHONE_SCREEN,
PHONE_WIFI_VISIBLE]
SENSORS: [ #PHONE_ACCELEROMETER,
#PHONE_ACTIVITY_RECOGNITION,
#PHONE_APPLICATIONS_FOREGROUND,
#PHONE_APPLICATIONS_NOTIFICATIONS,
#PHONE_BATTERY,
PHONE_BLUETOOTH #,
#PHONE_CALLS,
#PHONE_LIGHT,
#PHONE_LOCATIONS,
#PHONE_MESSAGES,
#PHONE_SCREEN,
#PHONE_WIFI_VISIBLE
]
PROVIDERS:
RAPIDS:
COMPUTE: False
COMPUTE: True
FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours]
MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1; minimum fraction of valid minutes for an hour to be considered valid.
SRC_SCRIPT: src/features/phone_data_yield/rapids/main.R
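Note: flipping COMPUTE to True enables the two yield features listed above. As a minimal sketch of what they measure, assuming a boolean minute_has_data series covering one time segment (the function name and input shape are illustrative, not the provider's actual internals):

import pandas as pd

def yield_ratios(minute_has_data: pd.Series, threshold: float = 0.5) -> dict:
    # minute_has_data: boolean, indexed by minute-resolution timestamps of one segment
    ratiovalidyieldedminutes = minute_has_data.mean()
    # An hour counts as valid when at least `threshold` of its minutes yielded data
    per_hour = minute_has_data.groupby(minute_has_data.index.floor("h")).mean()
    ratiovalidyieldedhours = (per_hour >= threshold).mean()
    return {"ratiovalidyieldedminutes": ratiovalidyieldedminutes,
            "ratiovalidyieldedhours": ratiovalidyieldedhours}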
@@ -662,9 +663,9 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
ALL_CLEANING_INDIVIDUAL:
PROVIDERS:
RAPIDS:
COMPUTE: False
COMPUTE: True
IMPUTE_SELECTED_EVENT_FEATURES:
COMPUTE: False
COMPUTE: True
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
COLS_NAN_THRESHOLD: 1 # set to 1 to disable
COLS_VAR_THRESHOLD: True

View File

@@ -198,7 +198,7 @@ def correct_activity_qids(df:pd.DataFrame)->pd.DataFrame:
def process_answers_aggregation(df:pd.DataFrame)-> pd.DataFrame:
def process_answers_aggregation(df:pd.core.groupby.generic.DataFrameGroupBy)-> pd.core.groupby.generic.DataFrameGroupBy:
""" Function to process answer sequences for LTM question chains. It checks the chain of subquestion answers and extracts the following attributes:
> n_others: Number of other people interacted with in the last 10 minutes
- -1: the number is positive but not exactly known
@@ -219,6 +219,8 @@ def process_answers_aggregation(df:pd.DataFrame)-> pd.DataFrame:
Returns:
pd.DataFrame: the input group with n_others, inperson and formal columns appended
"""
#print("=======================\nAPPLY START:\ndf=",df.columns,df.local_segment)
properties = {"n_others":[],
"inperson":[],
"formal":[]}
@@ -251,8 +253,10 @@ def process_answers_aggregation(df:pd.DataFrame)-> pd.DataFrame:
properties["formal"].append(formal)
#df = df.join(pd.DataFrame(properties,index=df.index))
return pd.DataFrame(properties,index=df.index)
df = df.join(pd.DataFrame(properties,index=df.index))
#print("APPLY END:\ndf=",df[["n_others","inperson","formal"]])
return df
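Note: returning df.join(pd.DataFrame(properties, index=df.index)) instead of the bare properties frame keeps every original column in the apply output; aligning on df.index attaches one derived value to each original row. A toy sketch of the pattern (data and column values made up):

import pandas as pd

def add_props(group: pd.DataFrame) -> pd.DataFrame:
    # One derived value per row; index alignment keeps values matched to the group's rows
    props = {"n_others": [len(group)] * len(group)}
    return group.join(pd.DataFrame(props, index=group.index))

answers = pd.DataFrame({"local_segment": ["s1", "s1", "s2"], "answer": [1, 2, 3]})
out = answers.groupby("local_segment").apply(add_props)

With the pandas behavior this pipeline relies on, local_segment survives as a column of the result, which is what lets main.py group ltm_features by it again downstream.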

View File

@@ -54,39 +54,39 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_seg
features_to_compute = list(set(requested_features) & set(base_features_names))
esm_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
if not esm_data.empty:
# print(esm_data.head())
# print(time_segment)
esm_data = filter_data_by_segment(esm_data, time_segment)
if not esm_data.empty:
esm_features = pd.DataFrame()
for scale in requested_scales:
questionnaire_id = QUESTIONNAIRE_IDS[scale]
mask = esm_data["questionnaire_id"] == questionnaire_id
#print(esm_data.loc[mask].head())
#print(time_segment)
if not mask.any():
temp = sensor_data_files["sensor_data"]
warnings.warn(f"Warning........... No relevant questions for scale {scale} in {temp}",RuntimeWarning)
warnings.warn(f"Warning........... No relevant questions for scale {scale} in {temp}-{time_segment}",RuntimeWarning)
continue
#TODO: calculation of LTM features
if scale=="activities":
requested_subset = [req[len("activities_"):] for req in requested_features if req.startswith("activities")]
requested_subset = [req for req in requested_features if req.startswith("activities")]
if not bool(requested_subset):
continue
# ltm_features = esm_activities_LTM_features(esm_data.loc[mask])
# print(esm_data["esm_json"].values)
# print(mask)
# print(esm_data.loc[mask])
# print(ltm_features)
# #ltm_features = ltm_features[ltm_features["correct_ids"==44]]
print(esm_data["local_segment"])
if(type(esm_data["local_segment"].values[0]) != str):
raise Exception("wrong dtype of local_segment")
#print(esm_data.loc[mask]["local_segment"])
ltm_features = esm_data.loc[mask].groupby(["local_segment"]).apply(process_answers_aggregation)
print(ltm_features)
esm_features[["activities_"+req for req in requested_subset]] = ltm_features[requested_subset]
#print("PRINTING ltm_features:\n",ltm_features)
ltm_features.rename(columns={"n_others":"activities_n_others","inperson":"activities_inperson","formal":"activities_formal"},inplace=True)
esm_features[requested_subset] = ltm_features.groupby("local_segment").first()[requested_subset]
#print(esm_features.columns)
#print("PRINTING esm_features after rename:\n",ltm_features)
#FIXME: it might be an issue that im calculating for whole time segment and not grouping by "local segment"
continue
esm_features[scale + "_mean"] = esm_data.loc[mask].groupby(["local_segment"])["esm_user_score"].mean()
#print("~~~~~~~~~~~~~~~~~~~~~~~~===============================~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n LTM FEATURES STORED... AFTER RETURN:\n",ltm_features,esm_features[["activities_"+req for req in requested_subset]])
if("mean" in features_to_compute):
esm_features[scale + "_mean"] = esm_data.loc[mask].groupby(["local_segment"])["esm_user_score"].mean()
#TODO Create the column esm_user_score in esm_clean. Currently, this is only done when reversing.
esm_features = esm_features.reset_index()
@@ -94,3 +94,15 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_seg
esm_features.rename(columns={'index': 'local_segment'}, inplace=True)
return esm_features
def test_main():
import temp_help
provider = {
"FEATURES":["mean","activities_n_others","activities_inperson","activities_formal"],
"SCALES":['activities']
}
sensor_data_files = {"sensor_data":"data/interim/p069/phone_esm_clean.csv"}
s_feat = straw_features(sensor_data_files,"straw_event_stress_event_p069_110",provider,temp_help.filter_data_by_segment)
print(s_feat)
#test_main()
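Note on the titular fix: esm_features is indexed by local_segment (the index the .mean() groupby produces), so the activities columns must also be collapsed to one row per local_segment before assignment, otherwise the indexes cannot align. A toy illustration of the alignment (values invented):

import pandas as pd

ltm_features = pd.DataFrame({"local_segment": ["s1", "s1", "s2"],
                             "activities_n_others": [2, 2, -1]})
esm_features = pd.DataFrame(index=["s1", "s2"])
# .first() yields one row per segment, indexed by local_segment,
# so the column assignment below aligns with esm_features' index
esm_features[["activities_n_others"]] = ltm_features.groupby("local_segment").first()[["activities_n_others"]]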

View File

@@ -0,0 +1,70 @@
"""This file is TEMPORARY and intended for testing main.py
"""
def filter_data_by_segment(data, time_segment):
data.dropna(subset=["assigned_segments"], inplace=True)
if(data.shape[0] == 0): # data is empty
data["local_segment"] = data["timestamps_segment"] = None
return data
# Raw strings so regex escapes like \- and \[ are not parsed as (invalid) Python string escapes
datetime_regex = r"[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
timestamps_regex = r"[0-9]{13}"
segment_regex = r"\[({}#{},{};{},{})\]".format(time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex)
data["local_segment"] = data["assigned_segments"].str.extract(segment_regex, expand=True)
data = data.drop(columns=["assigned_segments"])
data = data.dropna(subset = ["local_segment"])
if(data.shape[0] == 0): # there are no rows belonging to time_segment after dropping na
data["timestamps_segment"] = None
else:
data[["local_segment","timestamps_segment"]] = data["local_segment"].str.split(pat =";",n=1, expand=True)
# chunk episodes
if (not data.empty) and ("start_timestamp" in data.columns) and ("end_timestamp" in data.columns):
data = chunk_episodes(data)
return data
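A hypothetical row showing the assigned_segments format that segment_regex expects, [name#start_datetime,end_datetime;start_ts,end_ts] (the timestamps are invented for illustration):

import pandas as pd

row = pd.DataFrame({
    "assigned_segments": ["[straw_event_stress_event_p069_110#2023-03-01 10:00:00,2023-03-01 10:30:00;1677664800000,1677666600000]"],
    "esm_user_score": [3],
})
out = filter_data_by_segment(row, "straw_event_stress_event_p069_110")
# out["local_segment"]      -> "straw_event_stress_event_p069_110#2023-03-01 10:00:00,2023-03-01 10:30:00"
# out["timestamps_segment"] -> "1677664800000,1677666600000"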
def chunk_episodes(sensor_episodes):
import copy
import pandas as pd
# Deduplicate episodes
# Drop rows where segments of start_timestamp and end_timestamp are the same
sensor_episodes = sensor_episodes.drop_duplicates(subset=["start_timestamp", "end_timestamp", "local_segment"], keep="first")
# Delete useless columns
for drop_col in ["local_date_time", "local_date", "local_time", "local_hour", "local_minute"]:
del sensor_episodes[drop_col]
# Avoid SettingWithCopyWarning
sensor_episodes = sensor_episodes.copy()
# Unix timestamp for current segment in milliseconds
sensor_episodes[["segment_start_timestamp", "segment_end_timestamp"]] = sensor_episodes["timestamps_segment"].str.split(",", expand=True).astype(int)
# Compute chunked timestamp
sensor_episodes["chunked_start_timestamp"] = sensor_episodes[["start_timestamp", "segment_start_timestamp"]].max(axis=1)
sensor_episodes["chunked_end_timestamp"] = sensor_episodes[["end_timestamp", "segment_end_timestamp"]].min(axis=1)
# Compute duration: intersection of current row and segment
sensor_episodes["duration"] = (sensor_episodes["chunked_end_timestamp"] - sensor_episodes["chunked_start_timestamp"]) / (1000 * 60)
# Merge episodes
cols_for_groupby = [col for col in sensor_episodes.columns if col not in ["timestamps_segment", "timestamp", "assigned_segments", "start_datetime", "end_datetime", "start_timestamp", "end_timestamp", "duration", "chunked_start_timestamp", "chunked_end_timestamp"]]
sensor_episodes_grouped = sensor_episodes.groupby(by=cols_for_groupby, sort=False, dropna=False)
merged_sensor_episodes = sensor_episodes_grouped[["duration"]].sum()
merged_sensor_episodes["start_timestamp"] = sensor_episodes_grouped["chunked_start_timestamp"].first()
merged_sensor_episodes["end_timestamp"] = sensor_episodes_grouped["chunked_end_timestamp"].last()
merged_sensor_episodes.reset_index(inplace=True)
# Compute datetime
merged_sensor_episodes["local_start_date_time"] = pd.to_datetime(merged_sensor_episodes["start_timestamp"], unit="ms", utc=True)
merged_sensor_episodes["local_start_date_time"] = pd.concat([data["local_start_date_time"].dt.tz_convert(tz) for tz, data in merged_sensor_episodes.groupby("local_timezone")]).apply(lambda x: x.tz_localize(None).replace(microsecond=0))
merged_sensor_episodes["local_end_date_time"] = pd.to_datetime(merged_sensor_episodes["end_timestamp"], unit="ms", utc=True)
merged_sensor_episodes["local_end_date_time"] = pd.concat([data["local_end_date_time"].dt.tz_convert(tz) for tz, data in merged_sensor_episodes.groupby("local_timezone")]).apply(lambda x: x.tz_localize(None).replace(microsecond=0))
return merged_sensor_episodes
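To make the clipping arithmetic concrete: each episode is intersected with its segment window, and the overlap becomes the duration in minutes (hypothetical millisecond timestamps):

# episode: 09:59:00-10:05:00 UTC; segment: 10:00:00-10:30:00 UTC
chunked_start = max(1677664740000, 1677664800000)  # episode starts early -> clipped to segment start
chunked_end = min(1677665100000, 1677666600000)    # episode ends first -> episode end wins
duration = (chunked_end - chunked_start) / (1000 * 60)  # 300000 ms -> 5.0 minutes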