corrected esm_features index column
parent
e7bb9d6702
commit
4db8810d08
31
config.yaml
31
config.yaml
|
@ -219,21 +219,22 @@ PHONE_CONVERSATION: # TODO Adapt for speech
|
|||
|
||||
# See https://www.rapids.science/latest/features/phone-data-yield/
|
||||
PHONE_DATA_YIELD:
|
||||
SENSORS: [#PHONE_ACCELEROMETER,
|
||||
PHONE_ACTIVITY_RECOGNITION,
|
||||
PHONE_APPLICATIONS_FOREGROUND,
|
||||
PHONE_APPLICATIONS_NOTIFICATIONS,
|
||||
PHONE_BATTERY,
|
||||
PHONE_BLUETOOTH,
|
||||
PHONE_CALLS,
|
||||
PHONE_LIGHT,
|
||||
PHONE_LOCATIONS,
|
||||
PHONE_MESSAGES,
|
||||
PHONE_SCREEN,
|
||||
PHONE_WIFI_VISIBLE]
|
||||
SENSORS: [ #PHONE_ACCELEROMETER,
|
||||
#PHONE_ACTIVITY_RECOGNITION,
|
||||
#PHONE_APPLICATIONS_FOREGROUND,
|
||||
#PHONE_APPLICATIONS_NOTIFICATIONS,
|
||||
#PHONE_BATTERY,
|
||||
PHONE_BLUETOOTH #,
|
||||
#PHONE_CALLS,
|
||||
#PHONE_LIGHT,
|
||||
#PHONE_LOCATIONS,
|
||||
#PHONE_MESSAGES,
|
||||
#PHONE_SCREEN,
|
||||
#PHONE_WIFI_VISIBLE
|
||||
]
|
||||
PROVIDERS:
|
||||
RAPIDS:
|
||||
COMPUTE: False
|
||||
COMPUTE: True
|
||||
FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours]
|
||||
MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1, minimum fraction of valid minutes an hour must contain to be considered valid.
|
||||
SRC_SCRIPT: src/features/phone_data_yield/rapids/main.R
|
||||
|
@ -662,9 +663,9 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
|
|||
ALL_CLEANING_INDIVIDUAL:
|
||||
PROVIDERS:
|
||||
RAPIDS:
|
||||
COMPUTE: False
|
||||
COMPUTE: True
|
||||
IMPUTE_SELECTED_EVENT_FEATURES:
|
||||
COMPUTE: False
|
||||
COMPUTE: True
|
||||
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
|
||||
COLS_NAN_THRESHOLD: 1 # set to 1 to disable
|
||||
COLS_VAR_THRESHOLD: True
|
||||
|
|
|
@ -198,7 +198,7 @@ def correct_activity_qids(df:pd.DataFrame)->pd.DataFrame:
|
|||
|
||||
|
||||
|
||||
def process_answers_aggregation(df:pd.DataFrame)-> pd.DataFrame:
|
||||
def process_answers_aggregation(df:pd.core.groupby.generic.DataFrameGroupBy)-> pd.core.groupby.generic.DataFrameGroupBy:
|
||||
""" Function to process answer sequences for LTM question chains. It checks the chain of subquestion answers and extracts the following attributes:
|
||||
> n_others: Number of other people interacted with in the last 10 minutes
|
||||
- -1: Number is positive but unknown exactly
|
||||
|
@ -219,6 +219,8 @@ def process_answers_aggregation(df:pd.DataFrame)-> pd.DataFrame:
|
|||
Returns:
|
||||
pd.DataFrame: _description_
|
||||
"""
|
||||
|
||||
#print("=======================\nAPPLY START:\ndf=",df.columns,df.local_segment)
|
||||
properties = {"n_others":[],
|
||||
"inperson":[],
|
||||
"formal":[]}
|
||||
|
@ -251,8 +253,10 @@ def process_answers_aggregation(df:pd.DataFrame)-> pd.DataFrame:
|
|||
properties["formal"].append(formal)
|
||||
|
||||
|
||||
#df = df.join(pd.DataFrame(properties,index=df.index))
|
||||
return pd.DataFrame(properties,index=df.index)
|
||||
df = df.join(pd.DataFrame(properties,index=df.index))
|
||||
#print("APPLY END:\ndf=",df[["n_others","inperson","formal"]])
|
||||
|
||||
return df
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -54,38 +54,38 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_seg
|
|||
features_to_compute = list(set(requested_features) & set(base_features_names))
|
||||
esm_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
|
||||
if not esm_data.empty:
|
||||
# print(esm_data.head())
|
||||
# print(time_segment)
|
||||
esm_data = filter_data_by_segment(esm_data, time_segment)
|
||||
if not esm_data.empty:
|
||||
esm_features = pd.DataFrame()
|
||||
for scale in requested_scales:
|
||||
questionnaire_id = QUESTIONNAIRE_IDS[scale]
|
||||
mask = esm_data["questionnaire_id"] == questionnaire_id
|
||||
#print(esm_data.loc[mask].head())
|
||||
#print(time_segment)
|
||||
if not mask.any():
|
||||
temp = sensor_data_files["sensor_data"]
|
||||
warnings.warn(f"Warning........... No relevant questions for scale {scale} in {temp}",RuntimeWarning)
|
||||
warnings.warn(f"Warning........... No relevant questions for scale {scale} in {temp}-{time_segment}",RuntimeWarning)
|
||||
continue
|
||||
#TODO: calculation of LTM features
|
||||
if scale=="activities":
|
||||
requested_subset = [req[len("activities_"):] for req in requested_features if req.startswith("activities")]
|
||||
requested_subset = [req for req in requested_features if req.startswith("activities")]
|
||||
if not bool(requested_subset):
|
||||
continue
|
||||
# ltm_features = esm_activities_LTM_features(esm_data.loc[mask])
|
||||
# print(esm_data["esm_json"].values)
|
||||
# print(mask)
|
||||
# print(esm_data.loc[mask])
|
||||
# print(ltm_features)
|
||||
# #ltm_features = ltm_features[ltm_features["correct_ids"==44]]
|
||||
print(esm_data["local_segment"])
|
||||
if(type(esm_data["local_segment"].values[0]) != str):
|
||||
raise Exception("wrong dtype of local_segment")
|
||||
#print(esm_data.loc[mask]["local_segment"])
|
||||
ltm_features = esm_data.loc[mask].groupby(["local_segment"]).apply(process_answers_aggregation)
|
||||
print(ltm_features)
|
||||
esm_features[["activities_"+req for req in requested_subset]] = ltm_features[requested_subset]
|
||||
#print("PRINTING ltm_features:\n",ltm_features)
|
||||
ltm_features.rename(columns={"n_others":"activities_n_others","inperson":"activities_inperson","formal":"activities_formal"},inplace=True)
|
||||
esm_features[requested_subset] = ltm_features.groupby("local_segment").first()[requested_subset]
|
||||
#print(esm_features.columns)
|
||||
#print("PRINTING esm_features after rename:\n",ltm_features)
|
||||
#FIXME: it might be an issue that im calculating for whole time segment and not grouping by "local segment"
|
||||
continue
|
||||
|
||||
#print("~~~~~~~~~~~~~~~~~~~~~~~~===============================~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n LTM FEATURES STORED... AFTER RETURN:\n",ltm_features,esm_features[["activities_"+req for req in requested_subset]])
|
||||
if("mean" in features_to_compute):
|
||||
esm_features[scale + "_mean"] = esm_data.loc[mask].groupby(["local_segment"])["esm_user_score"].mean()
|
||||
#TODO Create the column esm_user_score in esm_clean. Currently, this is only done when reversing.
|
||||
|
||||
|
@ -94,3 +94,15 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_seg
|
|||
esm_features.rename(columns={'index': 'local_segment'}, inplace=True)
|
||||
|
||||
return esm_features
|
||||
|
||||
def test_main():
    """Manual smoke test: run ``straw_features`` on one participant file and print the result.

    Uses the temporary ``temp_help`` module for the segment filter, so it is
    meant to be run by hand, not as part of an automated test suite.
    """
    import temp_help

    sensor_data_files = {"sensor_data": "data/interim/p069/phone_esm_clean.csv"}
    provider = {
        "FEATURES": ["mean", "activities_n_others", "activities_inperson", "activities_formal"],
        "SCALES": ['activities'],
    }
    features = straw_features(
        sensor_data_files,
        "straw_event_stress_event_p069_110",
        provider,
        temp_help.filter_data_by_segment,
    )
    print(features)
|
||||
|
||||
#test_main()
|
|
@ -0,0 +1,70 @@
|
|||
"""This file is TEMPORARY and intended for testing main.py
|
||||
"""
|
||||
|
||||
def filter_data_by_segment(data, time_segment):
    """Keep only the rows assigned to ``time_segment`` and label them.

    Each value of ``assigned_segments`` is searched for an entry of the form
    ``[<time_segment>#<start datetime>,<end datetime>;<start ms>,<end ms>]``.
    Rows without such an entry are dropped.  For the remaining rows the part
    before ``;`` is stored in ``local_segment`` and the part after it in
    ``timestamps_segment``; the ``assigned_segments`` column is removed.

    If the result is non-empty and has both ``start_timestamp`` and
    ``end_timestamp`` columns, it is additionally passed through
    ``chunk_episodes``.

    Args:
        data: DataFrame with an ``assigned_segments`` string column.  It is
            not modified; a new DataFrame is returned.
        time_segment: Segment label inserted verbatim into the regular
            expression used for matching.

    Returns:
        The filtered DataFrame.  When no row has a value in
        ``assigned_segments`` the (empty) frame still contains that column
        plus ``local_segment`` and ``timestamps_segment``.
    """
    # Non-inplace dropna: the caller's DataFrame must stay untouched.
    data = data.dropna(subset=["assigned_segments"])
    if data.shape[0] == 0:  # data is empty
        data["local_segment"] = data["timestamps_segment"] = None
        return data

    # Raw strings keep the patterns unchanged while avoiding the
    # invalid-escape-sequence warnings raised for "\-", "\/" and "\[".
    datetime_regex = r"[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
    timestamps_regex = r"[0-9]{13}"
    segment_regex = r"\[({}#{},{};{},{})\]".format(
        time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex
    )

    # Single capture group, so expand=False yields a Series for the new column.
    data["local_segment"] = data["assigned_segments"].str.extract(segment_regex, expand=False)
    data = data.drop(columns=["assigned_segments"])
    data = data.dropna(subset=["local_segment"])

    if data.shape[0] == 0:  # no rows belong to time_segment after dropping NA
        data["timestamps_segment"] = None
    else:
        # "<label>#<start>,<end>;<start ms>,<end ms>" -> label part / timestamps part
        data[["local_segment", "timestamps_segment"]] = data["local_segment"].str.split(pat=";", n=1, expand=True)

    # Chunk episodes
    if (not data.empty) and ("start_timestamp" in data.columns) and ("end_timestamp" in data.columns):
        data = chunk_episodes(data)

    return data
|
||||
|
||||
def chunk_episodes(sensor_episodes):
    """Clip episodes to their time segment and merge them per group.

    Steps:
      1. Drop rows that duplicate ``start_timestamp``, ``end_timestamp`` and
         ``local_segment``.
      2. Remove the per-row local date/time columns if they are present.
      3. Parse ``timestamps_segment`` ("<start ms>,<end ms>") into
         ``segment_start_timestamp`` / ``segment_end_timestamp``.
      4. Clip every episode to the segment bounds and compute ``duration`` in
         minutes from the clipped millisecond timestamps.
      5. Group by every remaining descriptive column, summing ``duration`` and
         keeping the first clipped start and last clipped end of each group.
      6. Add ``local_start_date_time`` / ``local_end_date_time``: the merged
         timestamps converted to each row's ``local_timezone``, made
         timezone-naive, with microseconds zeroed.

    Args:
        sensor_episodes: DataFrame with at least ``start_timestamp``,
            ``end_timestamp``, ``local_segment``, ``timestamps_segment`` and
            ``local_timezone`` columns.

    Returns:
        A new DataFrame with one row per merged group.
    """
    import pandas as pd

    # Deduplicate episodes: rows with the same start, end and segment are repeats.
    sensor_episodes = sensor_episodes.drop_duplicates(
        subset=["start_timestamp", "end_timestamp", "local_segment"], keep="first"
    )

    # Delete useless columns; errors="ignore" tolerates inputs that lack some of them.
    sensor_episodes = sensor_episodes.drop(
        columns=["local_date_time", "local_date", "local_time", "local_hour", "local_minute"],
        errors="ignore",
    )

    # Avoid SettingWithCopyWarning
    sensor_episodes = sensor_episodes.copy()

    # Unix timestamps of the current segment in milliseconds.  An explicit
    # 64-bit dtype is required: 13-digit values do not fit a 32-bit integer.
    sensor_episodes[["segment_start_timestamp", "segment_end_timestamp"]] = (
        sensor_episodes["timestamps_segment"].str.split(",", expand=True).astype("int64")
    )

    # Compute chunked timestamps: intersection of the episode with its segment.
    sensor_episodes["chunked_start_timestamp"] = sensor_episodes[["start_timestamp", "segment_start_timestamp"]].max(axis=1)
    sensor_episodes["chunked_end_timestamp"] = sensor_episodes[["end_timestamp", "segment_end_timestamp"]].min(axis=1)

    # Duration of that intersection, converted from milliseconds to minutes.
    sensor_episodes["duration"] = (sensor_episodes["chunked_end_timestamp"] - sensor_episodes["chunked_start_timestamp"]) / (1000 * 60)

    # Merge episodes: group by everything except the time-related columns.
    cols_for_groupby = [col for col in sensor_episodes.columns if col not in ["timestamps_segment", "timestamp", "assigned_segments", "start_datetime", "end_datetime", "start_timestamp", "end_timestamp", "duration", "chunked_start_timestamp", "chunked_end_timestamp"]]

    sensor_episodes_grouped = sensor_episodes.groupby(by=cols_for_groupby, sort=False, dropna=False)
    merged_sensor_episodes = sensor_episodes_grouped[["duration"]].sum()

    merged_sensor_episodes["start_timestamp"] = sensor_episodes_grouped["chunked_start_timestamp"].first()
    merged_sensor_episodes["end_timestamp"] = sensor_episodes_grouped["chunked_end_timestamp"].last()

    merged_sensor_episodes.reset_index(inplace=True)

    # Compute local datetimes for the merged start and end timestamps.
    for datetime_col, timestamp_col in (
        ("local_start_date_time", "start_timestamp"),
        ("local_end_date_time", "end_timestamp"),
    ):
        merged_sensor_episodes[datetime_col] = pd.to_datetime(merged_sensor_episodes[timestamp_col], unit="ms", utc=True)
        # Convert per timezone group, then strip tz info and microseconds;
        # the assignment re-aligns the concatenated pieces on the row index.
        merged_sensor_episodes[datetime_col] = pd.concat(
            [group[datetime_col].dt.tz_convert(tz) for tz, group in merged_sensor_episodes.groupby("local_timezone")]
        ).apply(lambda x: x.tz_localize(None).replace(microsecond=0))

    return merged_sensor_episodes
|
Loading…
Reference in New Issue