diff --git a/config.yaml b/config.yaml
index 1c106e30..e9d522a5 100644
--- a/config.yaml
+++ b/config.yaml
@@ -26,7 +26,7 @@ TIME_SEGMENTS: &time_segments
   INCLUDE_PAST_PERIODIC_SEGMENTS: TRUE # Only relevant if TYPE=PERIODIC, see docs
   TAILORED_EVENTS: # Only relevant if TYPE=EVENT
     COMPUTE: True
-    SEGMENTING_METHOD: "stress_event" # 30_before, 90_before, stress_event
+    SEGMENTING_METHOD: "10_before" # 10_before, 30_before, 90_before, stress_event
     INTERVAL_OF_INTEREST: 10 # duration of event of interest [minutes]
     IOI_ERROR_TOLERANCE: 5 # interval of interest error tolerance (before and after IOI) [minutes]

@@ -91,7 +91,7 @@ PHONE_ACTIVITY_RECOGNITION:
   EPISODE_THRESHOLD_BETWEEN_ROWS: 5 # minutes. Max time difference for two consecutive rows to be considered within the same AR episode.
   PROVIDERS:
     RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
      FEATURES: ["count", "mostcommonactivity", "countuniqueactivities", "durationstationary", "durationmobile", "durationvehicle"]
       ACTIVITY_CLASSES:
         STATIONARY: ["still", "tilting"]
@@ -120,7 +120,7 @@ PHONE_APPLICATIONS_FOREGROUND:
   SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
   PROVIDERS:
     RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
       INCLUDE_EPISODE_FEATURES: True
       SINGLE_CATEGORIES: ["all", "email"]
       MULTIPLE_CATEGORIES:
@@ -155,7 +155,7 @@ PHONE_BATTERY:
   EPISODE_THRESHOLD_BETWEEN_ROWS: 30 # minutes. Max time difference for two consecutive rows to be considered within the same battery episode.
   PROVIDERS:
     RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES: ["countdischarge", "sumdurationdischarge", "countcharge", "sumdurationcharge", "avgconsumptionrate", "maxconsumptionrate"]
       SRC_SCRIPT: src/features/phone_battery/rapids/main.py

@@ -169,7 +169,7 @@ PHONE_BLUETOOTH:
       SRC_SCRIPT: src/features/phone_bluetooth/rapids/main.R

     DORYAB:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES:
         ALL:
           DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"]
@@ -190,7 +190,7 @@ PHONE_CALLS:
   CONTAINER: call
   PROVIDERS:
     RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES_TYPE: EPISODES # EVENTS or EPISODES
       CALL_TYPES: [missed, incoming, outgoing]
       FEATURES:
@@ -233,7 +233,7 @@ PHONE_DATA_YIELD:
                     PHONE_WIFI_VISIBLE]
   PROVIDERS:
     RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours]
       MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1, minimum percentage of valid minutes in an hour to be considered valid.
       SRC_SCRIPT: src/features/phone_data_yield/rapids/main.R
@@ -243,9 +243,8 @@ PHONE_ESM:
   PROVIDERS:
     STRAW:
       COMPUTE: True
-      SCALES: ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support",
-               "appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]
-      FEATURES: [mean]
+      SCALES: ["activities"]
+      FEATURES: [activities_n_others, activities_inperson, activities_formal]
       SRC_SCRIPT: src/features/phone_esm/straw/main.py

 # See https://www.rapids.science/latest/features/phone-keyboard/
@@ -262,7 +261,7 @@ PHONE_LIGHT:
   CONTAINER: light_sensor
   PROVIDERS:
     RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES: ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"]
       SRC_SCRIPT: src/features/phone_light/rapids/main.py

@@ -276,7 +275,7 @@ PHONE_LOCATIONS:
   PROVIDERS:

     DORYAB:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES: ["locationvariance","loglocationvariance","totaldistance","avgspeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","avglengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome", "homelabel"]
       DBSCAN_EPS: 100 # meters
       DBSCAN_MINSAMPLES: 5
@@ -291,7 +290,7 @@ PHONE_LOCATIONS:
       SRC_SCRIPT: src/features/phone_locations/doryab/main.py

     BARNETT:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"]
       IF_MULTIPLE_TIMEZONES: USE_MOST_COMMON
       MINUTES_DATA_USED: False # Use this for quality control purposes, how many minutes of data (location coordinates grouped by minute) were used to compute features
@@ -309,7 +308,7 @@ PHONE_MESSAGES:
   CONTAINER: sms
   PROVIDERS:
     RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
       MESSAGES_TYPES : [received, sent]
       FEATURES:
         received: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]
@@ -321,7 +320,7 @@ PHONE_SCREEN:
   CONTAINER: screen
   PROVIDERS:
     RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
       REFERENCE_HOUR_FIRST_USE: 0
       IGNORE_EPISODES_SHORTER_THAN: 0 # in minutes, set to 0 to disable
       IGNORE_EPISODES_LONGER_THAN: 360 # in minutes, set to 0 to disable
@@ -334,7 +333,7 @@ PHONE_SPEECH:
   CONTAINER: speech
   PROVIDERS:
     STRAW:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES: ["meanspeech", "stdspeech", "nlargest", "nsmallest", "medianspeech"]
       SRC_SCRIPT: src/features/phone_speech/straw/main.py

@@ -352,7 +351,7 @@ PHONE_WIFI_VISIBLE:
   CONTAINER: wifi
   PROVIDERS:
     RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
       SRC_SCRIPT: src/features/phone_wifi_visible/rapids/main.R

@@ -521,10 +520,10 @@ EMPATICA_ACCELEROMETER:
       FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
       SRC_SCRIPT: src/features/empatica_accelerometer/dbdp/main.py
     CR:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES: ["totalMagnitudeBand", "absoluteMeanBand", "varianceBand"] # Acc features
       WINDOWS:
-        COMPUTE: True
+        COMPUTE: False
         WINDOW_LENGTH: 15 # specify window length in seconds
         SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows']
       SRC_SCRIPT: src/features/empatica_accelerometer/cr/main.py
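Note on the CR providers being toggled off above: they all share the WINDOWS mechanism, where first-order features are computed per fixed-length window and the SECOND_ORDER_FEATURES then aggregate those values over the whole segment. A minimal sketch of that aggregation with a toy series standing in for the per-window values; the exact semantics of nlargest/nsmallest in the CR scripts are an assumption here:

```python
import pandas as pd

# Toy first-order feature, one value per 15 s window (WINDOW_LENGTH above).
per_window = pd.Series([0.8, 1.1, 0.9, 1.4, 1.0], name="totalMagnitudeBand")

second_order = {
    "mean": per_window.mean(),
    "median": per_window.median(),
    "sd": per_window.std(),
    "nlargest": per_window.nlargest(3).mean(),    # assumed: mean of the n largest windows
    "nsmallest": per_window.nsmallest(3).mean(),  # assumed: mean of the n smallest windows
    "count_windows": per_window.count(),
}
print(second_order)
```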
FEATURES: ["maxtemp", "mintemp", "avgtemp", "mediantemp", "modetemp", "stdtemp", "diffmaxmodetemp", "diffminmodetemp", "entropytemp"] SRC_SCRIPT: src/features/empatica_temperature/dbdp/main.py CR: - COMPUTE: True + COMPUTE: False FEATURES: ["maximum", "minimum", "meanAbsChange", "longestStrikeAboveMean", "longestStrikeBelowMean", "stdDev", "median", "meanChange", "sumSquared", "squareSumOfComponent", "sumOfSquareComponents"] WINDOWS: - COMPUTE: True + COMPUTE: False WINDOW_LENGTH: 300 # specify window length in seconds SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows'] SRC_SCRIPT: src/features/empatica_temperature/cr/main.py @@ -566,14 +565,14 @@ EMPATICA_ELECTRODERMAL_ACTIVITY: FEATURES: ["maxeda", "mineda", "avgeda", "medianeda", "modeeda", "stdeda", "diffmaxmodeeda", "diffminmodeeda", "entropyeda"] SRC_SCRIPT: src/features/empatica_electrodermal_activity/dbdp/main.py CR: - COMPUTE: True + COMPUTE: False FEATURES: ['mean', 'std', 'q25', 'q75', 'qd', 'deriv', 'power', 'numPeaks', 'ratePeaks', 'powerPeaks', 'sumPosDeriv', 'propPosDeriv', 'derivTonic', 'sigTonicDifference', 'freqFeats','maxPeakAmplitudeChangeBefore', 'maxPeakAmplitudeChangeAfter', 'avgPeakAmplitudeChangeBefore', 'avgPeakAmplitudeChangeAfter', 'avgPeakChangeRatio', 'maxPeakIncreaseTime', 'maxPeakDecreaseTime', 'maxPeakDuration', 'maxPeakChangeRatio', 'avgPeakIncreaseTime', 'avgPeakDecreaseTime', 'avgPeakDuration', 'signalOverallChange', 'changeDuration', 'changeRate', 'significantIncrease', 'significantDecrease'] WINDOWS: - COMPUTE: True + COMPUTE: False WINDOW_LENGTH: 60 # specify window length in seconds SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', count_windows, eda_num_peaks_non_zero] IMPUTE_NANS: True @@ -592,7 +591,7 @@ EMPATICA_BLOOD_VOLUME_PULSE: FEATURES: ['meanHr', 'ibi', 'sdnn', 'sdsd', 'rmssd', 'pnn20', 'pnn50', 'sd', 'sd2', 'sd1/sd2', 'numRR', # Time features 'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features WINDOWS: - COMPUTE: True + COMPUTE: False WINDOW_LENGTH: 300 # specify window length in seconds SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows', 'hrv_num_windows_non_nan'] SRC_SCRIPT: src/features/empatica_blood_volume_pulse/cr/main.py @@ -606,12 +605,12 @@ EMPATICA_INTER_BEAT_INTERVAL: FEATURES: ["maxibi", "minibi", "avgibi", "medianibi", "modeibi", "stdibi", "diffmaxmodeibi", "diffminmodeibi", "entropyibi"] SRC_SCRIPT: src/features/empatica_inter_beat_interval/dbdp/main.py CR: - COMPUTE: True + COMPUTE: False FEATURES: ['meanHr', 'ibi', 'sdnn', 'sdsd', 'rmssd', 'pnn20', 'pnn50', 'sd', 'sd2', 'sd1/sd2', 'numRR', # Time features 'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features PATCH_WITH_BVP: True WINDOWS: - COMPUTE: True + COMPUTE: False WINDOW_LENGTH: 300 # specify window length in seconds SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows', 'hrv_num_windows_non_nan'] SRC_SCRIPT: src/features/empatica_inter_beat_interval/cr/main.py @@ -732,7 +731,7 @@ ALL_CLEANING_OVERALL: PARAMS_FOR_ANALYSIS: BASELINE: - COMPUTE: True + COMPUTE: False FOLDER: data/external/baseline CONTAINER: [results-survey637813_final.csv, # Slovenia results-survey358134_final.csv, # Belgium 1 @@ -743,7 +742,7 @@ PARAMS_FOR_ANALYSIS: CATEGORICAL_FEATURES: [gender] TARGET: - COMPUTE: True + COMPUTE: False LABEL: appraisal_stressfulness_event_mean ALL_LABELS: [appraisal_stressfulness_event_mean, appraisal_threat_mean, 
diff --git a/src/features/phone_esm/straw/esm_activities.py b/src/features/phone_esm/straw/esm_activities.py
new file mode 100644
index 00000000..38f166bd
--- /dev/null
+++ b/src/features/phone_esm/straw/esm_activities.py
@@ -0,0 +1,288 @@
+import pandas as pd
+import numpy as np
+
+# Question ids mapped to the English, Dutch and Slovenian variants of each question text.
+id2qc = {44: ["What have you mainly been doing within the last 10 minutes?",
+              "Waar ben je voornamelijk mee bezig geweest de laatste 10 minuten?",
+              "Kaj ste v glavnem počeli v zadnjih 10 minutah?"],
+         45: ["What type of individual work?",
+              "Wat voor soort individueel werk?",
+              "Kakšno vrsto samostojnega dela ste opravljali?"],
+         46: ["How did you work with others?",
+              "Hoe heb je met anderen gewerkt?",
+              "Kako ste sodelovali z drugimi?"],
+         47: ["What type of break?",
+              "Wat voor soort pauze?",
+              "Kakšno vrsto odmora ste imeli?"],
+         48: ["Where did you travel between?",
+              "Waar heb je tussen gereisd?",
+              "Kam ste potovali?"],
+         49: ["Did you use a computer or phone for that?",
+              "Heb je daarvoor een computer of telefoon gebruikt?",
+              "Ste za to uporabljali računalnik ali telefon?"],
+         50: ["What kind of an interaction was that?",
+              "Wat voor interactie was dat?",
+              "Kakšne vrste sodelovanja je bilo to?"],
+         51: ["How many people were involved besides yourself?",
+              "Hoeveel mensen waren er behalve jezelf betrokken?",
+              "Koliko oseb je bilo poleg vas še vpletenih?"],
+         # 52: ["What have you mainly been doing within the last 10 minutes?",
+         #      "Waar ben je voornamelijk mee bezig geweest de laatste 10 minuten?",
+         #      "Kaj ste v glavnem počeli v zadnjih 10 minutah?"]
+         }
+qc2id = {v: k for k, values in id2qc.items() for v in values}
+
+# For each question id, the follow-up question ids indexed by the numeric answer; an empty list ends the chain.
+next_questions = {44: [45, 46, 47, 48],
+                  45: [49, 49],
+                  46: [50, 50],
+                  47: [],
+                  48: [],
+                  49: [],
+                  50: [51, 51],
+                  51: []
+                  # 52: [45, 46, 47, 48],
+                  }
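The two dictionaries above encode the branching questionnaire: the numeric answer to each question selects the follow-up question, and an empty list ends the chain. A self-contained sketch of walking one chain; chain_for is a hypothetical helper for illustration, not part of this file:

```python
# Subset of the mapping above; answer k to question q leads to next_questions[q][k].
next_questions = {44: [45, 46, 47, 48], 46: [50, 50], 50: [51, 51], 51: []}

def chain_for(answers, qid=44):
    chain = [qid]
    for ans in answers:
        followups = next_questions.get(qid, [])
        if ans >= len(followups):
            break  # this answer has no follow-up question
        qid = followups[ans]
        chain.append(qid)
    return chain

# "worked with others" -> "kind of interaction" -> "how many people"
print(chain_for([1, 0, 1]))  # [44, 46, 50, 51]
```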
+ """ + #TODO: preprocess questionaires + #DONE: correct ids + correct_id_df = correct_activity_qids(df_esm_activities_cleaned) + #DONE: process subquestions + ids = correct_id_df["correct_ids"] + main_q_indices = ids[ids==44].index + q_group = [] + i=-1 + for id in ids: + if(id==44): + i=i+1 + q_group.append(i) + correct_id_df["q_group"] = q_group + ans_seq = correct_id_df.groupby("q_group").agg({"esm_user_answer_numeric":lambda group: list(group)}).rename(columns={"esm_user_answer_numeric":"ans_seq"}) + ans_seq.set_index(main_q_indices,inplace=True) + # correct_id_df["ans_seq"] = [[] for i in range(len(correct_id_df))] + # correct_id_df["ans_seq"].loc[main_q_indices] = correct_id_df.groupby("q_group").agg({"esm_user_answer_numeric":lambda group: list(group)}).values.reshape(-1) + #DONE: find types of status for each main question: socializing:[none,irl,online,unknown], num_people:[0,1,2,>2,unknown] + processed_ans_df = process_answers(ans_seq) + # df_out = df_esm_activities_cleaned.join(test) + return df_esm_activities_cleaned.join(processed_ans_df) + + +""" +possible answer sequences for LTM question chains + +#alone +0,0,0 not social +0,0,1 not social +0,1,0 not social +0,1,1 not social +0,2 not social +0,3 not social +0,4 not social +0,5 not social +0,6 not social +#w/ others +1,0,0,0 1 irl +1,0,0,1 2 irl +1,0,0,2 3+ irl +1,0,1,0 1 irl +1,0,1,1 2 irl +1,0,1,2 3+ irl +1,1,0,0 1 online +1,1,0,1 2 online +1,1,0,2 3+ online +1,1,1,0 1 online +1,1,1,1 2 online +1,1,1,2 3+ online +1,2 positive likely to be more than 2 +1,3 positive +#break +2,0 ambiguous +2,1 positive irl +2,2 ambiguous +2,3 ambiguous +#transit +3,0 ambiguous +3,1 ambiguous +3,2 ambiguous +""" + + + + +#TODO: docstring +def process_answers(df:pd.DataFrame)-> pd.DataFrame: + """ Function to process answer sequences for LTM question chains. It checks the chain of subquestion answers and extracts the following attributes: + > n_others: Number of other people interacted with in the last 10 minutes + - -1: Number is positive but unknown exactly + - 0: No people/alone + - 1: One extra person + - 2: Two extra people + - 3: More than two extra people + - NaN : Can't say anything with enough certainty. + > inperson: + - True/False: The interaction in question was/wasn't in person. + - None: Can't say anything with enough certainty. + > formal: + - True/False: The interaction in question was/wasn't formal. + - None: Can't say anything with enough certainty. + Args: + df (pd.DataFrame): _description_ + + Returns: + pd.DataFrame: _description_ + """ + properties = {"n_others":[], + "inperson":[], + "formal":[]} + for ans_seq in df["ans_seq"]: + n_other = None + inperson = None + formal = None + if(ans_seq[0]==0): + n_other = 0 + elif(ans_seq[0]==1): + if(ans_seq[1]==3): + n_other = -1 # anwsered "Other" but did work with other people + elif(ans_seq[1]==2): + n_other = 3 #assuming more than 2 people participated in the lecture or presentation + elif(ans_seq[1] in [0,1]): + inperson = ans_seq[1]==0 #ans[1]==0, means irl interaction, ==1 means online or phone + formal = ans_seq[2]==0#0 means formal + n_other = ans_seq[3]+1 #ans3 is on [0,2] so we add 1 to make it [1,3] + elif(ans_seq[0]==2): + formal = False#assuming one does not have a formal meeting during break time + if(ans_seq[1]==1): + n_other = -1 + inperson = True + #if not 1 then we dont know anythong for sure + elif(ans_seq[0]==3): + #we cant say whether the persion was carpooling or driving alone. 
+
+
+def correct_activity_qids(df: pd.DataFrame) -> pd.DataFrame:
+    """Looks up the correct question id for each row from the text of its esm_instructions.
+
+    Args:
+        df (pd.DataFrame): dataframe with an "esm_instructions" column
+
+    Returns:
+        pd.DataFrame: input dataframe with the added column "correct_ids"
+    """
+    df["correct_ids"] = df["esm_instructions"].apply(lambda x: qc2id[x])
+    return df
+
+
+def process_answers_aggregation(df: pd.DataFrame) -> pd.DataFrame:
+    """Processes one chain of LTM subquestion answers (e.g. one groupby group) and extracts the same attributes as process_answers():
+    > n_others: number of other people interacted with in the last 10 minutes
+        - -1: number is positive but not known exactly
+        - 0: no other people/alone
+        - 1: one extra person
+        - 2: two extra people
+        - 3: more than two extra people
+        - NaN: can't say anything with enough certainty
+    > inperson:
+        - True/False: the interaction in question was/wasn't in person
+        - None: can't say anything with enough certainty
+    > formal:
+        - True/False: the interaction in question was/wasn't formal
+        - None: can't say anything with enough certainty
+
+    Args:
+        df (pd.DataFrame): dataframe with the group's answers in the "esm_user_answer_numeric" column
+
+    Returns:
+        pd.DataFrame: single-row dataframe with the columns "n_others", "inperson" and "formal"
+    """
+    properties = {"n_others": [],
+                  "inperson": [],
+                  "formal": []}
+    ans_seq = df["esm_user_answer_numeric"].values
+    n_other = None
+    inperson = None
+    formal = None
+    if ans_seq[0] == 0:
+        n_other = 0
+    elif ans_seq[0] == 1:
+        if ans_seq[1] == 3:
+            n_other = -1  # answered "Other" but did work with other people
+        elif ans_seq[1] == 2:
+            n_other = 3  # assuming more than 2 people participated in the lecture or presentation
+        elif ans_seq[1] in [0, 1]:
+            inperson = ans_seq[1] == 0  # ans_seq[1]==0 means an in-person interaction, ==1 means online or phone
+            formal = ans_seq[2] == 0  # 0 means formal
+            n_other = ans_seq[3] + 1  # ans_seq[3] is on [0,2], so we add 1 to make it [1,3]
+    elif ans_seq[0] == 2:
+        formal = False  # assuming one does not have a formal meeting during break time
+        if ans_seq[1] == 1:
+            n_other = -1
+            inperson = True
+        # if not 1, then we don't know anything for sure
+    elif ans_seq[0] == 3:
+        # we can't say whether the person was carpooling or driving alone
+        pass
+    properties["n_others"].append(n_other)
+    properties["inperson"].append(inperson)
+    properties["formal"].append(formal)
+
+    # One answer chain produces exactly one feature row, so index it by the chain's first row (the main question).
+    return pd.DataFrame(properties, index=df.index[:1])
+
+
+# test stuff
+def test():
+    from esm_preprocess import preprocess_esm, clean_up_esm
+
+    df = pd.read_csv("data/raw/p031/phone_esm_with_datetime.csv")
+    df = preprocess_esm(df)
+    df = clean_up_esm(df)
+    df = df[df["questionnaire_id"] == 97]
+    original = esm_activities_LTM_features(df)
+    df["local_segment"] = [str(i) + ":" + j for i, j in df[["esm_session", "device_id"]].values]
+    temp = df.groupby("local_segment")
+    temp2 = temp.apply(process_answers_aggregation)
+
+    # compare with original function results
+    selection = original[original["correct_ids"] == 44][["n_others", "inperson", "formal"]]
+    temp_selection = temp2.loc[selection.index]
+    temp_selection.compare(selection, keep_shape=True, keep_equal=True)
+
+
+# test()
\ No newline at end of file
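End to end, the aggregation variant is meant to be applied per time segment, as test() above and main.py below do. A toy run under that assumption, with one four-answer chain in a single segment:

```python
import pandas as pd
from esm_activities import process_answers_aggregation

esm = pd.DataFrame({
    "local_segment": ["seg1"] * 4,
    "esm_user_answer_numeric": [1, 0, 0, 1],  # with others, in person, formal, 2 people
})
features = esm.groupby("local_segment").apply(process_answers_aggregation)
print(features)  # one row per segment: n_others=2, inperson=True, formal=True
```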
diff --git a/src/features/phone_esm/straw/main.py b/src/features/phone_esm/straw/main.py
index 8a55b8eb..6729137a 100644
--- a/src/features/phone_esm/straw/main.py
+++ b/src/features/phone_esm/straw/main.py
@@ -1,4 +1,8 @@
 import pandas as pd
+import sys
+import warnings
+sys.path.append('src/features/phone_esm/straw')
+from esm_activities import esm_activities_LTM_features, process_answers_aggregation

 QUESTIONNAIRE_IDS = {
     "sleep_quality": 1,
@@ -39,23 +43,49 @@ QUESTIONNAIRE_IDS = {

 def straw_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
     esm_data = pd.read_csv(sensor_data_files["sensor_data"])
+
     requested_features = provider["FEATURES"]  # name of the features this function can compute
     requested_scales = provider["SCALES"]
     base_features_names = ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support",
-                           "appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]
+                           "appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge",
+                           "activities_n_others", "activities_inperson", "activities_formal"]
     # TODO: check valid questionnaire and feature names.
     # the subset of requested features this function can compute
     features_to_compute = list(set(requested_features) & set(base_features_names))
     esm_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
     if not esm_data.empty:
         esm_data = filter_data_by_segment(esm_data, time_segment)
-
         if not esm_data.empty:
             esm_features = pd.DataFrame()
             for scale in requested_scales:
                 questionnaire_id = QUESTIONNAIRE_IDS[scale]
                 mask = esm_data["questionnaire_id"] == questionnaire_id
+                if not mask.any():
+                    temp = sensor_data_files["sensor_data"]
+                    warnings.warn(f"No relevant questions for scale {scale} in {temp}", RuntimeWarning)
+                    continue
+                # LTM (activities) features go through the chain-processing helpers instead of a plain mean.
+                if scale == "activities":
+                    requested_subset = [req[len("activities_"):] for req in requested_features if req.startswith("activities")]
+                    if not requested_subset:
+                        continue
+                    if type(esm_data["local_segment"].values[0]) != str:
+                        raise Exception("wrong dtype of local_segment")
+                    ltm_features = esm_data.loc[mask].groupby(["local_segment"]).apply(process_answers_aggregation)
+                    esm_features[["activities_" + req for req in requested_subset]] = ltm_features[requested_subset]
+                    # FIXME: it might be an issue that this is calculated for the whole time segment rather than per "local_segment"
+                    continue
+
                 esm_features[scale + "_mean"] = esm_data.loc[mask].groupby(["local_segment"])["esm_user_score"].mean()
                 # TODO: create the column esm_user_score in esm_clean. Currently, this is only done when reversing.
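The activities branch derives its column names by stripping and re-adding the "activities_" prefix, so the config's FEATURES list maps one-to-one onto the helper's output columns:

```python
requested_features = ["activities_n_others", "activities_inperson", "activities_formal"]

# Strip the prefix to address the helper's output columns...
requested_subset = [f[len("activities_"):] for f in requested_features if f.startswith("activities")]
print(requested_subset)  # ['n_others', 'inperson', 'formal']

# ...and re-add it for the final feature columns.
print(["activities_" + f for f in requested_subset])  # ['activities_n_others', ...]
```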
diff --git a/src/features/phone_esm/straw/process_user_event_related_segments.py b/src/features/phone_esm/straw/process_user_event_related_segments.py
index 03eeb052..c737d461 100644
--- a/src/features/phone_esm/straw/process_user_event_related_segments.py
+++ b/src/features/phone_esm/straw/process_user_event_related_segments.py
@@ -67,7 +67,7 @@ def extract_ers(esm_df):

     segmenting_method = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["SEGMENTING_METHOD"]

-    if segmenting_method in ["30_before", "90_before"]: # takes 30-minute peroid before the questionnaire + the duration of the questionnaire
+    if segmenting_method in ["10_before", "30_before", "90_before"]: # takes the x-minute period before the questionnaire + the duration of the questionnaire
         """
         '30 minutes before' and '90 minutes before' have the same fundamental logic with a couple of deviations that will be explained below.
         Both take the x-minute period before the questionnaire, which is summed with the questionnaire duration.
         All questionnaire durations over 15 minutes are excluded from the querying.
         """
@@ -79,7 +79,18 @@ def extract_ers(esm_df):
         extracted_ers = extracted_ers[extracted_ers["timestamp"] <= 15 * 60].reset_index(drop=True) # ensure that the longest duration of the questionnaire answering is 15 min
         extracted_ers["shift_direction"] = -1

-        if segmenting_method == "30_before":
+        if segmenting_method == "10_before":
+            """The method '10 minutes before' simply takes the 10 minutes before the questionnaire and sums them with the questionnaire duration.
+            The timestamps are formatted with the help of the format_timestamp() method.
+            """
+            time_before_questionnaire = 10 * 60  # in seconds (10 minutes)
+            # TODO: split into small segments by manipulating length and shift
+            extracted_ers["length"] = (extracted_ers["timestamp"] + time_before_questionnaire).apply(lambda x: format_timestamp(x))
+            extracted_ers["shift"] = time_before_questionnaire
+            extracted_ers["shift"] = extracted_ers["shift"].apply(lambda x: format_timestamp(x))
+
+        elif segmenting_method == "30_before":
             """The method '30 minutes before' simply takes the 30 minutes before the questionnaire and sums them with the questionnaire duration.
             The timestamps are formatted with the help of the format_timestamp() method.
             """
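Numerically, "10_before" yields a segment of questionnaire duration plus 600 s, shifted 600 s back from the first answer. A sketch of just the arithmetic, with the repo's format_timestamp() helper omitted since its exact output format lives elsewhere in this module:

```python
import pandas as pd

time_before_questionnaire = 10 * 60  # seconds

# Toy questionnaire durations: seconds from first to last ESM answer.
extracted_ers = pd.DataFrame({"timestamp": [120, 540]})
extracted_ers["length"] = extracted_ers["timestamp"] + time_before_questionnaire  # 720, 1140
extracted_ers["shift"] = time_before_questionnaire                                # 600
extracted_ers["shift_direction"] = -1  # shift backwards in time from the event
print(extracted_ers)
```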
""" diff --git a/src/features/utils/utils.py b/src/features/utils/utils.py index 8a4d2130..1b632e11 100644 --- a/src/features/utils/utils.py +++ b/src/features/utils/utils.py @@ -14,6 +14,7 @@ def import_path(path): sys.modules[module_name] = module return module +#TODO:check why segments change to int def filter_data_by_segment(data, time_segment): data.dropna(subset=["assigned_segments"], inplace=True) if(data.shape[0] == 0): # data is empty @@ -151,6 +152,7 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file else: segment_colums = pd.DataFrame() + print(sensor_features,sensor_features['local_segment']) sensor_features['local_segment'] = sensor_features['local_segment'].str.replace(r'_RR\d+SS', '') split_segemnt_columns = sensor_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True) new_segment_columns = split_segemnt_columns.iloc[:,1:4] if split_segemnt_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime","local_segment_end_datetime"])