import pandas as pd
import numpy as np

# Question id -> every known wording of that question (the texts appear to be
# English, Dutch and Slovenian, in that order).
id2qc = {
    44: ["What have you mainly been doing within the last 10 minutes?",
         "Waar ben je voornamelijk mee bezig geweest de laatste 10 minuten?",
         "Kaj ste v glavnem počeli v zadnjih 10 minutah?"],
    45: ["What type of individual work?",
         "Wat voor soort individueel werk?",
         "Kakšno vrsto samostojnega dela ste opravljali?"],
    46: ["How did you work with others?",
         "Hoe heb je met anderen gewerkt?",
         "Kako ste sodelovali z drugimi?"],
    47: ["What type of break?",
         "Wat voor soort pauze?",
         "Kakšno vrsto odmora ste imeli?"],
    48: ["Where did you travel between?",
         "Waar heb je tussen gereisd?",
         "Kam ste potovali?"],
    49: ["Did you use a computer or phone for that?",
         "Heb je daarvoor een computer of telefoon gebruikt?",
         "Ste za to uporabljali računalnik ali telefon?"],
    50: ["What kind of an interaction was that?",
         "Wat voor interactie was dat?",
         "Kakšne vrste sodelovanja je bilo to?"],
    51: ["How many people were involved besides yourself?",
         "Hoeveel mensen waren er behalve jezelf betrokken?",
         "Koliko oseb je bilo poleg vas še vpletenih?"],
    # 52:["What have you mainly been doing within the last 10 minutes?",
    #     "Waar ben je voornamelijk mee bezig geweest de laatste 10 minuten?",
    #     "Kaj ste v glavnem počeli v zadnjih 10 minutah?"]
}

# Reverse lookup: question wording (in any language) -> question id.
qc2id = {v: k for k, values in id2qc.items() for v in values}

# Question id -> ids of the possible follow-up questions.
# NOTE(review): presumably indexed by the numeric answer; not used in this
# file -- confirm against the callers.
next_questions = {
    44: [45, 46, 47, 48],
    45: [49, 49],
    46: [50, 50],
    47: [],
    48: [],
    49: [],
    50: [51, 51],
    51: [],
    # 52:[45,46,47,48],
}

# Id of the main "last 10 minutes" question that opens every question chain.
_MAIN_QUESTION_ID = 44


def esm_activities_LTM_features(
    df_esm_activities_cleaned: pd.DataFrame,
) -> pd.DataFrame:
    """
    Function for calculating LTM(Last 10 minutes) features of questionnaire answers.

    It first corrects the question ids according to esm_instructions and the
    updated corpus of question_ids. It then processes each LTM question chain
    to find relevant social properties given by the answers such as the number
    of people interacted with, the formality and whether the socializing was
    done in person.

    A chain starts at every row whose corrected id is 44 and runs until the
    next such row. Rows that precede the first main question belong to no
    chain (q_group == -1) and are ignored when the answer sequences are built.

    Parameters
    ----------
    df_esm_activities_cleaned: pd.DataFrame
        A cleaned up dataframe, which must include esm_instructions,
        esm_user_answer_numeric. NOTE: it is modified in place -- the columns
        "correct_ids" and "q_group" are added to it.

    Returns
    -------
    pd.DataFrame
        The input dataframe joined (on its index) with columns which contain:
            ["correct_ids"] - Corrected question_ids
            ["q_group"] - 0-based number of the question chain the row belongs
                to, -1 for rows before the first main question
            ["n_others","inperson","formal"] - Properties of known potential
                social encounters as given by process_answers(). They are only
                filled on the main-question row of each chain; all other rows
                hold missing values.
        The intermediate per-chain answer sequence ("ans_seq") is not part of
        the returned dataframe.
    """
    # TODO: preprocess questionaires
    correct_id_df = correct_activity_qids(df_esm_activities_cleaned)

    ids = correct_id_df["correct_ids"]
    is_main = ids == _MAIN_QUESTION_ID
    main_q_indices = ids[is_main].index

    # Running count of main questions seen so far, shifted to be 0-based.
    # Assigned positionally so a non-unique index cannot break the alignment.
    correct_id_df["q_group"] = (is_main.cumsum() - 1).to_numpy()

    # Rows in group -1 have no main question; including them would create an
    # extra group and make the re-indexing below fail with a length mismatch.
    in_chain = correct_id_df[correct_id_df["q_group"] >= 0]
    ans_seq = (
        in_chain.groupby("q_group")["esm_user_answer_numeric"]
        .agg(list)
        .to_frame("ans_seq")
    )
    # Groups come out sorted by chain number, i.e. in the same order as the
    # main-question rows, so each sequence can be keyed by its main row.
    ans_seq.index = main_q_indices

    # find types of status for each main question:
    # socializing:[none,irl,online,unknown], num_people:[0,1,2,>2,unknown]
    processed_ans_df = process_answers(ans_seq)
    return df_esm_activities_cleaned.join(processed_ans_df)


# Possible answer sequences for LTM question chains
# (answer sequence -> interpretation)
#
# alone
#   0,0,0    not social
#   0,0,1    not social
#   0,1,0    not social
#   0,1,1    not social
#   0,2      not social
#   0,3      not social
#   0,4      not social
#   0,5      not social
#   0,6      not social
# w/ others
#   1,0,0,0  1 irl
#   1,0,0,1  2 irl
#   1,0,0,2  3+ irl
#   1,0,1,0  1 irl
#   1,0,1,1  2 irl
#   1,0,1,2  3+ irl
#   1,1,0,0  1 online
#   1,1,0,1  2 online
#   1,1,0,2  3+ online
#   1,1,1,0  1 online
#   1,1,1,1  2 online
#   1,1,1,2  3+ online
#   1,2      positive, likely to be more than 2
#   1,3      positive
# break
#   2,0      ambiguous
#   2,1      positive irl
#   2,2      ambiguous
#   2,3      ambiguous
# transit
#   3,0      ambiguous
#   3,1      ambiguous
#   3,2      ambiguous


def _classify_answer_sequence(ans_seq):
    """Derive (n_others, inperson, formal) from one LTM answer chain.

    Args:
        ans_seq: Sequence (list or array) of the numeric answers of one chain,
            starting with the answer to the main question.

    Returns:
        tuple: (n_others, inperson, formal) as described in process_answers().
        An attribute stays None when it cannot be determined, which includes
        chains that are empty or end before the answer that would decide it.
    """
    n_other = None
    inperson = None
    formal = None

    def answer(position):
        # Truncated chains are treated as "not answered" instead of raising.
        return ans_seq[position] if position < len(ans_seq) else None

    activity = answer(0)
    detail = answer(1)

    if activity == 0:
        # Worked alone.
        n_other = 0
    elif activity == 1:
        # Worked with others.
        if detail == 3:
            n_other = -1  # answered "Other" but did work with other people
        elif detail == 2:
            # assuming more than 2 people participated in the lecture or
            # presentation
            n_other = 3
        elif detail in (0, 1):
            # detail == 0 means irl interaction, == 1 means online or phone
            inperson = bool(detail == 0)
            formality = answer(2)
            if formality is not None:
                formal = bool(formality == 0)  # 0 means formal
            group_size = answer(3)
            if group_size is not None:
                # the answer is on [0,2] so we add 1 to make it [1,3]
                n_other = group_size + 1
    elif activity == 2:
        # Break: assuming one does not have a formal meeting during break time
        formal = False
        if detail == 1:
            n_other = -1
            inperson = True
        # if not 1 then we don't know anything for sure
    # activity == 3 (transit): we can't say whether the person was carpooling
    # or driving alone, so everything stays unknown.

    return n_other, inperson, formal


def process_answers(df: pd.DataFrame) -> pd.DataFrame:
    """
    Function to process answer sequences for LTM question chains. It checks the
    chain of subquestion answers and extracts the following attributes:
    > n_others: Number of other people interacted with in the last 10 minutes
        - -1: Number is positive but unknown exactly
        - 0: No people/alone
        - 1: One extra person
        - 2: Two extra people
        - 3: More than two extra people
        - None/NaN: Can't say anything with enough certainty.
    > inperson:
        - True/False: The interaction in question was/wasn't in person.
        - None: Can't say anything with enough certainty.
    > formal:
        - True/False: The interaction in question was/wasn't formal.
        - None: Can't say anything with enough certainty.

    Args:
        df (pd.DataFrame): Dataframe with a column "ans_seq" holding, per row,
            the sequence of numeric answers of one question chain.

    Returns:
        pd.DataFrame: Dataframe with the columns "n_others", "inperson" and
            "formal", one row per input row, sharing the index of ``df``.
    """
    properties = {"n_others": [], "inperson": [], "formal": []}
    for ans_seq in df["ans_seq"]:
        n_other, inperson, formal = _classify_answer_sequence(ans_seq)
        properties["n_others"].append(n_other)
        properties["inperson"].append(inperson)
        properties["formal"].append(formal)
    return pd.DataFrame(properties, index=df.index)


def correct_activity_qids(df: pd.DataFrame) -> pd.DataFrame:
    """Add the question id that corresponds to each row's question text.

    Args:
        df (pd.DataFrame): Dataframe with an "esm_instructions" column holding
            the question text. It is modified in place.

    Returns:
        pd.DataFrame: Input dataframe with added column "correct_ids"

    Raises:
        KeyError: If a question text is not present in ``qc2id``.
    """
    df["correct_ids"] = df["esm_instructions"].apply(lambda x: qc2id[x])
    return df


def process_answers_aggregation(df: pd.DataFrame) -> pd.DataFrame:
    """
    Function to process the answer sequence of a single LTM question chain,
    meant to be passed to ``DataFrameGroupBy.apply`` with one chain per group.
    It checks the chain of subquestion answers and extracts the following
    attributes:
    > n_others: Number of other people interacted with in the last 10 minutes
        - -1: Number is positive but unknown exactly
        - 0: No people/alone
        - 1: One extra person
        - 2: Two extra people
        - 3: More than two extra people
        - None/NaN: Can't say anything with enough certainty.
    > inperson:
        - True/False: The interaction in question was/wasn't in person.
        - None: Can't say anything with enough certainty.
    > formal:
        - True/False: The interaction in question was/wasn't formal.
        - None: Can't say anything with enough certainty.

    Args:
        df (pd.DataFrame): Rows of one question chain, in question order, with
            the column "esm_user_answer_numeric".

    Returns:
        pd.DataFrame: Copy of ``df`` with the columns "n_others", "inperson"
            and "formal" added; the chain's values are repeated on every row
            of the group.
    """
    ans_seq = df["esm_user_answer_numeric"].values
    n_other, inperson, formal = _classify_answer_sequence(ans_seq)
    # The properties are scalars for the whole chain, so broadcast them over
    # all rows of the group (a length-1 frame cannot take the group's index).
    return df.assign(n_others=n_other, inperson=inperson, formal=formal)


# test stuff
def test():
    """Manually compare both feature implementations on one participant.

    Requires the project module ``esm_preprocess`` and the raw data file
    "data/raw/p031/phone_esm_with_datetime.csv".

    Returns:
        pd.DataFrame: Result of ``DataFrame.compare`` between the per-segment
            implementation and esm_activities_LTM_features() on the
            main-question rows.
    """
    from esm_preprocess import preprocess_esm, clean_up_esm

    df = pd.read_csv("data/raw/p031/phone_esm_with_datetime.csv")
    df = preprocess_esm(df)
    df = clean_up_esm(df)
    # Copy so that the columns added below do not write into a slice.
    df = df[df["questionnaire_id"] == 97].copy()
    original = esm_activities_LTM_features(df)
    df["local_segment"] = [
        str(i) + ":" + j for i, j in df[["esm_session", "device_id"]].values
    ]
    # group_keys=False keeps the original row index on the applied result.
    temp = df.groupby("local_segment", group_keys=False)
    temp2 = temp.apply(process_answers_aggregation)

    # compare with original function results
    feature_cols = ["n_others", "inperson", "formal"]
    selection = original.loc[original["correct_ids"] == 44, feature_cols]
    temp_selection = temp2.loc[selection.index, feature_cols]
    return temp_selection.compare(selection, keep_shape=True, keep_equal=True)


# Manual check that needs project data; keep test runners from collecting it.
test.__test__ = False

# print out ans_seq processing results
# import json
# i = 0
# for j,ans in correct_id_df[["esm_json","esm_user_answer"]].values:
#     obj = json.loads(j)
#     text = obj["esm_instructions"]
#     if ("10 minut" in text):
#         print("---\n",test.ans_seq.iloc[i])
#         print(test[["n_others","inperson","formal"]].values[i])
#         i = i+1
#     print(text,ans)

#test()