292 lines
11 KiB
Python
292 lines
11 KiB
Python
import pandas as pd
|
|
import numpy as np
|
|
|
|
# Question id -> every known wording of that question
# (English, Dutch and Slovenian wording, in that order).
id2qc = {
    44: [
        "What have you mainly been doing within the last 10 minutes?",
        "Waar ben je voornamelijk mee bezig geweest de laatste 10 minuten?",
        "Kaj ste v glavnem počeli v zadnjih 10 minutah?",
    ],
    45: [
        "What type of individual work?",
        "Wat voor soort individueel werk?",
        "Kakšno vrsto samostojnega dela ste opravljali?",
    ],
    46: [
        "How did you work with others?",
        "Hoe heb je met anderen gewerkt?",
        "Kako ste sodelovali z drugimi?",
    ],
    47: [
        "What type of break?",
        "Wat voor soort pauze?",
        "Kakšno vrsto odmora ste imeli?",
    ],
    48: [
        "Where did you travel between?",
        "Waar heb je tussen gereisd?",
        "Kam ste potovali?",
    ],
    49: [
        "Did you use a computer or phone for that?",
        "Heb je daarvoor een computer of telefoon gebruikt?",
        "Ste za to uporabljali računalnik ali telefon?",
    ],
    50: [
        "What kind of an interaction was that?",
        "Wat voor interactie was dat?",
        "Kakšne vrste sodelovanja je bilo to?",
    ],
    51: [
        "How many people were involved besides yourself?",
        "Hoeveel mensen waren er behalve jezelf betrokken?",
        "Koliko oseb je bilo poleg vas še vpletenih?",
    ],
    # Id 52 is kept disabled: it repeats the wording of id 44, so enabling it
    # would make qc2id resolve those texts to 52 instead of 44.
    # 52: [
    #     "What have you mainly been doing within the last 10 minutes?",
    #     "Waar ben je voornamelijk mee bezig geweest de laatste 10 minuten?",
    #     "Kaj ste v glavnem počeli v zadnjih 10 minutah?",
    # ],
}

# Reverse lookup: question wording (any language) -> question id.
qc2id = {
    wording: question_id
    for question_id, wordings in id2qc.items()
    for wording in wordings
}

# Question id -> ids of the follow-up questions it can lead to.
# NOTE(review): presumably one entry per answer option of the parent question
# (hence the repeated ids) -- confirm against the questionnaire definition.
next_questions = {
    44: [45, 46, 47, 48],
    45: [49, 49],
    46: [50, 50],
    47: [],
    48: [],
    49: [],
    50: [51, 51],
    51: [],
    # 52: [45, 46, 47, 48],
}
|
|
|
|
def esm_activities_LTM_features(
    df_esm_activities_cleaned: pd.DataFrame,
) -> pd.DataFrame:
    """Calculate LTM (last 10 minutes) features of questionnaire answers.

    The question ids are first corrected from the question wording
    (``esm_instructions``). Every occurrence of the main question (id 44)
    then opens a new question chain; the numeric answers of the chain are
    collected in order and handed to ``process_answers()`` to derive the
    social properties of that chain.

    Parameters
    ----------
    df_esm_activities_cleaned: pd.DataFrame
        A cleaned up dataframe, which must include esm_instructions and
        esm_user_answer_numeric. It is modified in place: the columns
        "correct_ids" and "q_group" are added to it.

    Returns
    -------
    pd.DataFrame
        The input dataframe joined (on its index) with:
            ["correct_ids"] - Corrected question ids.
            ["q_group"] - Zero-based number of the question chain the row
                belongs to; -1 for rows that precede the first main question.
            ["n_others", "inperson", "formal"] - Properties of known potential
                social encounters as given by process_answers(). They are only
                filled in on the rows of the main question (id 44) and are
                missing everywhere else.
    """
    # correct_activity_qids() writes "correct_ids" into the frame it is given
    # and returns that same object, so "q_group" below lands on the input too.
    correct_id_df = correct_activity_qids(df_esm_activities_cleaned)
    ids = correct_id_df["correct_ids"]

    is_main_question = ids == 44
    main_q_indices = ids[is_main_question].index

    # Each main question starts a new chain, so a running count of main
    # questions numbers the chains; rows before the first one end up at -1.
    correct_id_df["q_group"] = (is_main_question.cumsum() - 1).values

    # Leave out rows that do not belong to any chain; otherwise they would form
    # an extra group without a matching main-question index.
    chained_rows = correct_id_df[correct_id_df["q_group"] >= 0]
    sequences = [
        list(answers)
        for _, answers in chained_rows.groupby("q_group")["esm_user_answer_numeric"]
    ]
    # One answer sequence per main question, indexed by that question's row
    # label so the derived properties can be joined back onto the input.
    ans_seq = pd.Series(sequences, index=main_q_indices, dtype=object).to_frame("ans_seq")

    # Types of status per main question: socializing and number of people.
    processed_ans_df = process_answers(ans_seq)

    return df_esm_activities_cleaned.join(processed_ans_df)
|
|
|
|
|
|
"""
|
|
Possible answer sequences for LTM question chains.

Each row is the comma-separated sequence of numeric answers of one chain,
followed by what it says about social contact:
  "not social"          - the person was alone
  "<count> irl/online"  - number of other people, in person or online/phone
  "positive"            - with other people, exact number unknown
  "ambiguous"           - nothing can be concluded

#alone
0,0,0      not social
0,0,1      not social
0,1,0      not social
0,1,1      not social
0,2        not social
0,3        not social
0,4        not social
0,5        not social
0,6        not social

#w/ others
1,0,0,0    1  irl
1,0,0,1    2  irl
1,0,0,2    3+ irl
1,0,1,0    1  irl
1,0,1,1    2  irl
1,0,1,2    3+ irl
1,1,0,0    1  online
1,1,0,1    2  online
1,1,0,2    3+ online
1,1,1,0    1  online
1,1,1,1    2  online
1,1,1,2    3+ online
1,2        positive, likely to be more than 2
1,3        positive

#break
2,0        ambiguous
2,1        positive irl
2,2        ambiguous
2,3        ambiguous

#transit
3,0        ambiguous
3,1        ambiguous
3,2        ambiguous
|
|
"""
|
|
|
|
|
|
|
|
|
|
def _classify_answer_chain(ans_seq):
    """Return ``(n_others, inperson, formal)`` for one LTM answer chain."""

    def answer(position):
        # Chains can be cut short (not every follow-up was answered), so a
        # missing position reads as None instead of raising IndexError.
        return ans_seq[position] if position < len(ans_seq) else None

    n_other = None
    inperson = None
    formal = None

    main_answer = answer(0)
    sub_answer = answer(1)

    if main_answer == 0:
        # Worked alone.
        n_other = 0
    elif main_answer == 1:
        # Worked with others.
        if sub_answer == 3:
            n_other = -1  # answered "Other" but did work with other people
        elif sub_answer == 2:
            n_other = 3  # assuming more than 2 people participated in the lecture or presentation
        elif sub_answer in [0, 1]:
            inperson = sub_answer == 0  # 0 means irl interaction, 1 means online or phone
            formality_answer = answer(2)
            count_answer = answer(3)
            if formality_answer is not None:
                formal = formality_answer == 0  # 0 means formal
            if count_answer is not None:
                n_other = count_answer + 1  # answer is on [0,2] so we add 1 to make it [1,3]
    elif main_answer == 2:
        # Break.
        formal = False  # assuming one does not have a formal meeting during break time
        if sub_answer == 1:
            n_other = -1
            inperson = True
        # if not 1 then we don't know anything for sure
    # main_answer == 3 (transit): we can't say whether the person was
    # carpooling or driving alone, so everything stays unknown.

    return n_other, inperson, formal


def process_answers(df: pd.DataFrame) -> pd.DataFrame:
    """Process answer sequences for LTM question chains.

    It checks the chain of subquestion answers and extracts the following attributes:
    > n_others: Number of other people interacted with in the last 10 minutes
        - -1: Number is positive but unknown exactly
        - 0: No people/alone
        - 1: One extra person
        - 2: Two extra people
        - 3: More than two extra people
        - NaN : Can't say anything with enough certainty.
    > inperson:
        - True/False: The interaction in question was/wasn't in person.
        - None: Can't say anything with enough certainty.
    > formal:
        - True/False: The interaction in question was/wasn't formal.
        - None: Can't say anything with enough certainty.

    A chain that stops before the answer needed for an attribute leaves that
    attribute unknown.

    Args:
        df (pd.DataFrame): Frame with an "ans_seq" column; every entry is the
            ordered sequence of numeric answers of one question chain,
            starting with the answer to the main question.

    Returns:
        pd.DataFrame: Columns "n_others", "inperson" and "formal", one row per
            input row, sharing the index of ``df``.
    """
    properties = {"n_others": [], "inperson": [], "formal": []}

    for ans_seq in df["ans_seq"]:
        n_other, inperson, formal = _classify_answer_chain(ans_seq)
        properties["n_others"].append(n_other)
        properties["inperson"].append(inperson)
        properties["formal"].append(formal)

    return pd.DataFrame(properties, index=df.index)
|
|
|
|
def correct_activity_qids(df: pd.DataFrame) -> pd.DataFrame:
    """Derive the question id of every row from the wording of its question.

    Args:
        df (pd.DataFrame): Frame with an "esm_instructions" column holding the
            question text. It is modified in place.

    Returns:
        pd.DataFrame: Input dataframe with added column "correct_ids"

    Raises:
        KeyError: If a question text is not a key of ``qc2id``.
    """

    def _question_id_for(instruction_text):
        # Plain indexing on purpose: an unknown wording must fail loudly.
        return qc2id[instruction_text]

    corrected_ids = df["esm_instructions"].apply(_question_id_for)
    df["correct_ids"] = corrected_ids
    return df
|
|
|
|
|
|
|
|
def process_answers_aggregation(df: pd.DataFrame) -> pd.DataFrame:
    """Process the answer sequence of one LTM question chain.

    The rows of ``df`` are treated as a single chain, in row order. The chain
    of subquestion answers is checked and the following attributes extracted:
    > n_others: Number of other people interacted with in the last 10 minutes
        - -1: Number is positive but unknown exactly
        - 0: No people/alone
        - 1: One extra person
        - 2: Two extra people
        - 3: More than two extra people
        - NaN : Can't say anything with enough certainty.
    > inperson:
        - True/False: The interaction in question was/wasn't in person.
        - None: Can't say anything with enough certainty.
    > formal:
        - True/False: The interaction in question was/wasn't formal.
        - None: Can't say anything with enough certainty.

    A chain that stops before the answer needed for an attribute leaves that
    attribute unknown.

    Args:
        df (pd.DataFrame): Rows of one chain (e.g. one group handed over by
            ``groupby(...).apply``); must include "esm_user_answer_numeric",
            with the answer to the main question in the first row.

    Returns:
        pd.DataFrame: ``df`` joined with the columns "n_others", "inperson"
            and "formal"; every row of the chain carries the same values.
    """
    ans_seq = df["esm_user_answer_numeric"].values
    n_answers = len(ans_seq)

    # Chains can be cut short, so only read the positions that exist.
    main_answer = ans_seq[0] if n_answers > 0 else None
    sub_answer = ans_seq[1] if n_answers > 1 else None

    n_other = None
    inperson = None
    formal = None

    if main_answer == 0:
        # Worked alone.
        n_other = 0
    elif main_answer == 1:
        # Worked with others.
        if sub_answer == 3:
            n_other = -1  # answered "Other" but did work with other people
        elif sub_answer == 2:
            n_other = 3  # assuming more than 2 people participated in the lecture or presentation
        elif sub_answer in [0, 1]:
            inperson = sub_answer == 0  # 0 means irl interaction, 1 means online or phone
            if n_answers > 2:
                formal = ans_seq[2] == 0  # 0 means formal
            if n_answers > 3:
                n_other = ans_seq[3] + 1  # answer is on [0,2] so we add 1 to make it [1,3]
    elif main_answer == 2:
        # Break.
        formal = False  # assuming one does not have a formal meeting during break time
        if sub_answer == 1:
            n_other = -1
            inperson = True
        # if not 1 then we don't know anything for sure
    # main_answer == 3 (transit): we can't say whether the person was
    # carpooling or driving alone, so everything stays unknown.

    # Repeat the chain-level values once per row: the property columns must be
    # as long as the index they are attached to.
    n_rows = len(df)
    properties = pd.DataFrame(
        {
            "n_others": [n_other] * n_rows,
            "inperson": [inperson] * n_rows,
            "formal": [formal] * n_rows,
        },
        index=df.index,
    )
    return df.join(properties)
|
|
|
|
|
|
|
|
#test stuff
|
|
def test():
    """Ad-hoc comparison of the two feature implementations on one participant.

    Loads the raw ESM export of participant p031, keeps questionnaire 97 and
    computes the LTM properties twice: once for the whole frame with
    esm_activities_LTM_features() and once per "local_segment" with
    process_answers_aggregation(). The two results are lined up on the rows of
    the main question (id 44) and compared; the comparison result is not
    returned or printed.
    """
    from esm_preprocess import clean_up_esm, preprocess_esm

    raw = pd.read_csv("data/raw/p031/phone_esm_with_datetime.csv")
    cleaned = clean_up_esm(preprocess_esm(raw))
    df = cleaned[cleaned["questionnaire_id"] == 97]

    original = esm_activities_LTM_features(df)

    # One segment per (session, device) pair, labelled "<session>:<device>".
    session_device_pairs = df[["esm_session", "device_id"]].values
    df["local_segment"] = [
        str(session) + ":" + device for session, device in session_device_pairs
    ]
    per_segment = df.groupby("local_segment").apply(process_answers_aggregation)

    # compare with original function results
    main_question_rows = original[original["correct_ids"] == 44]
    selection = main_question_rows[["n_others", "inperson", "formal"]]
    temp_selection = per_segment.loc[selection.index]
    temp_selection.compare(selection, keep_shape=True, keep_equal=True)
|
|
|
|
#print out ans_seq processing results
|
|
# import json
|
|
# i = 0
|
|
# for j,ans in correct_id_df[["esm_json","esm_user_answer"]].values:
|
|
# obj = json.loads(j)
|
|
# text = obj["esm_instructions"]
|
|
# if ("10 minut" in text):
|
|
# print("---\n",test.ans_seq.iloc[i])
|
|
# print(test[["n_others","inperson","formal"]].values[i])
|
|
# i = i+1
|
|
# print(text,ans)
|
|
|
|
#test() |