# rapids/src/features/phone_esm/straw/esm_activities.py
# (repository listing metadata: 292 lines, 11 KiB, Python)

import pandas as pd
import numpy as np
# Question id -> every known wording of that question. Each entry holds three
# wordings of the same prompt (the first one is English).
id2qc = {
    44: [
        "What have you mainly been doing within the last 10 minutes?",
        "Waar ben je voornamelijk mee bezig geweest de laatste 10 minuten?",
        "Kaj ste v glavnem počeli v zadnjih 10 minutah?",
    ],
    45: [
        "What type of individual work?",
        "Wat voor soort individueel werk?",
        "Kakšno vrsto samostojnega dela ste opravljali?",
    ],
    46: [
        "How did you work with others?",
        "Hoe heb je met anderen gewerkt?",
        "Kako ste sodelovali z drugimi?",
    ],
    47: [
        "What type of break?",
        "Wat voor soort pauze?",
        "Kakšno vrsto odmora ste imeli?",
    ],
    48: [
        "Where did you travel between?",
        "Waar heb je tussen gereisd?",
        "Kam ste potovali?",
    ],
    49: [
        "Did you use a computer or phone for that?",
        "Heb je daarvoor een computer of telefoon gebruikt?",
        "Ste za to uporabljali računalnik ali telefon?",
    ],
    50: [
        "What kind of an interaction was that?",
        "Wat voor interactie was dat?",
        "Kakšne vrste sodelovanja je bilo to?",
    ],
    51: [
        "How many people were involved besides yourself?",
        "Hoeveel mensen waren er behalve jezelf betrokken?",
        "Koliko oseb je bilo poleg vas še vpletenih?",
    ],
    # Id 52 is disabled: it uses exactly the same wordings as id 44, so
    # enabling it would make qc2id resolve those wordings to 52 instead of 44.
    # 52: ["What have you mainly been doing within the last 10 minutes?",
    #      "Waar ben je voornamelijk mee bezig geweest de laatste 10 minuten?",
    #      "Kaj ste v glavnem počeli v zadnjih 10 minutah?"],
}

# Reverse lookup: question wording (in any language) -> question id.
qc2id = {
    wording: question_id
    for question_id, wordings in id2qc.items()
    for wording in wordings
}

# Question id -> ids of the questions that may follow it; an empty list ends
# the chain.
# NOTE(review): the lists look like they are indexed by the numeric answer
# (44 has four entries, the repeated ids would then mean "same follow-up for
# either answer") - confirm before relying on it.
next_questions = {
    44: [45, 46, 47, 48],
    45: [49, 49],
    46: [50, 50],
    47: [],
    48: [],
    49: [],
    50: [51, 51],
    51: [],
    # 52: [45, 46, 47, 48],
}
def esm_activities_LTM_features(
    df_esm_activities_cleaned: pd.DataFrame,
) -> pd.DataFrame:
    """Compute social-context features for LTM (last 10 minutes) question chains.

    The question ids are first corrected from the question wording with
    correct_activity_qids(). The rows are then split into chains: every row
    whose corrected id is 44 (the main LTM question) opens a new chain, and the
    rows that follow it, up to the next main question, are its sub-questions.
    The numeric answers of each chain are handed to process_answers(), which
    derives the number of people interacted with, whether the interaction was
    in person and whether it was formal.

    Rows located before the first main question belong to no chain; they get
    q_group == -1 and no features.

    Parameters
    ----------
    df_esm_activities_cleaned: pd.DataFrame
        A cleaned up dataframe, which must include esm_instructions and
        esm_user_answer_numeric. The rows are assumed to be in the order in
        which the questions were asked.
        NOTE: the frame is modified in place - the columns "correct_ids"
        (added by correct_activity_qids) and "q_group" are added to it.

    Returns
    -------
    pd.DataFrame
        The input dataframe joined on its index with:
        ["correct_ids"] - Corrected question_ids.
        ["q_group"] - Zero-based number of the chain a row belongs to.
        ["n_others","inperson","formal"] - Properties of known potential
        social encounters as given by process_answers(). They are set on the
        row of the main question of each chain and missing on all other rows.
    """
    main_question_id = 44  # id of the question that opens every LTM chain

    correct_id_df = correct_activity_qids(df_esm_activities_cleaned)
    ids = correct_id_df["correct_ids"]
    is_main_question = (ids == main_question_id).to_numpy()
    main_q_indices = ids.index[is_main_question]

    # The running count of main questions seen so far, minus one, numbers the
    # chains 0, 1, 2, ...; rows ahead of the first main question end up at -1.
    q_group = is_main_question.cumsum() - 1
    correct_id_df["q_group"] = q_group

    # Collect the answers of every chain in row order. Orphan rows (-1) are
    # left out so that there is exactly one sequence per main question.
    in_chain = q_group >= 0
    answers = correct_id_df["esm_user_answer_numeric"][in_chain]
    sequences = [list(chain) for _, chain in answers.groupby(q_group[in_chain])]
    ans_seq = pd.DataFrame({"ans_seq": sequences}, index=main_q_indices)

    processed_ans_df = process_answers(ans_seq)
    return df_esm_activities_cleaned.join(processed_ans_df)
"""
possible answer sequences for LTM question chains
#alone
0,0,0 not social
0,0,1 not social
0,1,0 not social
0,1,1 not social
0,2 not social
0,3 not social
0,4 not social
0,5 not social
0,6 not social
#w/ others
1,0,0,0 1 irl
1,0,0,1 2 irl
1,0,0,2 3+ irl
1,0,1,0 1 irl
1,0,1,1 2 irl
1,0,1,2 3+ irl
1,1,0,0 1 online
1,1,0,1 2 online
1,1,0,2 3+ online
1,1,1,0 1 online
1,1,1,1 2 online
1,1,1,2 3+ online
1,2 positive likely to be more than 2
1,3 positive
#break
2,0 ambiguous
2,1 positive irl
2,2 ambiguous
2,3 ambiguous
#transit
3,0 ambiguous
3,1 ambiguous
3,2 ambiguous
"""
def _classify_answer_chain(ans_seq):
    """Derive (n_others, inperson, formal) from the answers of one LTM chain."""

    def answer_at(position):
        # Unfinished questionnaires yield shorter chains; treat the answers
        # that were never given as unknown instead of raising IndexError.
        return ans_seq[position] if position < len(ans_seq) else None

    n_other = None
    inperson = None
    formal = None

    main_answer = answer_at(0)
    sub_answer = answer_at(1)
    if main_answer == 0:
        # worked alone
        n_other = 0
    elif main_answer == 1:
        # worked with others
        if sub_answer == 3:
            n_other = -1  # answered "Other" but did work with other people
        elif sub_answer == 2:
            n_other = 3  # assuming more than 2 people participated in the lecture or presentation
        elif sub_answer in (0, 1):
            inperson = sub_answer == 0  # 0 means irl interaction, 1 means online or phone
            formality_answer = answer_at(2)
            if formality_answer is not None:
                formal = formality_answer == 0  # 0 means formal
            count_answer = answer_at(3)
            if count_answer is None:
                n_other = -1  # others were involved, but the count was never given
            else:
                n_other = count_answer + 1  # the answer is on [0,2], so add 1 to make it [1,3]
    elif main_answer == 2:
        # break
        formal = False  # assuming one does not have a formal meeting during break time
        if sub_answer == 1:
            n_other = -1
            inperson = True
        # any other type of break tells us nothing for sure
    # main_answer == 3 (transit): we cannot say whether the person was
    # carpooling or driving alone, so everything stays unknown.
    return n_other, inperson, formal


def process_answers(df:pd.DataFrame)-> pd.DataFrame:
    """ Function to process answer sequences for LTM question chains. It checks the chain of subquestion answers and extracts the following attributes:
    > n_others: Number of other people interacted with in the last 10 minutes
        - -1: Number is positive but unknown exactly
        - 0: No people/alone
        - 1: One extra person
        - 2: Two extra people
        - 3: More than two extra people
        - NaN : Can't say anything with enough certainty.
    > inperson:
        - True/False: The interaction in question was/wasn't in person.
        - None: Can't say anything with enough certainty.
    > formal:
        - True/False: The interaction in question was/wasn't formal.
        - None: Can't say anything with enough certainty.

    Chains that stop early (fewer answers than the branch normally has) do not
    raise: the attributes that depend on the missing answers are reported as
    unknown, and n_others becomes -1 when it is known that others were involved.

    Args:
        df (pd.DataFrame): One row per LTM question chain, with a column
            "ans_seq" holding the sequence of numeric answers of that chain,
            starting with the answer to the main question.
    Returns:
        pd.DataFrame: Columns "n_others", "inperson" and "formal", one row per
            chain, indexed like df.
    """
    properties = {"n_others":[],
                  "inperson":[],
                  "formal":[]}
    for ans_seq in df["ans_seq"]:
        n_other, inperson, formal = _classify_answer_chain(ans_seq)
        properties["n_others"].append(n_other)
        properties["inperson"].append(inperson)
        properties["formal"].append(formal)
    return pd.DataFrame(properties,index=df.index)
def correct_activity_qids(df:pd.DataFrame)->pd.DataFrame:
    """Assign each row the question id that matches its question wording.

    The wording found in "esm_instructions" is looked up in the module-level
    qc2id table. A wording that is missing from the table raises KeyError.

    Args:
        df (pd.DataFrame): Dataframe with an "esm_instructions" column. It is
            modified in place.
    Returns:
        pd.DataFrame: Input dataframe with added column "correct_ids"
    """
    instruction_texts = df["esm_instructions"]
    df["correct_ids"] = instruction_texts.apply(qc2id.__getitem__)
    return df
def process_answers_aggregation(df:pd.DataFrame)-> pd.DataFrame:
    """ Function to process the answer sequence of one LTM question chain, meant to be applied to each group of a groupby.
    It checks the chain of subquestion answers (same rules as process_answers()) and extracts the following attributes:
    > n_others: Number of other people interacted with in the last 10 minutes
        - -1: Number is positive but unknown exactly
        - 0: No people/alone
        - 1: One extra person
        - 2: Two extra people
        - 3: More than two extra people
        - NaN : Can't say anything with enough certainty.
    > inperson:
        - True/False: The interaction in question was/wasn't in person.
        - None: Can't say anything with enough certainty.
    > formal:
        - True/False: The interaction in question was/wasn't formal.
        - None: Can't say anything with enough certainty.

    The attributes describe the chain as a whole, so the same values are
    written to every row of the group. Chains that stop early do not raise:
    attributes that depend on missing answers are reported as unknown, and
    n_others becomes -1 when it is known that others were involved.

    Args:
        df (pd.DataFrame): The rows of a single question chain in the order
            the questions were asked; must include "esm_user_answer_numeric".
    Returns:
        pd.DataFrame: df joined with the columns "n_others", "inperson" and
            "formal".
    """
    ans_seq = df["esm_user_answer_numeric"].values

    def answer_at(position):
        # Answers that were never given (unfinished chain) count as unknown.
        return ans_seq[position] if position < len(ans_seq) else None

    n_other = None
    inperson = None
    formal = None

    main_answer = answer_at(0)
    sub_answer = answer_at(1)
    if main_answer == 0:
        # worked alone
        n_other = 0
    elif main_answer == 1:
        # worked with others
        if sub_answer == 3:
            n_other = -1  # answered "Other" but did work with other people
        elif sub_answer == 2:
            n_other = 3  # assuming more than 2 people participated in the lecture or presentation
        elif sub_answer in (0, 1):
            inperson = sub_answer == 0  # 0 means irl interaction, 1 means online or phone
            formality_answer = answer_at(2)
            if formality_answer is not None:
                formal = formality_answer == 0  # 0 means formal
            count_answer = answer_at(3)
            if count_answer is None:
                n_other = -1  # others were involved, but the count was never given
            else:
                n_other = count_answer + 1  # the answer is on [0,2], so add 1 to make it [1,3]
    elif main_answer == 2:
        # break
        formal = False  # assuming one does not have a formal meeting during break time
        if sub_answer == 1:
            n_other = -1
            inperson = True
        # any other type of break tells us nothing for sure
    # main_answer == 3 (transit): we cannot say whether the person was
    # carpooling or driving alone, so everything stays unknown.

    # One value per row of the group: a one-element list only fits
    # single-row groups and makes the DataFrame constructor raise otherwise.
    n_rows = len(df.index)
    properties = {"n_others": [n_other] * n_rows,
                  "inperson": [inperson] * n_rows,
                  "formal": [formal] * n_rows}
    return df.join(pd.DataFrame(properties, index=df.index))
#test stuff
def test():
    """Manual check of process_answers_aggregation() against esm_activities_LTM_features().

    Loads the ESM export of one participant from a hard-coded path, keeps
    questionnaire 97, computes the features with both implementations and
    compares them on the rows of the main LTM question (correct_ids == 44).

    NOTE(review): the result of DataFrame.compare() is neither returned nor
    asserted, so this function only fails if one of the steps raises.
    """
    from esm_preprocess import preprocess_esm,clean_up_esm

    esm = pd.read_csv("data/raw/p031/phone_esm_with_datetime.csv")
    esm = clean_up_esm(preprocess_esm(esm))
    esm = esm[esm["questionnaire_id"]==97]

    # reference result: one pass over the whole frame
    reference = esm_activities_LTM_features(esm)

    # per-segment result: one group per (session, device) pair
    esm["local_segment"] = [
        str(session)+":"+device
        for session, device in esm[["esm_session","device_id"]].values
    ]
    per_segment = esm.groupby("local_segment").apply(process_answers_aggregation)

    # compare with original function results
    expected = reference[reference["correct_ids"]==44][["n_others", "inperson", "formal"]]
    actual = per_segment.loc[expected.index]
    actual.compare(expected,keep_shape=True,keep_equal =True)
#print out ans_seq processing results
# import json
# i = 0
# for j,ans in correct_id_df[["esm_json","esm_user_answer"]].values:
# obj = json.loads(j)
# text = obj["esm_instructions"]
# if ("10 minut" in text):
# print("---\n",test.ans_seq.iloc[i])
# print(test[["n_others","inperson","formal"]].values[i])
# i = i+1
# print(text,ans)
#test()