rapids/src/features/phone_esm/straw/esm_activities.py

import pandas as pd
import numpy as np

# Question id -> every known wording of that question.
# Each entry lists the English, Dutch and Slovenian text, in that order.
id2qc = {
    44: [
        "What have you mainly been doing within the last 10 minutes?",
        "Waar ben je voornamelijk mee bezig geweest de laatste 10 minuten?",
        "Kaj ste v glavnem počeli v zadnjih 10 minutah?",
    ],
    45: [
        "What type of individual work?",
        "Wat voor soort individueel werk?",
        "Kakšno vrsto samostojnega dela ste opravljali?",
    ],
    46: [
        "How did you work with others?",
        "Hoe heb je met anderen gewerkt?",
        "Kako ste sodelovali z drugimi?",
    ],
    47: [
        "What type of break?",
        "Wat voor soort pauze?",
        "Kakšno vrsto odmora ste imeli?",
    ],
    48: [
        "Where did you travel between?",
        "Waar heb je tussen gereisd?",
        "Kam ste potovali?",
    ],
    49: [
        "Did you use a computer or phone for that?",
        "Heb je daarvoor een computer of telefoon gebruikt?",
        "Ste za to uporabljali računalnik ali telefon?",
    ],
    50: [
        "What kind of an interaction was that?",
        "Wat voor interactie was dat?",
        "Kakšne vrste sodelovanja je bilo to?",
    ],
    51: [
        "How many people were involved besides yourself?",
        "Hoeveel mensen waren er behalve jezelf betrokken?",
        "Koliko oseb je bilo poleg vas še vpletenih?",
    ],
    # 52:["What have you mainly been doing within the last 10 minutes?",
    #     "Waar ben je voornamelijk mee bezig geweest de laatste 10 minuten?",
    #     "Kaj ste v glavnem počeli v zadnjih 10 minutah?"]
}

# Inverse lookup: question wording (in any of the languages above) -> question id.
qc2id = dict(
    (wording, question_id)
    for question_id, wordings in id2qc.items()
    for wording in wordings
)

# Question id -> ids of the questions that may follow it in an LTM chain.
# NOTE(review): the repeated entries (e.g. [49, 49]) suggest each list is
# indexed by the numeric answer to the question - confirm with the ESM
# definition; no code visible here reads this table.
next_questions = {
    44: [45, 46, 47, 48],
    45: [49, 49],
    46: [50, 50],
    47: [],
    48: [],
    49: [],
    50: [51, 51],
    51: [],
    # 52: [45, 46, 47, 48],
}

def esm_activities_LTM_features(
    df_esm_activities_cleaned: pd.DataFrame,
) -> pd.DataFrame:
    """ Function for calculating LTM (Last 10 minutes) features of questionnaire answers.

        The question ids are first corrected from the question text by
        correct_activity_qids(). The rows are then split into question chains:
        every row whose corrected id is 44 (the main LTM question) opens a new
        chain, and the rows after it, up to the next id-44 row, belong to that
        chain. The numeric answers of each chain are handed to
        process_answers(), which derives the social properties.

    Parameters
    ----------
    df_esm_activities_cleaned: pd.DataFrame
        A cleaned up dataframe, which must include esm_instructions and
        esm_user_answer_numeric. Rows are expected in answering order, i.e.
        sub-question rows directly follow their main question row.
        The frame returned by correct_activity_qids() gets a "q_group" column
        added here; in this file that is the input frame itself, so the input
        is modified in place.

    Returns
    -------
    pd.DataFrame
        The input rows with these additional columns:
            ["correct_ids"] - Corrected question_ids.
            ["q_group"] - 0-based number of the chain the row belongs to;
                -1 for rows that appear before the first main question.
            ["n_others","inperson","formal"] - Properties of known potential
                social encounters as given by process_answers(). They are
                filled in on the main question (id 44) row of each chain;
                all other rows hold missing values.
    """
    correct_id_df = correct_activity_qids(df_esm_activities_cleaned)

    # A chain starts at every main question; a running count of main questions
    # therefore numbers the chains (rows before the first one end up at -1).
    is_main_q = correct_id_df["correct_ids"] == 44
    main_q_indices = correct_id_df.index[is_main_q]
    correct_id_df["q_group"] = (is_main_q.astype(int).cumsum() - 1).values

    # Rows that precede the first main question belong to no chain. Leaving
    # them in would create an extra group and break the one-to-one pairing
    # between chains and main-question index labels below.
    chained_rows = correct_id_df[correct_id_df["q_group"] >= 0]
    ans_seq = (
        chained_rows.groupby("q_group")
        .agg({"esm_user_answer_numeric": list})
        .rename(columns={"esm_user_answer_numeric": "ans_seq"})
    )
    # Groups come out ordered by chain number, which is the order in which the
    # main questions appear, so the labels can be assigned positionally.
    ans_seq.index = main_q_indices

    # Properties per chain: socializing (none / in person / online / unknown)
    # and number of people involved.
    processed_ans_df = process_answers(ans_seq)
    return correct_id_df.join(processed_ans_df)


"""
possible answer sequences for LTM question chains

#alone
0,0,0 not social
0,0,1 not social
0,1,0 not social
0,1,1 not social
0,2 not social
0,3 not social
0,4 not social
0,5 not social
0,6 not social
#w/ others
1,0,0,0 1 irl
1,0,0,1 2 irl
1,0,0,2 3+ irl
1,0,1,0 1 irl
1,0,1,1 2 irl
1,0,1,2 3+ irl
1,1,0,0 1 online
1,1,0,1 2 online
1,1,0,2 3+ online
1,1,1,0 1 online
1,1,1,1 2 online
1,1,1,2 3+ online
1,2 positive likely to be more than 2
1,3 positive
#break
2,0 ambiguous
2,1 positive irl
2,2 ambiguous
2,3 ambiguous
#transit
3,0 ambiguous
3,1 ambiguous
3,2 ambiguous
"""


#TODO: docstring
def process_answers(df:pd.DataFrame)-> pd.DataFrame:
    """ Function to process answer sequences for LTM question chains. It checks the chain of subquestion answers and extracts the following attributes:
        > n_others: Number of other people interacted with in the last 10 minutes
            - -1: Number is positive but unknown exactly
            - 0: No people/alone
            - 1: One extra person
            - 2: Two extra people
            - 3: More than two extra people
            - None/NaN : Can't say anything with enough certainty.
        > inperson:
            - True/False: The interaction in question was/wasn't in person.
            - None: Can't say anything with enough certainty.
        > formal:
            - True/False: The interaction in question was/wasn't formal.
            - None: Can't say anything with enough certainty.

        Chains may be incomplete (fewer answers than the full chain, or no
        answers at all). A property whose deciding answer is missing is left
        as None instead of raising an IndexError.

    Args:
        df (pd.DataFrame): Frame with an "ans_seq" column; each cell is the
            sequence of numeric answers of one question chain, main question
            first.

    Returns:
        pd.DataFrame: Columns "n_others", "inperson" and "formal", one row per
            input row, carrying the same index as df.
    """
    classified = [_classify_ltm_chain(ans_seq) for ans_seq in df["ans_seq"]]
    properties = {"n_others": [row[0] for row in classified],
                  "inperson": [row[1] for row in classified],
                  "formal": [row[2] for row in classified]}
    return pd.DataFrame(properties,index=df.index)


def _ltm_answer_at(ans_seq, position):
    """Return the answer at position, or None when the chain is too short."""
    return ans_seq[position] if position < len(ans_seq) else None


def _classify_ltm_chain(ans_seq):
    """Map one chain of answers to the tuple (n_other, inperson, formal)."""
    n_other = None
    inperson = None
    formal = None
    main_answer = _ltm_answer_at(ans_seq, 0)
    follow_up = _ltm_answer_at(ans_seq, 1)
    if main_answer == 0:
        n_other = 0
    elif main_answer == 1:
        if follow_up == 3:
            n_other = -1  # answered "Other" but did work with other people
        elif follow_up == 2:
            n_other = 3  # assuming more than 2 people participated in the lecture or presentation
        elif follow_up in (0, 1):
            inperson = follow_up == 0  # 0 means in-person interaction, 1 means online or phone
            interaction_kind = _ltm_answer_at(ans_seq, 2)
            if interaction_kind is not None:
                formal = interaction_kind == 0  # 0 means formal
            people_answer = _ltm_answer_at(ans_seq, 3)
            if people_answer is not None:
                n_other = people_answer + 1  # answer is on [0,2], shifted to [1,3]
    elif main_answer == 2:
        formal = False  # assuming one does not have a formal meeting during break time
        if follow_up == 1:
            n_other = -1
            inperson = True
        # any other break answer tells us nothing for sure
    # main_answer == 3 (travel): we can't say whether the person was
    # carpooling or driving alone, so everything stays None.
    return n_other, inperson, formal

def correct_activity_qids(df:pd.DataFrame)->pd.DataFrame:
    """Attach the question id of every row, looked up from its question text.

    Args:
        df (pd.DataFrame): Frame with an "esm_instructions" column holding the
            question text. The frame is modified in place.

    Returns:
        pd.DataFrame: Input dataframe with added column "correct_ids"

    Raises:
        KeyError: If a question text is not a key of qc2id.
    """
    def _lookup_id(question_text):
        return qc2id[question_text]

    question_texts = df["esm_instructions"]
    df["correct_ids"] = question_texts.apply(_lookup_id)
    return df


def process_answers_aggregation(df:pd.DataFrame)-> pd.DataFrame:
    """ Function to process the answer sequence of a single LTM question chain. It checks the chain of subquestion answers and extracts the following attributes:
        > n_others: Number of other people interacted with in the last 10 minutes
            - -1: Number is positive but unknown exactly
            - 0: No people/alone
            - 1: One extra person
            - 2: Two extra people
            - 3: More than two extra people
            - None/NaN : Can't say anything with enough certainty.
        > inperson:
            - True/False: The interaction in question was/wasn't in person.
            - None: Can't say anything with enough certainty.
        > formal:
            - True/False: The interaction in question was/wasn't formal.
            - None: Can't say anything with enough certainty.

        The function is written to be used with groupby(...).apply(), which
        calls it with the rows of one group (one chain) as a DataFrame.

        The three properties describe the chain as a whole, so they are
        attached to the first row of the group (the main question); the
        remaining rows of the group hold missing values. A property whose
        deciding answer is missing (incomplete chain) is left as None.

    Args:
        df (pd.DataFrame): Rows of one question chain in answering order, with
            an "esm_user_answer_numeric" column.

    Returns:
        pd.DataFrame: df joined with the columns "n_others", "inperson" and
            "formal".
    """
    ans_seq = df["esm_user_answer_numeric"].values

    def answer_at(position):
        # Chains can be shorter than the full question chain.
        return ans_seq[position] if position < len(ans_seq) else None

    n_other = None
    inperson = None
    formal = None
    main_answer = answer_at(0)
    follow_up = answer_at(1)
    if main_answer == 0:
        n_other = 0
    elif main_answer == 1:
        if follow_up == 3:
            n_other = -1  # answered "Other" but did work with other people
        elif follow_up == 2:
            n_other = 3  # assuming more than 2 people participated in the lecture or presentation
        elif follow_up in (0, 1):
            inperson = follow_up == 0  # 0 means in-person interaction, 1 means online or phone
            interaction_kind = answer_at(2)
            if interaction_kind is not None:
                formal = interaction_kind == 0  # 0 means formal
            people_answer = answer_at(3)
            if people_answer is not None:
                n_other = people_answer + 1  # answer is on [0,2], shifted to [1,3]
    elif main_answer == 2:
        formal = False  # assuming one does not have a formal meeting during break time
        if follow_up == 1:
            n_other = -1
            inperson = True
        # any other break answer tells us nothing for sure
    # main_answer == 3 (travel): we can't say whether the person was
    # carpooling or driving alone, so everything stays None.

    # One value per property must be paired with exactly one index label;
    # pairing it with the whole group index fails for multi-row groups.
    anchor = df.index[:1]
    properties = {"n_others": [n_other][:len(anchor)],
                  "inperson": [inperson][:len(anchor)],
                  "formal": [formal][:len(anchor)]}
    return df.join(pd.DataFrame(properties, index=anchor))


#test stuff
def test():
    """Manually compare process_answers_aggregation() with esm_activities_LTM_features().

    Loads one participant's ESM export from a hard-coded relative path, keeps
    the rows of questionnaire 97, computes the social properties with both
    implementations and compares them on the main question (id 44) rows.

    Returns:
        pd.DataFrame: Result of DataFrame.compare() between the per-segment
            implementation and the original one, restricted to the columns
            "n_others", "inperson" and "formal".
    """
    from esm_preprocess import preprocess_esm,clean_up_esm

    property_columns = ["n_others", "inperson", "formal"]

    df = pd.read_csv("data/raw/p031/phone_esm_with_datetime.csv")
    df = preprocess_esm(df)
    df = clean_up_esm(df)
    # Copy the filtered rows: both feature functions add columns in place.
    df = df[df["questionnaire_id"]==97].copy()
    original = esm_activities_LTM_features(df)
    df["local_segment"] = [f"{session}:{device}" for session, device in df[["esm_session","device_id"]].values]
    # group_keys=False keeps the original row labels on the applied result,
    # so it can be looked up with the labels of the original implementation.
    per_segment = df.groupby("local_segment", group_keys=False).apply(process_answers_aggregation)

    # compare with original function results; compare() requires both frames
    # to carry identical row and column labels, hence the column selection.
    selection = original.loc[original["correct_ids"]==44, property_columns]
    temp_selection = per_segment.loc[selection.index, property_columns]
    return temp_selection.compare(selection, keep_shape=True, keep_equal=True)

#test()