import re
from collections.abc import Collection

import pandas as pd

from config.models import Participant, Screen
from setup import db_engine, session

# Meaning of the integer codes found in the ``screen_status`` column.
screen_status = {0: "off", 1: "on", 2: "locked", 3: "unlocked"}

# Columns of the frame built by identify_screen_sequence, in output order
# ("duration" is appended afterwards).
_SEQUENCE_COLUMNS = [
    "participant_id",
    "device_id",
    "sequence_type",
    "beginning",
    "end",
]

# Regex patterns implementing the heuristics described in the docstring of
# identify_screen_sequence. Since the matching sequences can overlap, lookahead
# is used. Note that the first event in a sequence isn't part of the group
# caught inside the lookahead. That's because the first event in a sequence is
# also the last event of the previous sequence, so that the time interval
# between the first and the second event in a sequence is actually the time
# the device is not in use.
_UNLOCK_PATTERN = re.compile(r"(?=[02]((13|31)0+2))")
_CHECK_PATTERN = re.compile(r"(?=[02](10+2?))")


def get_screen_data(usernames: Collection) -> pd.DataFrame:
    """
    Read the data from the screen table and return it in a dataframe.

    Parameters
    ----------
    usernames: Collection
        A list of usernames to put into the WHERE condition.

    Returns
    -------
    df_screen: pd.DataFrame
        A dataframe of screen data: the columns selected for ``Screen`` plus
        the matching ``Participant.username``.
    """
    # Join Screen to Participant through the participant id and keep only the
    # rows that belong to the requested usernames.
    query_screen = (
        session.query(Screen, Participant.username)
        .filter(Participant.id == Screen.participant_id)
        .filter(Participant.username.in_(usernames))
    )
    with db_engine.connect() as connection:
        df_screen = pd.read_sql(query_screen.statement, connection)
    return df_screen


def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
    """
    Identify interesting sequences (unlock, status check) and return them in a dataframe.

    The input frame is not modified; a sorted copy is used internally.

    Parameters
    ----------
    df_screen: pd.DataFrame
        A dataframe containing screen data. The columns ``device_id``,
        ``participant_id``, ``timestamp`` and ``screen_status`` are read.

    Returns
    -------
    df_sequences: pd.DataFrame
        A dataframe containing information on screen sequences. If no sequence
        is found (or the input is empty), an empty dataframe with the same
        columns is returned.

        Columns:
        * participant_id
        * device_id
        * sequence_type: unlock/check
        * beginning: beginning of unlock/check in milliseconds since 1970
        * end: end of unlock/check in milliseconds since 1970
        * duration

    Heuristics
    ----------
    1) In the category of unlock sequences, the following sequences were counted:
        i) 0130(0...)2
            This is the paradigmatic case. It is allowed for the screen
            status 0 (off) to be reported multiple times in a row.
        ii) 21302
            If the previous sequence has ended with the screen status 2
            (e.g. unlock), the unlock sequence does not start with a 0 but
            rather with a 2.
        iii) (0|2)3102
            It is allowed for the order of 3 and 1 to be reversed. If the
            device is unlocked e.g. with a fingerprint reader, it can happen
            that the unlock precedes the ON status.
    2) In the category of screen-check sequences, the following sequences
       were counted:
        i) 010
            The base case.
        ii) 210
            Refer to point 1) ii).
    3) Special cases:
        i) (2|0)102
            The occurrence of two consecutive "locked" events with no
            intermediate "unlocked" event is an inconsistency, however due to
            its frequency it has to be dealt with in some way. Since the time
            interval between the last two events of this sequence is commonly
            very short (around 30 ms), the 2 at the end should be interpreted
            as part of the screen-check sequence.
        ii) (2|0)130102
            This sequence is interpreted as a nested screen-check sequence
            (010) inside an unlock sequence ((2|0)1302).
            NOTE(review): with the current patterns only the inner check
            ("102") is matched for this input; the unlock pattern requires the
            run of 0s to be followed directly by a 2. Confirm whether the outer
            unlock is meant to be reported as well.
    """
    if df_screen.empty:
        # Nothing to analyse: return an empty frame with the documented schema.
        return pd.DataFrame(columns=_SEQUENCE_COLUMNS + ["duration"])

    # Sort a copy so that the caller's dataframe is left untouched.
    df_sorted = df_screen.sort_values(["device_id", "timestamp"])
    groups = df_sorted.groupby("device_id")

    # Create a df containing, for each device, a string representing the
    # sequence of screen events in chronological order,
    # e.g. "01301302130202130..."
    df_screen_sequences = (
        groups["screen_status"]
        .apply(list)
        .apply(lambda statuses: "".join(str(status) for status in statuses))
        .to_frame()
        .reset_index()
    )

    # Create a df containing, for each device, a list of timestamps of screen
    # events in chronological order, e.g. [1581933295955, 1581933741144, ...]
    # Position i in this list corresponds to character i of the status string.
    df_timestamps = groups["timestamp"].apply(list).to_frame().reset_index()

    # Create a df containing information to which participant the devices belong
    df_participants = (
        df_sorted[["device_id", "participant_id"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    df_merged = df_screen_sequences.merge(df_timestamps, on="device_id").merge(
        df_participants, on="device_id"
    )

    # For every device, run both patterns over its status string. Each match
    # becomes one output row; group 1 of the match spans the events that belong
    # to the sequence, so its start/end offsets index into the timestamp list.
    patterns = (("unlock", _UNLOCK_PATTERN), ("check", _CHECK_PATTERN))
    rows_list = []
    for row in df_merged.itertuples(index=False):
        for sequence_type, pattern in patterns:
            for match in pattern.finditer(row.screen_status):
                rows_list.append(
                    {
                        "participant_id": row.participant_id,
                        "device_id": row.device_id,
                        "sequence_type": sequence_type,
                        "beginning": row.timestamp[match.start(1)],
                        # match.end(1) is exclusive, hence the -1.
                        "end": row.timestamp[match.end(1) - 1],
                    }
                )

    # Passing the columns explicitly keeps the schema stable even when no
    # sequence was matched at all.
    df_sequences = pd.DataFrame(rows_list, columns=_SEQUENCE_COLUMNS)
    df_sequences["duration"] = df_sequences["end"] - df_sequences["beginning"]
    return df_sequences


def time_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
    """
    Placeholder for time statistics on screen sequences.

    Not implemented yet: the function currently does nothing and returns None.

    Parameters
    ----------
    df_screen: pd.DataFrame
        A dataframe containing screen data (currently unused).
    """
    # TODO Use the results of identify_screen_sequence to calculate time
    # statistics related to transitions.
    # For example, from the two main sequences outlined above, the time of
    # "real" phone usage can be calculated, i.e. how long the screen was
    # unlocked.
    # Another example might be the average time between screen unlocks and/or
    # screen status checks.
    pass