stress_at_work_analysis/features/screen.py

from collections.abc import Collection

import pandas as pd
import re

from config.models import Participant, Screen
from setup import db_engine, session

# Human-readable labels for the numeric screen status codes.
# The single-digit codes are the ones the sequence heuristics in
# identify_screen_sequence are written in terms of (e.g. "01302").
# NOTE(review): presumably these are the values stored in the `screen_status`
# column of the screen table - confirm against the data source.
screen_status = {0: "off", 1: "on", 2: "locked", 3: "unlocked"}


def get_screen_data(usernames: Collection) -> pd.DataFrame:
    """
    Load the screen events of the given participants into a dataframe.

    The screen table is joined to the participant table through the
    participant id, and only rows belonging to the requested usernames
    are kept.

    Parameters
    ----------
    usernames: Collection
        Usernames of the participants whose screen data should be read.

    Returns
    -------
    df_screen: pd.DataFrame
        The selected screen rows together with the participant's username.
    """
    # Join condition between the two tables.
    belongs_to_participant = Participant.id == Screen.participant_id
    # Restrict the result to the requested participants.
    is_requested_user = Participant.username.in_(usernames)

    screen_query = session.query(Screen, Participant.username)
    screen_query = screen_query.filter(belongs_to_participant)
    screen_query = screen_query.filter(is_requested_user)

    with db_engine.connect() as conn:
        return pd.read_sql(screen_query.statement, conn)


def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
    """
    Identify interesting sequences (unlock, status check) and return them in a dataframe.

    The input dataframe is left unmodified; sorting is done on a copy.

    Parameters
    ----------
    df_screen: pd.DataFrame
        A dataframe containing screen data. The columns device_id,
        participant_id, timestamp and screen_status are read.

    Returns
    -------
    df_sequences: pd.DataFrame
        A dataframe containing information on screen sequences.
        If no sequence is found, an empty dataframe with the columns
        below is returned.

        Columns:
            * participant_id
            * device_id
            * sequence_type: unlock/check
            * beginning: timestamp of the first event of the unlock/check
              (milliseconds since 1970, according to the original author)
            * end: timestamp of the last event of the unlock/check
            * duration: end - beginning

    Heuristics
    ----------
    1) In the category of unlock sequences, the following sequences were counted:
        i) 0130(0...)2
            This is the paradigmatic case. It is allowed for the screen status 0 (off)
            to be reported multiple times in a row.
        ii) 21302
            If the previous sequence has ended with the screen status 2 (e.g. unlock),
            the unlock sequence does not start with a 0 but rather with a 2.
        iii) (0|2)3102
            It is allowed for the order of 3 and 1 to be reversed. If the device is
            unlocked e.g. with a fingerprint reader, it can happen that the unlock
            precedes the ON status.
    2) In the category of screen-check sequences, the following sequences were counted:
        i) 010
            The base case.
        ii) 210
            Refer to point 1) ii).
    3) Special cases:
        i) (2|0)102
            The occurrence of two consecutive "locked" events with no intermediate "unlocked"
            event is an inconsistency, however due to its frequency it has to be dealt with in
            some way. Since the time interval between the last two events of this sequence is
            commonly very short (around 30ms), the 2 at the end should be interpreted as part
            of the screen-check sequence.
        ii) (2|0)130102
            This sequence is interpreted as a nested screen-check sequence (010) inside
            an unlock sequence ((2|0)1302).
            NOTE(review): with the patterns below only the nested check (102) is
            reported for this input; the enclosing unlock is not matched because
            the unlock pattern does not allow "10" between the zeros and the
            final 2. Confirm whether that is intended.
    """
    columns = [
        "participant_id",
        "device_id",
        "sequence_type",
        "beginning",
        "end",
        "duration",
    ]

    # Regex patterns implementing the heuristics described in the docstring.
    # Since the matching sequences can overlap, lookahead is used. Note that
    # the first event in a sequence isn't part of the group caught inside the
    # lookahead. That's because the first event in a sequence is also the last
    # event of the previous sequence, so that the time interval between the first
    # and the second event in a sequence is actually the time the device is not in use.
    patterns = (
        ("unlock", re.compile(r"(?=[02]((13|31)0+2))")),
        ("check", re.compile(r"(?=[02](10+2?))")),
    )

    # Sort a copy so that the caller's dataframe keeps its original row order.
    df_sorted = df_screen.sort_values(["device_id", "timestamp"])

    rows_list = []
    for device_id, df_device in df_sorted.groupby("device_id"):
        # String of screen events in chronological order, e.g. "0130130213020...".
        # Character i of the string corresponds to timestamps[i]; this relies on
        # every status being rendered as exactly one character.
        statuses = "".join(str(status) for status in df_device["screen_status"])
        timestamps = df_device["timestamp"].tolist()

        # A device normally belongs to one participant; if several are recorded,
        # the sequences are reported once per participant.
        participant_ids = df_device["participant_id"].drop_duplicates().tolist()

        for participant_id in participant_ids:
            for sequence_type, pattern in patterns:
                for match in pattern.finditer(statuses):
                    # Group 1 spans the events of the sequence itself, without
                    # the leading event shared with the previous sequence.
                    beginning = timestamps[match.start(1)]
                    end = timestamps[match.end(1) - 1]
                    rows_list.append(
                        {
                            "participant_id": participant_id,
                            "device_id": device_id,
                            "sequence_type": sequence_type,
                            "beginning": beginning,
                            "end": end,
                            "duration": end - beginning,
                        }
                    )

    # Passing the columns explicitly keeps the schema stable when nothing matched.
    return pd.DataFrame(rows_list, columns=columns)


def time_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
    """
    Placeholder for time statistics on screen sequences.

    Not implemented yet: the argument is ignored and None is returned.
    """
    # TODO: Build on the output of identify_screen_sequence to derive time
    #   statistics related to transitions, for example:
    #   * the time of "real" phone usage, i.e. how long the screen was unlocked,
    #     computed from the two main sequence types (unlock and check);
    #   * the average time between screen unlocks and/or screen status checks.
    return None