defined func identify_screen_sequences

2021-09-10 16:47:37 +02:00 · 2021-09-10 16:47:37 +02:00 · cf7e692927
parent e2e268148d
commit cf7e692927
4 changed files with 826 additions and 13 deletions
--- a/exploration/expl_screen.py
+++ b/exploration/expl_screen.py
@ -6,7 +6,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.11.2
+#       jupytext_version: 1.11.5
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
@ -38,6 +38,12 @@ print(df_screen_nokia)
 participants_inactive_usernames = participants.query_db.get_usernames()
 df_screen_inactive = get_screen_data(participants_inactive_usernames)
 # %%
 df_screen_inactive.head(60)
 # %%
 df_screen_inactive.to_csv(r'/home/ivan/IJS/logs/screen_data_offline.csv')
 # %%
 df_screen_inactive["screen_status"] = (
    df_screen_inactive["screen_status"]
--- a/features/screen.py
+++ b/features/screen.py
@ -1,6 +1,7 @@
 from collections.abc import Collection
 import pandas as pd
 import re
 from config.models import Participant, Screen
 from setup import db_engine, session
@ -33,17 +34,139 @@ def get_screen_data(usernames: Collection) -> pd.DataFrame:
 def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
     """
     Identify interesting sequences (unlock, status check) and return them in a dataframe.

     The input dataframe is not modified; a sorted copy is used internally.

     Parameters
     ----------
     df_screen: pd.DataFrame
         A dataframe containing screen data. The columns read here are
         participant_id, device_id, timestamp and screen_status.
         Each screen_status value must stringify to a single character (e.g. the
         integers 0-3), because positions in the per-device status string are used
         directly as indices into the per-device list of timestamps.

     Returns
     -------
     df_sequences: pd.DataFrame
         A dataframe containing information on screen sequences
         Columns:
             * participant_id
             * device_id
             * sequence_type: unlock/check
             * beginning: beginning of unlock/check in milliseconds since 1970
             * end: end of unlock/check in milliseconds since 1970
             * duration: end - beginning
         If no sequence is found (or the input is empty), an empty dataframe with
         these columns is returned.

     Heuristics
     ----------
     1) In the category of unlock sequences, the following sequences were counted:
         i) 0130(0...)2
             This is the paradigmatic case. It is allowed for the screen status 0 (off)
             to be reported multiple times in a row.
         ii) 21302
             If the previous sequence has ended with the screen status 2 (e.g. unlock),
             the unlock sequence does not start with a 0 but rather with a 2.
         iii) (0|2)3102
             It is allowed for the order of 3 and 1 to be reversed. If the device is
             unlocked e.g. with a fingerprint reader, it can happen that the unlock
             precedes the ON status.
     2) In the category of screen-check sequences, the following sequences were counted:
         i) 010
             The base case.
         ii) 210
             Refer to point 1) ii).
     3) Special cases:
         i) (2|0)102
             The occurrence of two consecutive "locked" events with no intermediate
             "unlocked" event is an inconsistency, however due to its frequency it has
             to be dealt with in some way. Since the time interval between the last two
             events of this sequence is commonly very short (around 30ms), the 2 at the
             end should be interpreted as part of the screen-check sequence.
         ii) (2|0)130102
             This sequence is intended to be interpreted as a nested screen-check
             sequence (010) inside an unlock sequence ((2|0)1302).
             NOTE(review): the unlock pattern below requires the closing 2 to follow
             the run of 0s directly, so for this input only the inner check (102) is
             reported and the enclosing unlock is not. Confirm the intended behaviour.
     """
     base_columns = ["participant_id", "device_id", "sequence_type", "beginning", "end"]
     output_columns = base_columns + ["duration"]

     # Nothing to group: return a correctly shaped, empty result.
     if df_screen.empty:
         return pd.DataFrame(columns=output_columns)

     # Sort a copy so that the caller's dataframe keeps its original row order.
     df_sorted = df_screen.sort_values(["device_id", "timestamp"])
     groups = df_sorted.groupby("device_id")

     # Create a df containing, for each device, a string representing the sequence of
     # screen events in chronological order, e.g. "01301302130202130..."
     df_screen_sequences = (
         groups["screen_status"]
         .apply(list)
         .apply(lambda statuses: "".join(str(status) for status in statuses))
         .to_frame()
         .reset_index()
     )

     # Create a df containing, for each device, a list of timestamps of screen events
     # in chronological order, e.g. [1581933295955, 1581933741144, ...]
     df_timestamps = groups["timestamp"].apply(list).to_frame().reset_index()

     # Create a df containing information to which participant the devices belong
     df_participants = (
         df_sorted[["device_id", "participant_id"]]
         .drop_duplicates()
         .reset_index(drop=True)
     )

     df_merged = df_screen_sequences.merge(df_timestamps, on="device_id").merge(
         df_participants, on="device_id"
     )

     # Regex patterns implementing the heuristics described in the docstring.
     # Since the matching sequences can overlap, lookahead is used. Note that
     # the first event in a sequence isn't part of the group caught inside the
     # lookahead. That's because the first event in a sequence is also the last
     # event of the previous sequence, so that the time interval between the first
     # and the second event in a sequence is actually the time the device is not in use.
     unlock_pat = re.compile(r"(?=[02]((13|31)0+2))")
     check_pat = re.compile(r"(?=[02](10+2?))")
     patterns = (("unlock", unlock_pat), ("check", check_pat))

     # For each device, collect all unlock matches first and then all check matches.
     # Group 1 of a match spans the events of the sequence; its first and last
     # positions index into the device's timestamp list.
     rows_list = []
     for _, row in df_merged.iterrows():
         status_string = row["screen_status"]
         timestamps = row["timestamp"]
         for sequence_type, pattern in patterns:
             for match in pattern.finditer(status_string):
                 rows_list.append(
                     {
                         "participant_id": row["participant_id"],
                         "device_id": row["device_id"],
                         "sequence_type": sequence_type,
                         "beginning": timestamps[match.start(1)],
                         "end": timestamps[match.end(1) - 1],
                     }
                 )

     # Without any match there are no columns to compute the duration from.
     if not rows_list:
         return pd.DataFrame(columns=output_columns)

     df_sequences = pd.DataFrame(rows_list, columns=base_columns)
     df_sequences["duration"] = df_sequences["end"] - df_sequences["beginning"]
     return df_sequences
 def time_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
--- a/test/test_screen_sequences.ipnyb
+++ b/test/test_screen_sequences.ipnyb