defined func identify_screen_sequences

2021-09-10 16:47:37 +02:00 · 2021-09-10 16:47:37 +02:00 · cf7e692927
parent e2e268148d
commit cf7e692927
4 changed files with 826 additions and 13 deletions
--- a/config/environment.yml
+++ b/config/environment.yml
@ -19,4 +19,4 @@ dependencies:
  - scikit-learn
  - sqlalchemy
  - statsmodels
-  - tabulate
+  - tabulate
--- a/exploration/expl_screen.py
+++ b/exploration/expl_screen.py
@ -6,7 +6,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.11.2
+#       jupytext_version: 1.11.5
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
@ -38,6 +38,12 @@ print(df_screen_nokia)
 participants_inactive_usernames = participants.query_db.get_usernames()
 df_screen_inactive = get_screen_data(participants_inactive_usernames)

+# %%
+df_screen_inactive.head(60)
+
+# %%
+df_screen_inactive.to_csv(r'/home/ivan/IJS/logs/screen_data_offline.csv')
+
 # %%
 df_screen_inactive["screen_status"] = (
    df_screen_inactive["screen_status"]
--- a/features/screen.py
+++ b/features/screen.py
@ -1,6 +1,7 @@
 from collections.abc import Collection

 import pandas as pd
+import re

 from config.models import Participant, Screen
 from setup import db_engine, session
@ -33,17 +34,139 @@ def get_screen_data(usernames: Collection) -> pd.DataFrame:


 def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
-    # TODO Implement a method that identifies "interesting" sequences of screen statuses.
-    # The main one are:
-    # - OFF -> ON -> unlocked (a true phone unlock)
-    # - OFF -> ON -> OFF/locked (no unlocking, i.e. a screen status check)
-    # Consider that screen data is sometimes unreliable as shown in expl_screen.ipynb:
-    # "I have also seen
-    # off -> on -> unlocked (with 2 - locked missing)
-    # and
-    # off -> locked -> on -> off -> locked (*again*)."
-    # Either clean the data beforehand or deal with these inconsistencies in this function.
-    pass
+    """
+    Identify interesting sequences (unlock, status check) and return them in a dataframe.
+
+    Parameters
+    ----------
+    df_screen: pd.DataFrame
+        A dataframe containing screen data
+        
+    Returns
+    -------
+    df_sequences: pd.DataFrame
+        A dataframe containing information on screen sequences
+
+        Columns:
+            * participant_id
+            * device_id
+            * sequence_type: unlock/check
+            * beginning: beginning of unlock/check in milliseconds since 1970
+            * end: end of unlock/check in milliseconds since 1970
+            * duration
+
+    Heuristics
+    ----------
+    1) In the category of unlock sequences, the following sequences were counted:
+        i) 0130(0...)2
+            This is the paradigmatic case. It is allowed for the screen status 0 (off)
+            to be reported multiple times in a row.
+        ii) 21302
+            If the previous sequence has ended with the screen status 2 (e.g. unlock),
+            the unlock sequence does not start with a 0 but rather with a 2.
+        iii) (0|2)3102
+            It is allowed for the order of 3 and 1 to be reversed. If the device is
+            unlocked e.g. with a fingerprint-reader, it can happen that the unlock
+            precedes the ON status.
+    2) In the category of screen-check sequences, the following sequences were counted:
+        i) 010
+            The base case.
+        ii) 210
+            Refer to point 1) ii).
+    3) Special cases:
+        i) (2|0)102
+            The occurrence of two consecutive "locked" events with no intermediate "unlocked" event
+            is an inconsistency, however due to its frequency it has to be dealt with in some way.
+            Since the time interval between the last two events of this sequence is commonly very
+            short (around 30ms), the 2 at the end should be interpreted as part of the screen-check
+            sequence.
+        ii) (2|0)130102
+            This sequence is interpreted as a nested screen-check sequence (010) inside
+            an unlock sequence ((2|0)1302).
+    """
+
+    df_screen.sort_values(["device_id", "timestamp"], inplace=True)
+
+    groups = df_screen.groupby("device_id")
+
+    # Create a df containing, for each device, a string representing the sequence of
+    # screen events in chronological order, e.g. "01301302130202130..."
+    
+    df_screen_sequences = (
+        groups["screen_status"]
+        .apply(list)
+        .apply(lambda list_: "".join([str(x) for x in list_]))
+        .to_frame()
+        .reset_index()
+    )
+
+    # Create a df containing, for each device, a list of timestamps of screen events
+    # in chronological order, e.g. [1581933295955, 1581933741144, ...]
+
+    df_timestamps = (
+        groups["timestamp"]
+        .apply(list)
+        .to_frame()
+        .reset_index()
+    )
+
+    # Create a df recording which participant each device belongs to
+
+    df_participants = (
+        df_screen[["device_id", "participant_id"]]
+        .drop_duplicates()
+        .reset_index()
+        .drop("index", 1)
+    )
+
+    df_merged = (
+        df_screen_sequences.merge(df_timestamps, on="device_id")
+        .merge(df_participants, on="device_id")
+    )
+
+    # Regex patterns implementing the heuristics described in the docstring.
+    # Since the matching sequences can overlap, lookahead is used. Note that
+    # the first event in a sequence isn't part of the group caught inside the
+    # lookahead. That's because the first event in a sequence is also the last
+    # event of the previous sequence, so that the time interval between the first
+    # and the second event in a sequence is actually the time the device is not in use.
+
+    unlock_pat = re.compile(r"(?=[0,2]((13|31)0+2))")
+    check_pat = re.compile(r"(?=[0,2](10+2?))")
+
+    # Iterate over rows of the merged df and then for each row iterate over
+    # regex matches. For each match, create a dictionary containing information
+    # on the matched sequence and append it to the list of rows. Lastly, create
+    # a new dataframe from the list of rows and return it.
+
+    rows_list = list()
+    for index, row in df_merged.iterrows():
+        for match in unlock_pat.finditer(row["screen_status"]):
+            beginning = row["timestamp"][match.start(1)]
+            end = row["timestamp"][match.end(1) - 1]
+            new_row_dict = {
+                "participant_id": row["participant_id"],
+                "device_id": row["device_id"],
+                "sequence_type": "unlock",
+                "beginning": beginning,
+                "end": end
+            }
+            rows_list.append(new_row_dict)
+        for match in check_pat.finditer(row["screen_status"]):
+            beginning = row["timestamp"][match.start(1)]
+            end = row["timestamp"][match.end(1) - 1]
+            new_row_dict = {
+                "participant_id": row["participant_id"],
+                "device_id": row["device_id"],
+                "sequence_type": "check",
+                "beginning": beginning,
+                "end": end
+            }
+            rows_list.append(new_row_dict)
+    df_sequences = pd.DataFrame(rows_list)
+    df_sequences["duration"] = df_sequences["end"] - df_sequences["beginning"]
+
+    return df_sequences


 def time_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
--- a/test/test_screen_sequences.ipnyb
+++ b/test/test_screen_sequences.ipnyb