Ignore existing data.

partially resolved the grouping issue
implemented time_screen_sequence
2023-02-10 11:04:46 +01:00 · 2021-09-22 16:34:33 +02:00 · 2021-09-13 11:47:19 +02:00 · 2021-09-10 16:47:37 +02:00
7 changed files with 1706 additions and 20 deletions
--- a/.gitignore
+++ b/.gitignore
@ -5,3 +5,4 @@ __pycache__/
 /exploration/*.ipynb
 /config/*.ipynb
 /statistical_analysis/*.ipynb
+data/*
--- a/config/environment.yml
+++ b/config/environment.yml
@ -19,4 +19,4 @@ dependencies:
  - scikit-learn
  - sqlalchemy
  - statsmodels
-  - tabulate
+  - tabulate
--- a/config/models.py
+++ b/config/models.py
@ -356,7 +356,16 @@ class Proximity(Base, AWAREsensor):


 class Screen(Base, AWAREsensor):
+    """
+    Contains the screen sensor information.
+
+    Attributes
+    ----------
+    screen_status: int
+        Screen status (0 – off, 1 – on, 2 – locked, 3 – unlocked)
+    """
+    # Stored as a small integer code; the meaning of each code is listed in the class docstring.
     screen_status = Column(SmallInteger)
+    


 class SMS(Base, AWAREsensor):
--- a/exploration/expl_screen.py
+++ b/exploration/expl_screen.py
@ -6,7 +6,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.11.2
+#       jupytext_version: 1.11.5
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
@ -38,6 +38,12 @@ print(df_screen_nokia)
 participants_inactive_usernames = participants.query_db.get_usernames()
 df_screen_inactive = get_screen_data(participants_inactive_usernames)

+# %%
+df_screen_inactive.head(60)
+
+# %%
+df_screen_inactive.to_csv(r'/home/ivan/IJS/logs/screen_data_offline.csv')
+
 # %%
 df_screen_inactive["screen_status"] = (
    df_screen_inactive["screen_status"]
--- a/features/.vscode/settings.json
+++ b/features/.vscode/settings.json
@ -0,0 +1,4 @@
+{
+    "python.linting.enabled": true,
+    "python.formatting.provider": "autopep8"
+}
--- a/features/screen.py
+++ b/features/screen.py
@ -1,6 +1,10 @@
 from collections.abc import Collection

 import pandas as pd
+import numpy as np
+import re
+
+from typing import Tuple, ValuesView

 from config.models import Participant, Screen
 from setup import db_engine, session
@ -32,23 +36,320 @@ def get_screen_data(usernames: Collection) -> pd.DataFrame:
    return df_screen


-def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
-    # TODO Implement a method that identifies "interesting" sequences of screen statuses.
-    # The main one are:
-    # - OFF -> ON -> unlocked (a true phone unlock)
-    # - OFF -> ON -> OFF/locked (no unlocking, i.e. a screen status check)
-    # Consider that screen data is sometimes unreliable as shown in expl_screen.ipynb:
-    # "I have also seen
-    # off -> on -> unlocked (with 2 - locked missing)
-    # and
-    # off -> locked -> on -> off -> locked (*again*)."
-    # Either clean the data beforehand or deal with these inconsistencies in this function.
-    pass
+def identify_screen_sequence(df_screen: pd.DataFrame, grouping: bool = False) -> pd.DataFrame:
+    """
+    Identify interesting sequences (unlock, status check) and return them in a dataframe.
+    Transform the grouping of screen events (by day, hour...) into a grouping of sequences.
+
+    Parameters
+    ----------
+    df_screen: pd.DataFrame
+        A dataframe containing screen data (the columns "device_id", "participant_id",
+        "timestamp" and "screen_status" are read) and, if `grouping` is True, a column "group".
+        N.B.: the values in the column "group" must be of a comparable type (e.g. int, datetime.date etc.)
+        The dataframe itself is not modified.
+    grouping: bool
+        A boolean value indicating whether the input df contains the column "group".
+        If False, all events are assigned to a single group (0).
+
+    Returns
+    -------
+    df_sequences: pd.DataFrame:
+        A dataframe containing information on screen sequences
+
+        Columns:
+            - participant_id
+            - device_id
+            - seq_id: a unique id assigned to each sequence
+            - sequence_type: unlock/check
+            - group: the group to which the sequence belongs, i.e. the timespan during which it has
+                occurred. Note that in the case that it spans over a longer period of time,
+                the same sequence is assigned to multiple groups
+            - beginning_abs: beginning of unlock/check [ms since 1970]
+            - end_abs: end of unlock/check in [ms since 1970]
+            - duration_abs [ms]
+            - beginning_rel: beginning of a sequence relative to the group [ms since 1970]
+            - end_rel [ms since 1970]
+            - duration_rel [ms]
+
+    Raises
+    ------
+    TypeError
+        If the values in the column "group" cannot be sorted.
+
+    Legend
+    ------
+    - 0: off
+    - 1: on
+    - 2: locked
+    - 3: unlocked
+
+    Grouping
+    --------
+    If the screen events of the input df are assigned a time structure, the identified sequences should also
+    be. If all of the screen events constituting a sequence are in the same group, assigning the sequence to
+    a group is trivial - it should be the group the events belong to. If, on the other hand, the events of a
+    sequence belong to several groups, the situation is trickier. As of the moment, the procedure is
+    implemented as follows:
+        The relative beginning (relative to a certain group, i.e. timespan) is defined as the timestamp of the
+        first event belonging to the group in question. Relative end and relative duration are defined in a
+        similar fashion. This is, however, not optimal. We would namely wish, e.g., that the relative durations
+        of a given sequence would sum up to its absolute duration which is not yet the case.
+        TODO    In order to achieve this, we would need to be given more information on the groups. The current
+        TODO    constraint on the groups is only that they be comparable. This is insufficient since it is
+        TODO    impossible to infer:
+        TODO        - how many and which groups lie between two given groups
+        TODO        - when in time does a certain group begin and when does it end
+        TODO    In order to mitigate these issues, we would need to be given a complete list of groups and the
+        TODO    groups should have the form of an interval (beginning, end).
+
+        In fact, under the presupposition that we will always be working with relatively big dataframes,
+        the requirement of being given a complete list of groups becomes unnecessary.
+        cf* the highlighted comment below.
+
+    Heuristics
+    ----------
+    1) In the category of unlock sequences, the following sequences were counted:
+        i) 0130(0...)2
+            This is the paradigmatic case. It is allowed for the screen status 0 (off)
+            to be reported multiple times in a row.
+        ii) 21302
+            If the previous sequence has ended with the screen status 2 (e.g. unlock),
+            the unlock sequence does not start with a 0 but rather with a 2.
+        iii) (0|2)3102
+            It is allowed for the order of 3 and 1 to be reversed. If the device is
+            unlocked e.g. with a fingerprint-reader, it can happen that the unlock
+            precedes the ON status.
+    2) In the category of screen-check sequences, the following sequences were counted:
+        i) 010
+            The base case.
+        ii) 210
+            Refer to point 1) ii).
+    3) Special cases:
+        i) (2|0)102
+            The occurrence of two consecutive "locked" events with no intermediate "unlocked" event
+            is an inconsistency, however due to its frequency it has to be dealt with in some way.
+            Since the time interval between the last two events of this sequence is commonly very
+            short (around 30ms), the 2 at the end should be interpreted as part of the SCREEN-CHECK
+            SEQUENCE.
+        ii) (2|0)130102
+            This sequence is interpreted as a nested screen-check sequence (010) inside
+            an unlock sequence ((2|0)1302). Since the time interval between 0 and 1 is very
+            short (the device hasn't even had the time to lock), we say that 010 does not constitute a
+            proper check sequence and we therefore interpret the whole sequence as an UNLOCK SEQUENCE.
+    """
+
+    # Work on a copy so that the caller's dataframe is neither extended nor reordered.
+    # If the time structure of sequences is not of interest, all events are assigned to the same group.
+    if not grouping:
+        df_screen = df_screen.assign(group=0)
+    df_screen = df_screen.sort_values(["device_id", "timestamp"])
+
+    # Create a df containing for each device a row with the following columns:
+    #   - participant_id: the id of the participant the device belongs to
+    #   - screen_status:
+    #       a string representing the sequence of screen events
+    #       in chronological order, e.g. "01301302130202130..."
+    #   - timestamp:
+    #       a list of timestamps of screen events
+    #       in chronological order, e.g. [1581933295955, 1581933741144, ...]
+    #   - group:
+    #       a list of groups to which the screen events
+    #       belong, again, in chronological order
+    df_sequences_timestamps_groups = (
+        df_screen.groupby(["device_id", "participant_id"])
+        .agg({
+            "screen_status": lambda statuses: "".join(str(status) for status in statuses),
+            "timestamp": list,
+            "group": list})
+        .reset_index()
+    )
+
+    # Regex patterns implementing the heuristics described in the docstring.
+    # Since the matching sequences can overlap, lookahead is used. Note that
+    # the first event in a sequence isn't part of the group caught inside the
+    # lookahead. That's because the first event in a sequence is also the last
+    # event of the previous sequence, so that the time interval between the first
+    # and the second event in a sequence is actually the time the device is not in use.
+    unlock_pat = re.compile(
+        # Begin the lookahead group. Inside the lookahead group,
+        # first match either a 0 or a 2
+        r"(?=[02]"
+        # Begin the 1st capturing group, this is the one we are interested in.
+        # Match either a 13 or a 31.
+        r"((13|31)"
+        # Match either a (nonzero) sequence of consecutive 0s or a 010. Then, match
+        # a two. End the 1st capturing group. End the lookahead group.
+        r"(0+|010)2))"
+        )
+    # NOTE(review): this pattern also matches the 010 nested inside an unlock sequence
+    # (special case 3 ii) and does not capture the trailing 2 of special case 3 i);
+    # confirm whether that is intended.
+    check_pat = re.compile(
+        # Begin the lookahead group. Inside the lookahead group,
+        # first match either a 0 or a 2
+        r"(?=[02]"
+        # Begin the 1st capturing group. Capture a 1 succeeded by several 0s.
+        # End the 1st capturing group. End the lookahead group.
+        r"(10+))"
+        )
+
+    # Enumerate the groups based on increasing order in order to make iteration easier.
+    # //!   N.B.: this is also a possible way to ease the constraint on the groups
+    # //!   discussed in the docstring under "Grouping". Namely, when working with
+    # //!   reasonably big dataframes, it can be confidently expected that for each
+    # //!   group we are interested in there will be at least one screen event assigned
+    # //!   to it. In this case, the following procedure will extract the complete list of groups.
+    try:
+        groups_sorted = sorted(set(df_screen["group"].tolist()))
+    except TypeError as err:
+        # Raise a new exception carrying the explanation; the original one is kept as its cause.
+        raise TypeError("Values in the column 'group' must be of a comparable type") from err
+    group_dict = dict(enumerate(groups_sorted))
+    inv_group_dict = {grp: grp_id for grp_id, grp in group_dict.items()}
+
+    # Sequence ids are shared between both calls of identify_seq so that they stay unique.
+    seq_id = 0
+
+    def identify_seq(regexp: re.Pattern, label: str) -> list:
+        """
+        Iterates over rows of df_sequences_timestamps_groups (one row per device), then, for each row
+        iterates over regex matches. For each regex match, i.e. for each identified
+        sequence, iterates over the groups over which the sequence spans (*cf docstring)
+        and creates one dictionary (a row of the resulting dataframe) per group.
+        """
+        nonlocal seq_id
+        rows_list = []
+        for _, row in df_sequences_timestamps_groups.iterrows():
+            timestamps = row["timestamp"]
+            event_groups = row["group"]
+            for match in regexp.finditer(row["screen_status"]):
+                beginning_index_abs, end_index_abs = match.start(1), match.end(1)
+                beginning_abs = timestamps[beginning_index_abs]
+                end_abs = timestamps[end_index_abs - 1]
+                # Positions of the events of this sequence, collected per group in increasing order.
+                indices_by_group = {}
+                for position in range(beginning_index_abs, end_index_abs):
+                    indices_by_group.setdefault(event_groups[position], []).append(position)
+                group_ids = [inv_group_dict[grp] for grp in indices_by_group]
+                # TODO Here's part of the problem: the span of relevant groups consists solely
+                # TODO of those to which at least one screen event is assigned (in the whole df)
+                for grp_id in range(min(group_ids), max(group_ids) + 1):
+                    grp = group_dict[grp_id]
+                    grp_indices = indices_by_group.get(grp)
+                    # TODO Here, we face the converse problem. It may happen that a sequence in fact
+                    # TODO does span over a certain group although none of the events that constitute
+                    # TODO it are assigned to it. In this case, there is no way to calculate the relative
+                    # TODO beginning and end (which should just be the beginning and end of the group).
+                    if grp_indices:
+                        beginning_rel = timestamps[grp_indices[0]]
+                        end_rel = timestamps[grp_indices[-1]]
+                    else:
+                        beginning_rel = end_rel = pd.NA
+                    rows_list.append({
+                        "participant_id": row["participant_id"],
+                        "device_id": row["device_id"],
+                        "seq_id": seq_id,
+                        "sequence_type": label,
+                        "group": grp,
+                        "beginning_abs": beginning_abs,
+                        "end_abs": end_abs,
+                        "duration_abs": end_abs - beginning_abs,
+                        "beginning_rel": beginning_rel,
+                        "end_rel": end_rel,
+                        "duration_rel": end_rel - beginning_rel,
+                    })
+                seq_id += 1
+        return rows_list
+
+    rows_unlock = identify_seq(unlock_pat, "unlock")
+    rows_check = identify_seq(check_pat, "check")
+
+    # Naming the columns explicitly keeps them present even when no sequence was found.
+    df_sequences = pd.DataFrame(
+        rows_unlock + rows_check,
+        columns=[
+            "participant_id",
+            "device_id",
+            "seq_id",
+            "sequence_type",
+            "group",
+            "beginning_abs",
+            "end_abs",
+            "duration_abs",
+            "beginning_rel",
+            "end_rel",
+            "duration_rel",
+        ],
+    )
+
+    return df_sequences


-def time_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
-    # TODO Use the results of indentify_screen_sequence to calculate time statistics related to transitions.
-    # For example, from the two main sequences outlined above, the time of "real" phone usage can be calculated,
-    #   i.e. how long the screen was unlocked.
-    # Another example might be the average time between screen unlocks and/or screen status checks.
-    pass
+# def time_screen_sequence(df_screen: pd.DataFrame, groupby: str = "date") -> pd.DataFrame:
+#     """
+#     Calculates time statistics related to device usage.
+
+#     Parameters
+#     ----------
+#     df_screen: pd.DataFrame
+#         A dataframe containing screen data
+
+#     Returns
+#     -------
+#     A new dataframe indexed by device_id and participant_id containing the following columns:
+#         - total_usage_time: sum of daily timespans between the last and 
+#             the first event reported by the screen sensor measured in milliseconds
+#         - real_usage_time: duration of time during which the device was actually in use, 
+#             i.e. the total duration of sequences identified by the function identify_screen_sequence
+#         - real_usage_time_percentage: real_usage_time / total_usage_time
+#         - average_time_between_unlocks
+#         - average_time_between_checks
+#         - average_check_duration
+#         - average_unlock_duration
+#     """
+
+#     sequences_df = identify_screen_sequence(df_screen)
+
+
+
+#     # Calculate the date of the beginning and of the end of a sequence.
+#     # Drop those sequences which span over several days.
+#     sequences_df["date_beginning"] = pd.to_datetime(
+#         sequences_df.beginning, unit="ms").dt.date
+
+#     sequences_df["date_end"] = pd.to_datetime(
+#         sequences_df.end, unit="ms").dt.date
+
+#     sequences_df = (
+#         sequences_df
+#         [sequences_df["date_beginning"] == sequences_df["date_end"]]
+#         .drop(columns=["date_end"])
+#         .rename(columns={"date_beginning":"date"})
+#     )
+
+#     # Calculate the time the device was in use
+#     usage_time_df = (
+#         sequences_df.groupby(["sequence_type", "participant_id", "device_id", "date"])
+#         .agg({"duration":"sum"})
+#         .apply(lambda x: x//1000, "columns")
+#         .rename(columns={"duration":"usage_time"})
+#         )
+
+#     # Calculate the average time between sequences
+#     average_timedelta_df = (
+#         sequences_df.sort_values("beginning")
+#         .groupby(["sequence_type", "participant_id", "device_id", "date"])
+#         .apply(
+#             lambda grp:
+#             grp.assign(end_shifted = grp["end"].shift(1))
+#         )
+#         .drop(columns=["participant_id", "device_id", "sequence_type", "date"])
+#         .droplevel(-1)
+#         .assign(average_timedelta = lambda x: x.beginning - x.end_shifted)
+#         .groupby(["sequence_type", "participant_id", "device_id", "date"])
+#         .agg({"average_timedelta": lambda x: np.mean(x)//1000})
+#     )
+
+#     # Calculate the average duration of sequences
+#     average_duration_df = (
+#         sequences_df
+#         .groupby(["sequence_type", "participant_id", "device_id", "date"])
+#         .agg({"duration": (lambda x: np.mean(x)//1000)})
+#         .rename(columns={"duration":"average_duration"})
+#     )
+
+#     # Merge into a single dataframe
+#     merged = pd.merge(
+#         pd.merge(usage_time_df, average_timedelta_df, left_index=True, right_index=True),
+#         average_duration_df,
+#         left_index=True,
+#         right_index=True
+#     )
+
+#     return merged
--- a/test/test_screen_sequences.ipynb
+++ b/test/test_screen_sequences.ipynb
Author	SHA1	Message	Date
Primoz	0a98bab78d	Ignore existing data.	2023-02-10 11:04:46 +01:00
Ivan Kobe	47ecd4bc02	partially resolved the grouping issue	2021-09-22 16:34:33 +02:00
Ivan Kobe	53df652d02	implemented time_screen_sequence	2021-09-13 11:47:19 +02:00
Ivan Kobe	cf7e692927	defined func identify_screen_sequences	2021-09-10 16:47:37 +02:00