|
|
|
@ -1,6 +1,10 @@
|
|
|
|
|
from collections.abc import Collection
|
|
|
|
|
|
|
|
|
|
import pandas as pd
|
|
|
|
|
import numpy as np
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
from typing import Tuple, ValuesView
|
|
|
|
|
|
|
|
|
|
from config.models import Participant, Screen
|
|
|
|
|
from setup import db_engine, session
|
|
|
|
@ -32,23 +36,320 @@ def get_screen_data(usernames: Collection) -> pd.DataFrame:
|
|
|
|
|
return df_screen
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
    """
    Placeholder for the identification of "interesting" screen status sequences.

    Not implemented: the argument is ignored and ``None`` is returned.
    """
    # TODO Implement a method that identifies "interesting" sequences of screen statuses.
    # The main ones are:
    # - OFF -> ON -> unlocked (a true phone unlock)
    # - OFF -> ON -> OFF/locked (no unlocking, i.e. a screen status check)
    #
    # Consider that screen data is sometimes unreliable as shown in expl_screen.ipynb:
    # "I have also seen
    #     off -> on -> unlocked (with 2 - locked missing)
    # and
    #     off -> locked -> on -> off -> locked (*again*)."
    # Either clean the data beforehand or deal with these inconsistencies in this function.
    return None
|
|
|
|
|
def identify_screen_sequence(df_screen: pd.DataFrame, grouping: bool = False) -> pd.DataFrame:
    """
    Identify interesting screen sequences (unlock, status check) and return them in a dataframe.

    Transforms the grouping of screen events (by day, hour...) into a grouping of sequences.
    The input dataframe is not modified: sorting and the default "group" column
    are applied to a copy.

    Parameters
    ----------
    df_screen: pd.DataFrame
        A dataframe containing screen data with the columns "device_id", "participant_id",
        "timestamp" and "screen_status" and, if `grouping` is True, a column "group".
        N.B.: the values in the column "group" must be of a comparable type
        (e.g. int, datetime.date etc.)
    grouping: bool
        Whether the input df contains the column "group". If False, all events
        are assigned to the single group 0.

    Returns
    -------
    df_sequences: pd.DataFrame
        A dataframe containing information on screen sequences, one row per
        (sequence, group) pair. Rows of unlock sequences come first, followed by
        rows of check sequences.

        Columns:
        - participant_id
        - device_id
        - seq_id: a unique id assigned to each sequence
        - sequence_type: unlock/check
        - group: the group to which the sequence belongs, i.e. the timespan during which it has
            occurred. Note that in the case that it spans over a longer period of time,
            the same sequence is assigned to multiple groups
        - beginning_abs: beginning of unlock/check [ms since 1970]
        - end_abs: end of unlock/check [ms since 1970]
        - duration_abs [ms]
        - beginning_rel: beginning of a sequence relative to the group [ms since 1970]
        - end_rel [ms since 1970]
        - duration_rel [ms]

    Raises
    ------
    TypeError
        If the values in the column "group" cannot be sorted, i.e. are not mutually comparable.

    Legend
    ------
    - 0: off
    - 1: on
    - 2: locked
    - 3: unlocked

    Grouping
    --------
    If the screen events of the input df are assigned a time structure, the identified sequences
    should also be. If all of the screen events constituting a sequence are in the same group,
    assigning the sequence to a group is trivial - it should be the group the events belong to.
    If, on the other hand, the events are spread over several groups, the situation is trickier.
    As of the moment, the procedure is implemented as follows:
    The relative beginning (relative to a certain group, i.e. timespan) is defined as the
    timestamp of the first event belonging to the group in question. Relative end and relative
    duration are defined in a similar fashion. This is, however, not optimal. We would namely
    wish, e.g., that the relative durations of a given sequence would sum up to its absolute
    duration, which is not yet the case.
    TODO In order to achieve this, we would need to be given more information on the groups. The
    TODO current constraint on the groups is only that they be comparable. This is insufficient
    TODO since it is impossible to infer:
    TODO - how many and which groups lie between two given groups
    TODO - when in time a certain group begins and when it ends
    TODO In order to mitigate these issues, we would need to be given a complete list of groups
    TODO and the groups should have the form of an interval (beginning, end).

    In fact, under the presupposition that we will always be working with relatively big
    dataframes, the requirement of being given a complete list of groups becomes unnecessary,
    cf. the comment on the enumeration of groups in the function body.

    Heuristics
    ----------
    1) In the category of unlock sequences, the following sequences were counted:
        i) 0130(0...)2
            This is the paradigmatic case. It is allowed for the screen status 0 (off)
            to be reported multiple times in a row.
        ii) 21302
            If the previous sequence has ended with the screen status 2 (e.g. unlock),
            the unlock sequence does not start with a 0 but rather with a 2.
        iii) (0|2)3102
            It is allowed for the order of 3 and 1 to be reversed. If the device is
            unlocked e.g. with a fingerprint reader, it can happen that the unlock
            precedes the ON status.
    2) In the category of screen-check sequences, the following sequences were counted:
        i) 010
            The base case.
        ii) 210
            Refer to point 1) ii).
    3) Special cases:
        i) (2|0)102
            The occurrence of two consecutive "locked" events with no intermediate "unlocked"
            event is an inconsistency, however due to its frequency it has to be dealt with in
            some way. Since the time interval between the last two events of this sequence is
            commonly very short (around 30ms), the 2 at the end should be interpreted as part
            of the SCREEN-CHECK SEQUENCE.
        ii) (2|0)130102
            This sequence is interpreted as a nested screen-check sequence (010) inside
            an unlock sequence ((2|0)1302). Since the time interval between 0 and 1 is very
            short (the device hasn't even had the time to lock), we say that 010 does not
            constitute a proper check sequence and we therefore interpret the whole sequence
            as an UNLOCK SEQUENCE.
    """
    # If the time structure of sequences is not of interest, all events are assigned
    # to the same group. `assign` returns a new frame, so the caller's df stays untouched.
    if not grouping:
        df_screen = df_screen.assign(group=0)

    # Sort a copy (not in place) so that events are chronological within each device.
    df_sorted = df_screen.sort_values(["device_id", "timestamp"])

    # Create a df containing for each device a row with the following columns:
    # - participant_id: the id of the participant the device belongs to
    # - screen_status: a string representing the sequence of screen events
    #       in chronological order, e.g. "01301302130202130..."
    # - timestamp: a list of timestamps of screen events
    #       in chronological order, e.g. [1581933295955, 1581933741144, ...]
    # - group: a list of groups to which the screen events belong,
    #       again in chronological order
    # NOTE(review): character positions in the status string are later used as indices
    # into the timestamp/group lists, which assumes every status renders as exactly one
    # character (e.g. integer codes 0-3) - confirm against the data source.
    df_sequences_timestamps_groups = (
        df_sorted.groupby(["device_id", "participant_id"])
        .agg({
            "screen_status": lambda statuses: "".join(str(status) for status in statuses),
            "timestamp": list,
            "group": list,
        })
        .reset_index()
    )

    # Regex patterns implementing the heuristics described in the docstring.
    # Since the matching sequences can overlap, lookahead is used. Note that
    # the first event in a sequence isn't part of the group captured inside the
    # lookahead. That's because the first event in a sequence is also the last
    # event of the previous sequence, so that the time interval between the first
    # and the second event in a sequence is actually the time the device is not in use.

    # Lookahead: a 0 or a 2, then (captured) 13 or 31, followed by either a nonzero
    # run of 0s or a 010, followed by a 2.
    unlock_pat = re.compile(r"(?=[02]((13|31)(0+|010)2))")
    # Lookahead: a 0 or a 2, then (captured) a 1 succeeded by one or more 0s.
    # NOTE(review): this pattern does not capture the trailing 2 of heuristic 3) i)
    # and it also matches the 010 nested inside an unlock sequence (heuristic 3) ii)).
    check_pat = re.compile(r"(?=[02](10+))")

    # Enumerate the groups based on increasing order in order to make iteration easier.
    # N.B.: this is also a possible way to ease the constraint on the groups
    # discussed in the docstring under "Grouping". Namely, when working with
    # reasonably big dataframes, it can be confidently expected that for each
    # group we are interested in there will be at least one screen event assigned
    # to it. In this case, the following procedure extracts the complete list of groups.
    try:
        groups_sorted = sorted(set(df_sorted["group"].tolist()))
    except TypeError as err:
        raise TypeError("Values in the column 'group' must be of a comparable type") from err
    group_dict = dict(enumerate(groups_sorted))
    inv_group_dict = {grp: grp_id for grp_id, grp in group_dict.items()}

    # Shared across both calls of identify_seq so that sequence ids are unique
    # over unlock and check sequences alike.
    seq_id = 0

    def identify_seq(regexp: re.Pattern, label: str) -> list:
        """
        Iterate over rows of df_sequences_timestamps_groups and, for each row,
        over regex matches. For each regex match, i.e. for each identified
        sequence, iterate over the groups over which the sequence spans
        (cf. docstring of the enclosing function) and emit one row dict per group.
        """
        nonlocal seq_id
        rows_list = []
        for _, row in df_sequences_timestamps_groups.iterrows():
            timestamps = row["timestamp"]
            event_groups = row["group"]
            for match in regexp.finditer(row["screen_status"]):
                # Span of the captured sequence; the end index is exclusive.
                beginning_index_abs, end_index_abs = match.span(1)
                beginning_abs = timestamps[beginning_index_abs]
                end_abs = timestamps[end_index_abs - 1]
                group_ids = {
                    inv_group_dict[grp]
                    for grp in event_groups[beginning_index_abs:end_index_abs]
                }
                # TODO Here's part of the problem: the span of relevant groups consists solely
                # TODO of those to which at least one screen event is assigned (in the whole df)
                for grp_id in range(min(group_ids), max(group_ids) + 1):
                    grp = group_dict[grp_id]
                    # Ascending indices of the events of this sequence that belong to grp.
                    grp_indices = [
                        event_index
                        for event_index in range(beginning_index_abs, end_index_abs)
                        if event_groups[event_index] == grp
                    ]
                    # TODO Here, we face the converse problem. It may happen that a sequence
                    # TODO in fact does span over a certain group although none of the events
                    # TODO that constitute it are assigned to it. In this case, there is no way
                    # TODO to calculate the relative beginning and end (which should just be
                    # TODO the beginning and end of the group).
                    if grp_indices:
                        beginning_rel = timestamps[grp_indices[0]]
                        end_rel = timestamps[grp_indices[-1]]
                    else:
                        beginning_rel = end_rel = pd.NA
                    rows_list.append({
                        "participant_id": row["participant_id"],
                        "device_id": row["device_id"],
                        "seq_id": seq_id,
                        "sequence_type": label,
                        "group": grp,
                        "beginning_abs": beginning_abs,
                        "end_abs": end_abs,
                        "duration_abs": end_abs - beginning_abs,
                        "beginning_rel": beginning_rel,
                        "end_rel": end_rel,
                        "duration_rel": end_rel - beginning_rel,
                    })
                seq_id += 1
        return rows_list

    rows_unlock = identify_seq(unlock_pat, "unlock")
    rows_check = identify_seq(check_pat, "check")

    # Passing the columns explicitly keeps the schema stable even when no
    # sequence has been identified.
    columns = [
        "participant_id",
        "device_id",
        "seq_id",
        "sequence_type",
        "group",
        "beginning_abs",
        "end_abs",
        "duration_abs",
        "beginning_rel",
        "end_rel",
        "duration_rel",
    ]
    df_sequences = pd.DataFrame(rows_unlock + rows_check, columns=columns)

    return df_sequences
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def time_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
    """
    Placeholder for time statistics related to screen status transitions.

    Not implemented: the argument is ignored and ``None`` is returned.
    """
    # TODO Use the results of identify_screen_sequence to calculate time statistics
    # related to transitions.
    # For example, from the two main sequences (unlock and screen status check), the time
    # of "real" phone usage can be calculated, i.e. how long the screen was unlocked.
    # Another example might be the average time between screen unlocks and/or
    # screen status checks.
    return None
|
|
|
|
|
# def time_screen_sequence(df_screen: pd.DataFrame, groupby: str = "date") -> pd.DataFrame:
|
|
|
|
|
# """
|
|
|
|
|
# Calculates time statistics related to device usage.
|
|
|
|
|
|
|
|
|
|
# Parameters
|
|
|
|
|
# ----------
|
|
|
|
|
# df_screen: pd.DataFrame
|
|
|
|
|
# A dataframe containing screen data
|
|
|
|
|
|
|
|
|
|
# Returns
|
|
|
|
|
# -------
|
|
|
|
|
# A new dataframe indexed by device_id and participant_id containing the following columns:
|
|
|
|
|
# - total_usage_time: sum of daily timespans between the last and
|
|
|
|
|
# the first event reported by the screen sensor measured in milliseconds
|
|
|
|
|
# - real_usage_time: duration of time during which the device was actually in use,
|
|
|
|
|
# i.e. the total duration of sequences identified by the function identify_screen_sequence
|
|
|
|
|
# - real_usage_time_percentage: real_usage_time / total_usage_time
|
|
|
|
|
# - average_time_between_unlocks
|
|
|
|
|
# - average_time_between_checks
|
|
|
|
|
# - average_check_duration
|
|
|
|
|
# - average_unlock_duration
|
|
|
|
|
# """
|
|
|
|
|
|
|
|
|
|
# sequences_df = identify_screen_sequence(df_screen)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# # Calculate the date of the beginning and of the end of a sequence.
|
|
|
|
|
# # Drop those sequences which span over several days.
|
|
|
|
|
# sequences_df["date_beginning"] = pd.to_datetime(
|
|
|
|
|
# sequences_df.beginning, unit="ms").dt.date
|
|
|
|
|
|
|
|
|
|
# sequences_df["date_end"] = pd.to_datetime(
|
|
|
|
|
# sequences_df.end, unit="ms").dt.date
|
|
|
|
|
|
|
|
|
|
# sequences_df = (
|
|
|
|
|
# sequences_df
|
|
|
|
|
# [sequences_df["date_beginning"] == sequences_df["date_end"]]
|
|
|
|
|
# .drop(columns=["date_end"])
|
|
|
|
|
# .rename(columns={"date_beginning":"date"})
|
|
|
|
|
# )
|
|
|
|
|
|
|
|
|
|
# # Calculate the time the device was in use
|
|
|
|
|
# usage_time_df = (
|
|
|
|
|
# sequences_df.groupby(["sequence_type", "participant_id", "device_id", "date"])
|
|
|
|
|
# .agg({"duration":"sum"})
|
|
|
|
|
# .apply(lambda x: x//1000, "columns")
|
|
|
|
|
# .rename(columns={"duration":"usage_time"})
|
|
|
|
|
# )
|
|
|
|
|
|
|
|
|
|
# # Calculate the average time between sequences
|
|
|
|
|
# average_timedelta_df = (
|
|
|
|
|
# sequences_df.sort_values("beginning")
|
|
|
|
|
# .groupby(["sequence_type", "participant_id", "device_id", "date"])
|
|
|
|
|
# .apply(
|
|
|
|
|
# lambda grp:
|
|
|
|
|
# grp.assign(end_shifted = grp["end"].shift(1))
|
|
|
|
|
# )
|
|
|
|
|
# .drop(columns=["participant_id", "device_id", "sequence_type", "date"])
|
|
|
|
|
# .droplevel(-1)
|
|
|
|
|
# .assign(average_timedelta = lambda x: x.beginning - x.end_shifted)
|
|
|
|
|
# .groupby(["sequence_type", "participant_id", "device_id", "date"])
|
|
|
|
|
# .agg({"average_timedelta": lambda x: np.mean(x)//1000})
|
|
|
|
|
# )
|
|
|
|
|
|
|
|
|
|
# # Calculate the average duration of sequences
|
|
|
|
|
# average_duration_df = (
|
|
|
|
|
# sequences_df
|
|
|
|
|
# .groupby(["sequence_type", "participant_id", "device_id", "date"])
|
|
|
|
|
# .agg({"duration": (lambda x: np.mean(x)//1000)})
|
|
|
|
|
# .rename(columns={"duration":"average_duration"})
|
|
|
|
|
# )
|
|
|
|
|
|
|
|
|
|
# # Merge into a single dataframe
|
|
|
|
|
# merged = pd.merge(
|
|
|
|
|
# pd.merge(usage_time_df, average_timedelta_df, left_index=True, right_index=True),
|
|
|
|
|
# average_duration_df,
|
|
|
|
|
# left_index=True,
|
|
|
|
|
# right_index=True
|
|
|
|
|
# )
|
|
|
|
|
|
|
|
|
|
# return merged
|
|
|
|
|