178 lines
6.5 KiB
Python
178 lines
6.5 KiB
Python
from collections.abc import Collection
|
|
|
|
import pandas as pd
|
|
import re
|
|
|
|
from config.models import Participant, Screen
|
|
from setup import db_engine, session
|
|
|
|
# Maps each numeric screen status code to its human-readable name.
screen_status = {0: "off", 1: "on", 2: "locked", 3: "unlocked"}
|
|
|
|
|
|
def get_screen_data(usernames: Collection) -> pd.DataFrame:
    """
    Load the screen records of the given participants into a dataframe.

    Parameters
    ----------
    usernames: Collection
        Usernames used in the WHERE condition; only rows belonging to
        participants with one of these usernames are selected.

    Returns
    -------
    df_screen: pd.DataFrame
        The selected screen rows together with the participant's username.
    """
    # Pair every screen row with its participant ...
    screen_with_username = session.query(Screen, Participant.username).filter(
        Participant.id == Screen.participant_id
    )
    # ... and keep only the requested participants.
    selected = screen_with_username.filter(Participant.username.in_(usernames))

    with db_engine.connect() as conn:
        return pd.read_sql(selected.statement, conn)
|
|
|
|
|
|
# Regex patterns implementing the heuristics described in the docstring of
# identify_screen_sequence. Since the matching sequences can overlap, lookahead
# is used. Note that the first event in a sequence isn't part of the group
# caught inside the lookahead. That's because the first event in a sequence is
# also the last event of the previous sequence, so that the time interval
# between the first and the second event in a sequence is actually the time the
# device is not in use.
_UNLOCK_PATTERN = re.compile(r"(?=[02]((13|31)0+2))")
_CHECK_PATTERN = re.compile(r"(?=[02](10+2?))")

# Columns of the result before "duration" is derived. Passing them explicitly
# keeps the result well-formed even when no sequence is found.
_SEQUENCE_COLUMNS = ["participant_id", "device_id", "sequence_type", "beginning", "end"]


def _find_sequences(pattern, sequence_type, statuses, timestamps, participant_id, device_id):
    """Return one result row (dict) for every match of ``pattern`` in ``statuses``."""
    return [
        {
            "participant_id": participant_id,
            "device_id": device_id,
            "sequence_type": sequence_type,
            # Group 1 excludes the leading event, see the comment on the patterns.
            "beginning": timestamps[match.start(1)],
            "end": timestamps[match.end(1) - 1],
        }
        for match in pattern.finditer(statuses)
    ]


def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
    """
    Identify interesting sequences (unlock, status check) and return them in a dataframe.

    The input dataframe is not modified.

    Parameters
    ----------
    df_screen: pd.DataFrame
        A dataframe containing screen data. The columns device_id,
        participant_id, timestamp and screen_status are read.

    Returns
    -------
    df_sequences: pd.DataFrame
        A dataframe containing information on screen sequences. It is empty
        (but has all columns) when no sequence is found.

        Columns:
        * participant_id
        * device_id
        * sequence_type: unlock/check
        * beginning: beginning of unlock/check in milliseconds since 1970
        * end: end of unlock/check in milliseconds since 1970
        * duration: end - beginning

    Raises
    ------
    ValueError
        If a screen status does not render as exactly one character, because
        string positions could then no longer be mapped to timestamps.

    Heuristics
    ----------
    1) In the category of unlock sequences, the following sequences were counted:
        i) 0130(0...)2
            This is the paradigmatic case. It is allowed for the screen status 0 (off)
            to be reported multiple times in a row.
        ii) 21302
            If the previous sequence has ended with the screen status 2 (e.g. unlock),
            the unlock sequence does not start with a 0 but rather with a 2.
        iii) (0|2)3102
            It is allowed for the order of 3 and 1 to be reversed. If the device is
            unlocked e.g. with a fingerprint-reader, it can happen that the unlock
            precedes the ON status.
    2) In the category of screen-check sequences, the following sequences were counted:
        i) 010
            The base case.
        ii) 210
            Refer to point 1) ii).
    3) Special cases:
        i) (2|0)102
            The occurrence of two consecutive "locked" events with no intermediate "unlocked" event
            is an inconsistency, however due to its frequency it has to be dealt with in some way.
            Since the time interval between the last two events of this sequence is commonly very
            short (around 30ms), the 2 at the end should be interpreted as part of the screen-check
            sequence.
        ii) (2|0)130102
            This sequence is interpreted as a nested screen-check sequence (010) inside
            an unlock sequence ((2|0)1302).
            NOTE(review): with the current patterns only the inner check is reported
            for this sequence; the unlock pattern requires "0...2" directly after
            13/31 and therefore does not match the outer unlock. Confirm whether
            that is intended.
    """
    # Sort a copy so the caller's dataframe keeps its row order.
    df_sorted = df_screen.sort_values(["device_id", "timestamp"])

    rows_list = []
    for device_id, df_device in df_sorted.groupby("device_id"):
        # A string representing the device's screen events in chronological
        # order, e.g. "01301302130202130...", and the matching timestamps,
        # e.g. [1581933295955, 1581933741144, ...].
        statuses = "".join(str(status) for status in df_device["screen_status"])
        timestamps = df_device["timestamp"].tolist()

        # Match positions in the string are used as indices into the
        # timestamps, so every status must be exactly one character long.
        if len(statuses) != len(timestamps):
            raise ValueError(
                "screen_status values must render as single characters "
                f"(device_id={device_id!r})"
            )

        # The participant(s) the device belongs to, in order of appearance.
        for participant_id in df_device["participant_id"].drop_duplicates():
            rows_list.extend(
                _find_sequences(
                    _UNLOCK_PATTERN, "unlock", statuses, timestamps, participant_id, device_id
                )
            )
            rows_list.extend(
                _find_sequences(
                    _CHECK_PATTERN, "check", statuses, timestamps, participant_id, device_id
                )
            )

    df_sequences = pd.DataFrame(rows_list, columns=_SEQUENCE_COLUMNS)
    df_sequences["duration"] = df_sequences["end"] - df_sequences["beginning"]

    return df_sequences
|
|
|
|
|
|
def time_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
    """
    Placeholder for time statistics on screen status transitions.

    Not implemented yet: the argument is ignored and None is returned.
    """
    # TODO Use the results of identify_screen_sequence to calculate time statistics related to transitions.
    # For example, from the two main sequences outlined above, the time of "real" phone usage can be calculated,
    # i.e. how long the screen was unlocked.
    # Another example might be the average time between screen unlocks and/or screen status checks.
    return None
|