stress_at_work_analysis/features/screen.py

178 lines
6.5 KiB
Python

from collections.abc import Collection
import pandas as pd
import re
from config.models import Participant, Screen
from setup import db_engine, session
# Human-readable labels for the integer screen status codes.
screen_status = {0: "off", 1: "on", 2: "locked", 3: "unlocked"}
def get_screen_data(usernames: Collection) -> pd.DataFrame:
    """
    Load the screen records of the given participants into a dataframe.

    Parameters
    ----------
    usernames: Collection
        Usernames used in the WHERE condition of the query.

    Returns
    -------
    df_screen: pd.DataFrame
        The selected screen rows together with the participant username.
    """
    # Both criteria end up AND-ed together in the WHERE clause.
    belongs_to_participant = Participant.id == Screen.participant_id
    is_requested_user = Participant.username.in_(usernames)
    statement = (
        session.query(Screen, Participant.username)
        .filter(belongs_to_participant, is_requested_user)
        .statement
    )
    with db_engine.connect() as connection:
        return pd.read_sql(statement, connection)
# Columns (in order) of the dataframe returned by identify_screen_sequence.
_SEQUENCE_COLUMNS = [
    "participant_id",
    "device_id",
    "sequence_type",
    "beginning",
    "end",
    "duration",
]

# Regex patterns implementing the heuristics described in the docstring of
# identify_screen_sequence. Since the matching sequences can overlap, lookahead
# is used. Note that the first event in a sequence isn't part of the group
# caught inside the lookahead. That's because the first event in a sequence is
# also the last event of the previous sequence, so that the time interval
# between the first and the second event in a sequence is actually the time
# the device is not in use.
_SEQUENCE_PATTERNS = (
    ("unlock", re.compile(r"(?=[02]((?:13|31)0+2))")),
    ("check", re.compile(r"(?=[02](10+2?))")),
)


def _find_sequences(status_string, timestamps, pattern):
    """Yield the (beginning, end) timestamps of every match of pattern's group 1."""
    for match in pattern.finditer(status_string):
        # Each character of status_string corresponds to one event, so the
        # character offsets of the group index directly into the timestamps.
        yield timestamps[match.start(1)], timestamps[match.end(1) - 1]


def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
    """
    Identify interesting sequences (unlock, status check) and return them in a dataframe.

    The input dataframe is not modified.

    Parameters
    ----------
    df_screen: pd.DataFrame
        A dataframe containing screen data. The columns device_id,
        participant_id, timestamp and screen_status are used.

    Returns
    -------
    df_sequences: pd.DataFrame
        A dataframe containing information on screen sequences. It always has
        the columns listed below, even when no sequence was found.

        Columns:
            * participant_id
            * device_id
            * sequence_type: unlock/check
            * beginning: beginning of unlock/check in milliseconds since 1970
            * end: end of unlock/check in milliseconds since 1970
            * duration

    Heuristics
    ----------
    1) In the category of unlock sequences, the following sequences were counted:
        i) 0130(0...)2
            This is the paradigmatic case. It is allowed for the screen status 0 (off)
            to be reported multiple times in a row.
        ii) 21302
            If the previous sequence has ended with the screen status 2 (e.g. unlock),
            the unlock sequence does not start with a 0 but rather with a 2.
        iii) (0|2)3102
            It is allowed for the order of 3 and 1 to be reversed. If the device is
            unlocked e.g. with a fingerprint-reader, it can happen that the unlock
            precedes the ON status.
    2) In the category of screen-check sequences, the following sequences were counted:
        i) 010
            The base case.
        ii) 210
            Refer to point 1) ii).
    3) Special cases:
        i) (2|0)102
            The occurrence of two consecutive "locked" events with no intermediate
            "unlocked" event is an inconsistency, however due to its frequency it has
            to be dealt with in some way. Since the time interval between the last two
            events of this sequence is commonly very short (around 30ms), the 2 at the
            end should be interpreted as part of the screen-check sequence.
        ii) (2|0)130102
            This sequence is interpreted as a nested screen-check sequence (010) inside
            an unlock sequence ((2|0)1302).
            NOTE(review): the unlock pattern requires the closing 2 directly after
            the run of 0s, so for this input only the nested check appears to be
            emitted, not the enclosing unlock -- confirm whether that is intended.
    """
    if df_screen.empty:
        return pd.DataFrame(columns=_SEQUENCE_COLUMNS)

    # Sort a copy so that the caller's dataframe keeps its original order.
    df_sorted = df_screen.sort_values(["device_id", "timestamp"])
    groups = df_sorted.groupby("device_id")

    # Create a df containing, for each device, a string representing the sequence of
    # screen events in chronological order, e.g. "01301302130202130..."
    df_screen_sequences = (
        groups["screen_status"]
        .apply(list)
        .apply(lambda statuses: "".join(str(status) for status in statuses))
        .to_frame()
        .reset_index()
    )

    # Create a df containing, for each device, a list of timestamps of screen events
    # in chronological order, e.g. [1581933295955, 1581933741144, ...]
    df_timestamps = groups["timestamp"].apply(list).to_frame().reset_index()

    # Create a df containing information to which participant the devices belong
    df_participants = (
        df_sorted[["device_id", "participant_id"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    df_merged = df_screen_sequences.merge(df_timestamps, on="device_id").merge(
        df_participants, on="device_id"
    )

    # For each device, collect all unlock matches first and then all check
    # matches; every match becomes one row of the resulting dataframe.
    rows = []
    for device_id, status_string, timestamps, participant_id in zip(
        df_merged["device_id"],
        df_merged["screen_status"],
        df_merged["timestamp"],
        df_merged["participant_id"],
    ):
        for sequence_type, pattern in _SEQUENCE_PATTERNS:
            for beginning, end in _find_sequences(status_string, timestamps, pattern):
                rows.append(
                    {
                        "participant_id": participant_id,
                        "device_id": device_id,
                        "sequence_type": sequence_type,
                        "beginning": beginning,
                        "end": end,
                        "duration": end - beginning,
                    }
                )

    # Passing the columns explicitly keeps the schema stable when rows is empty.
    return pd.DataFrame(rows, columns=_SEQUENCE_COLUMNS)
def time_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
    """
    Placeholder for time statistics of screen sequences; not implemented yet.

    Currently does nothing and returns None.

    TODO: Use the results of identify_screen_sequence to calculate time
    statistics related to transitions. For example, from the two main
    sequences outlined there, the time of "real" phone usage can be
    calculated, i.e. how long the screen was unlocked. Another example might
    be the average time between screen unlocks and/or screen status checks.
    """
    return None