defined func identify_screen_sequences

screen_sequences
Ivan Kobe 2021-09-10 16:47:37 +02:00
parent e2e268148d
commit cf7e692927
4 changed files with 826 additions and 13 deletions

View File

@ -6,7 +6,7 @@
# extension: .py # extension: .py
# format_name: percent # format_name: percent
# format_version: '1.3' # format_version: '1.3'
# jupytext_version: 1.11.2 # jupytext_version: 1.11.5
# kernelspec: # kernelspec:
# display_name: straw2analysis # display_name: straw2analysis
# language: python # language: python
@ -38,6 +38,12 @@ print(df_screen_nokia)
participants_inactive_usernames = participants.query_db.get_usernames() participants_inactive_usernames = participants.query_db.get_usernames()
df_screen_inactive = get_screen_data(participants_inactive_usernames) df_screen_inactive = get_screen_data(participants_inactive_usernames)
# %%
df_screen_inactive.head(60)
# %%
df_screen_inactive.to_csv(r'/home/ivan/IJS/logs/screen_data_offline.csv')
# %% # %%
df_screen_inactive["screen_status"] = ( df_screen_inactive["screen_status"] = (
df_screen_inactive["screen_status"] df_screen_inactive["screen_status"]

View File

@ -1,6 +1,7 @@
from collections.abc import Collection from collections.abc import Collection
import pandas as pd import pandas as pd
import re
from config.models import Participant, Screen from config.models import Participant, Screen
from setup import db_engine, session from setup import db_engine, session
@ -33,17 +34,139 @@ def get_screen_data(usernames: Collection) -> pd.DataFrame:
# Regex patterns implementing the heuristics described in the docstring of
# identify_screen_sequence. Matching sequences can overlap (the last event of
# one sequence is the first event of the next), so the whole pattern sits in a
# lookahead and only group 1 is used. The leading [02] event is deliberately
# left outside group 1: the interval between it and the second event is time
# during which the device was not in use.
_UNLOCK_PATTERN = re.compile(r"(?=[02]((?:13|31)0+2))")
_CHECK_PATTERN = re.compile(r"(?=[02](10+2?))")


def _find_sequences(pattern, statuses: str, timestamps: list, sequence_type: str) -> list:
    """Return (sequence_type, beginning, end) for every match of pattern.

    Character positions in ``statuses`` are used directly as indices into
    ``timestamps``, so every screen status must render as exactly one character.
    """
    return [
        (sequence_type, timestamps[match.start(1)], timestamps[match.end(1) - 1])
        for match in pattern.finditer(statuses)
    ]


def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
    """
    Identify interesting sequences (unlock, status check) and return them in a dataframe.

    The input dataframe is not modified.

    Parameters
    ----------
    df_screen: pd.DataFrame
        A dataframe containing screen data. The columns device_id, participant_id,
        timestamp and screen_status are read.

    Returns
    -------
    df_sequences: pd.DataFrame
        A dataframe containing information on screen sequences. It has the columns
        below even when no sequence was found (in which case it has no rows).
        Columns:
            * participant_id
            * device_id
            * sequence_type: unlock/check
            * beginning: beginning of unlock/check in milliseconds since 1970
            * end: end of unlock/check in milliseconds since 1970
            * duration: end - beginning

    Heuristics
    ----------
    Screen statuses are 0 (off), 1 (on), 2 (locked) and 3 (unlocked).

    1) In the category of unlock sequences, the following sequences were counted:
        i) 0130(0...)2
            This is the paradigmatic case. It is allowed for the screen status 0 (off)
            to be reported multiple times in a row.
        ii) 21302
            If the previous sequence (e.g. an unlock) has ended with the screen
            status 2, the unlock sequence does not start with a 0 but rather with a 2.
        iii) (0|2)3102
            It is allowed for the order of 3 and 1 to be reversed. If the device is
            unlocked e.g. with a fingerprint reader, it can happen that the unlock
            precedes the ON status.
    2) In the category of screen-check sequences, the following sequences were counted:
        i) 010
            The base case.
        ii) 210
            Refer to point 1) ii).
    3) Special cases:
        i) (2|0)102
            The occurrence of two consecutive "locked" events with no intermediate
            "unlocked" event is an inconsistency, however due to its frequency it has
            to be dealt with in some way. Since the time interval between the last two
            events of this sequence is commonly very short (around 30 ms), the 2 at the
            end should be interpreted as part of the screen-check sequence.
        ii) (2|0)130102
            This sequence is interpreted as a nested screen-check sequence (010) inside
            an unlock sequence ((2|0)1302).
    """
    columns = [
        "participant_id",
        "device_id",
        "sequence_type",
        "beginning",
        "end",
    ]

    # Sort a copy: sorting in place would silently reorder the caller's dataframe.
    df_sorted = df_screen.sort_values(["device_id", "timestamp"])

    rows = []
    for device_id, device_events in df_sorted.groupby("device_id"):
        # A string representing the device's screen events in chronological
        # order, e.g. "01301302130202130...", and the matching timestamps,
        # e.g. [1581933295955, 1581933741144, ...].
        statuses = "".join(str(status) for status in device_events["screen_status"])
        timestamps = device_events["timestamp"].tolist()

        # All unlock sequences of a device are listed before its check sequences.
        sequences = _find_sequences(
            _UNLOCK_PATTERN, statuses, timestamps, "unlock"
        ) + _find_sequences(_CHECK_PATTERN, statuses, timestamps, "check")

        # A device normally belongs to one participant; if several are recorded,
        # the device's sequences are reported once for each of them.
        for participant_id in device_events["participant_id"].drop_duplicates():
            rows.extend(
                {
                    "participant_id": participant_id,
                    "device_id": device_id,
                    "sequence_type": sequence_type,
                    "beginning": beginning,
                    "end": end,
                }
                for sequence_type, beginning, end in sequences
            )

    # Passing the column names explicitly keeps the schema stable when no
    # sequence was found, so the duration computation below cannot fail.
    df_sequences = pd.DataFrame(rows, columns=columns)
    df_sequences["duration"] = df_sequences["end"] - df_sequences["beginning"]
    return df_sequences
def time_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame: def time_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:

File diff suppressed because one or more lines are too long