defined func identify_screen_sequences
parent
e2e268148d
commit
cf7e692927
|
@ -19,4 +19,4 @@ dependencies:
|
|||
- scikit-learn
|
||||
- sqlalchemy
|
||||
- statsmodels
|
||||
- tabulate
|
||||
- tabulate
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.11.2
|
||||
# jupytext_version: 1.11.5
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
|
@ -38,6 +38,12 @@ print(df_screen_nokia)
|
|||
participants_inactive_usernames = participants.query_db.get_usernames()
|
||||
df_screen_inactive = get_screen_data(participants_inactive_usernames)
|
||||
|
||||
# %%
|
||||
df_screen_inactive.head(60)
|
||||
|
||||
# %%
|
||||
df_screen_inactive.to_csv(r'/home/ivan/IJS/logs/screen_data_offline.csv')
|
||||
|
||||
# %%
|
||||
df_screen_inactive["screen_status"] = (
|
||||
df_screen_inactive["screen_status"]
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
from collections.abc import Collection
|
||||
|
||||
import pandas as pd
|
||||
import re
|
||||
|
||||
from config.models import Participant, Screen
|
||||
from setup import db_engine, session
|
||||
|
@ -33,17 +34,139 @@ def get_screen_data(usernames: Collection) -> pd.DataFrame:
|
|||
|
||||
|
||||
def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
    """
    Identify interesting sequences (unlock, status check) and return them in a dataframe.

    The input dataframe is not modified; a sorted copy is used internally.

    Parameters
    ----------
    df_screen: pd.DataFrame
        A dataframe containing screen data. The columns read here are
        participant_id, device_id, timestamp and screen_status.
        NOTE(review): the matching below assumes every screen_status value
        renders as a single character (0, 1, 2 or 3), so that a position in
        the status string is also a position in the list of timestamps.

    Returns
    -------
    df_sequences: pd.DataFrame
        A dataframe containing information on screen sequences.
        If no sequence is found, an empty dataframe with the same columns
        is returned.

        Columns:
            * participant_id
            * device_id
            * sequence_type: unlock/check
            * beginning: beginning of unlock/check in milliseconds since 1970
            * end: end of unlock/check in milliseconds since 1970
            * duration: end - beginning

    Heuristics
    ----------
    1) In the category of unlock sequences, the following sequences were counted:
        i) 0130(0...)2
            This is the paradigmatic case. It is allowed for the screen status 0 (off)
            to be reported multiple times in a row.
        ii) 21302
            If the previous sequence has ended with the screen status 2 (e.g. unlock),
            the unlock sequence does not start with a 0 but rather with a 2.
        iii) (0|2)3102
            It is allowed for the order of 3 and 1 to be reversed. If the device is
            unlocked e.g. with a fingerprint-reader, it can happen that the unlock
            precedes the ON status.
    2) In the category of screen-check sequences, the following sequences were counted:
        i) 010
            The base case.
        ii) 210
            Refer to point 1) ii).
    3) Special cases:
        i) (2|0)102
            The occurrence of two consecutive "locked" events with no intermediate
            "unlocked" event is an inconsistency, however due to its frequency it has
            to be dealt with in some way. Since the time interval between the last two
            events of this sequence is commonly very short (around 30ms), the 2 at the
            end should be interpreted as part of the screen-check sequence.
        ii) (2|0)130102
            This sequence is interpreted as a nested screen-check sequence (010) inside
            an unlock sequence ((2|0)1302).
    """
    columns = ["participant_id", "device_id", "sequence_type", "beginning", "end"]

    # Regex patterns implementing the heuristics described in the docstring.
    # Since the matching sequences can overlap, lookahead is used. Note that
    # the first event in a sequence isn't part of the group caught inside the
    # lookahead. That's because the first event in a sequence is also the last
    # event of the previous sequence, so that the time interval between the first
    # and the second event in a sequence is actually the time the device is not in use.
    patterns = (
        ("unlock", re.compile(r"(?=[02]((13|31)0+2))")),
        ("check", re.compile(r"(?=[02](10+2?))")),
    )

    # Sort a copy so that the caller's dataframe keeps its row order.
    df_sorted = df_screen.sort_values(["device_id", "timestamp"])

    rows_list = []
    for device_id, df_device in df_sorted.groupby("device_id"):
        # A string representing the device's screen events in chronological
        # order, e.g. "01301302130202130...", and the matching timestamps,
        # e.g. [1581933295955, 1581933741144, ...].
        statuses = "".join(str(status) for status in df_device["screen_status"])
        timestamps = df_device["timestamp"].tolist()

        # One set of rows per (device, participant) pair found in the data.
        for participant_id in df_device["participant_id"].drop_duplicates():
            for sequence_type, pattern in patterns:
                for match in pattern.finditer(statuses):
                    rows_list.append(
                        {
                            "participant_id": participant_id,
                            "device_id": device_id,
                            "sequence_type": sequence_type,
                            # Group 1 excludes the leading 0/2, see above.
                            "beginning": timestamps[match.start(1)],
                            "end": timestamps[match.end(1) - 1],
                        }
                    )

    # Passing the columns explicitly keeps the schema stable even when no
    # sequence was matched and rows_list is empty.
    df_sequences = pd.DataFrame(rows_list, columns=columns)
    df_sequences["duration"] = df_sequences["end"] - df_sequences["beginning"]

    return df_sequences
|
||||
|
||||
|
||||
def time_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
|
||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue