defined func identify_screen_sequences
parent
e2e268148d
commit
cf7e692927
|
@ -19,4 +19,4 @@ dependencies:
|
||||||
- scikit-learn
|
- scikit-learn
|
||||||
- sqlalchemy
|
- sqlalchemy
|
||||||
- statsmodels
|
- statsmodels
|
||||||
- tabulate
|
- tabulate
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
# extension: .py
|
# extension: .py
|
||||||
# format_name: percent
|
# format_name: percent
|
||||||
# format_version: '1.3'
|
# format_version: '1.3'
|
||||||
# jupytext_version: 1.11.2
|
# jupytext_version: 1.11.5
|
||||||
# kernelspec:
|
# kernelspec:
|
||||||
# display_name: straw2analysis
|
# display_name: straw2analysis
|
||||||
# language: python
|
# language: python
|
||||||
|
@ -38,6 +38,12 @@ print(df_screen_nokia)
|
||||||
participants_inactive_usernames = participants.query_db.get_usernames()
|
participants_inactive_usernames = participants.query_db.get_usernames()
|
||||||
df_screen_inactive = get_screen_data(participants_inactive_usernames)
|
df_screen_inactive = get_screen_data(participants_inactive_usernames)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
df_screen_inactive.head(60)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
df_screen_inactive.to_csv(r'/home/ivan/IJS/logs/screen_data_offline.csv')
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_screen_inactive["screen_status"] = (
|
df_screen_inactive["screen_status"] = (
|
||||||
df_screen_inactive["screen_status"]
|
df_screen_inactive["screen_status"]
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
from collections.abc import Collection
|
from collections.abc import Collection
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import re
|
||||||
|
|
||||||
from config.models import Participant, Screen
|
from config.models import Participant, Screen
|
||||||
from setup import db_engine, session
|
from setup import db_engine, session
|
||||||
|
@ -33,17 +34,139 @@ def get_screen_data(usernames: Collection) -> pd.DataFrame:
|
||||||
|
|
||||||
|
|
||||||
def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
    """
    Identify interesting sequences (unlock, status check) and return them in a dataframe.

    The input dataframe is not modified: sorting is done on a copy.

    Parameters
    ----------
    df_screen: pd.DataFrame
        A dataframe containing screen data. The columns participant_id, device_id,
        timestamp and screen_status are read.
        NOTE(review): matching assumes that every screen_status value renders as a
        single character (e.g. integers 0-3), so that a position in the status string
        is also a position in the list of timestamps - confirm against the data source.

    Returns
    -------
    df_sequences: pd.DataFrame
        A dataframe containing information on screen sequences. If no sequence is
        found, an empty dataframe with the same columns is returned.

        Columns:
        * participant_id
        * device_id
        * sequence_type: unlock/check
        * beginning: beginning of unlock/check in milliseconds since 1970
        * end: end of unlock/check in milliseconds since 1970
        * duration

    Heuristics
    ----------
    Screen data is sometimes unreliable (events can be missing or repeated),
    so the following heuristics are used.

    1) In the category of unlock sequences, the following sequences were counted:
        i) 0130(0...)2
            This is the paradigmatic case. It is allowed for the screen status 0 (off)
            to be reported multiple times in a row.
        ii) 21302
            If the previous sequence has ended with the screen status 2 (e.g. unlock),
            the unlock sequence does not start with a 0 but rather with a 2.
        iii) (0|2)3102
            It is allowed for the order of 3 and 1 to be reversed. If the device is
            unlocked e.g. with a fingerprint-reader, it can happen that the unlock
            precedes the ON status.
    2) In the category of screen-check sequences, the following sequences were counted:
        i) 010
            The base case.
        ii) 210
            Refer to point 1) ii).
    3) Special cases:
        i) (2|0)102
            The occurrence of two consecutive "locked" events with no intermediate
            "unlocked" event is an inconsistency, however due to its frequency it has
            to be dealt with in some way. Since the time interval between the last two
            events of this sequence is commonly very short (around 30ms), the 2 at the
            end should be interpreted as part of the screen-check sequence.
        ii) (2|0)130102
            This sequence is interpreted as a nested screen-check sequence (010) inside
            an unlock sequence ((2|0)1302).
    """
    result_columns = [
        "participant_id",
        "device_id",
        "sequence_type",
        "beginning",
        "end",
        "duration",
    ]

    # Nothing to group or match: return a correctly shaped, empty result.
    if df_screen.empty:
        return pd.DataFrame(columns=result_columns)

    # Sort a copy so that the caller's dataframe keeps its original row order.
    df_sorted = df_screen.sort_values(["device_id", "timestamp"])
    groups = df_sorted.groupby("device_id")

    # For each device, a string representing the sequence of screen events
    # in chronological order, e.g. "01301302130202130..."
    df_screen_sequences = (
        groups["screen_status"]
        .apply(lambda statuses: "".join(str(status) for status in statuses))
        .to_frame()
        .reset_index()
    )

    # For each device, a list of timestamps of screen events
    # in chronological order, e.g. [1581933295955, 1581933741144, ...]
    df_timestamps = groups["timestamp"].apply(list).to_frame().reset_index()

    # Information to which participant the devices belong.
    df_participants = (
        df_sorted[["device_id", "participant_id"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    df_merged = df_screen_sequences.merge(df_timestamps, on="device_id").merge(
        df_participants, on="device_id"
    )

    # Regex patterns implementing the heuristics described in the docstring.
    # Since the matching sequences can overlap, lookahead is used. Note that
    # the first event in a sequence isn't part of the group caught inside the
    # lookahead. That's because the first event in a sequence is also the last
    # event of the previous sequence, so that the time interval between the first
    # and the second event in a sequence is actually the time the device is not in use.
    sequence_patterns = (
        ("unlock", re.compile(r"(?=[02]((13|31)0+2))")),
        ("check", re.compile(r"(?=[02](10+2?))")),
    )

    # For each device, collect all unlock matches first and then all check
    # matches. Each match becomes one row of the resulting dataframe.
    rows_list = []
    for participant_id, device_id, statuses, timestamps in zip(
        df_merged["participant_id"],
        df_merged["device_id"],
        df_merged["screen_status"],
        df_merged["timestamp"],
    ):
        for sequence_type, pattern in sequence_patterns:
            for match in pattern.finditer(statuses):
                rows_list.append(
                    {
                        "participant_id": participant_id,
                        "device_id": device_id,
                        "sequence_type": sequence_type,
                        # Group 1 excludes the leading 0/2 event (see above).
                        "beginning": timestamps[match.start(1)],
                        "end": timestamps[match.end(1) - 1],
                    }
                )

    # Without any match there are no "beginning"/"end" columns to subtract.
    if not rows_list:
        return pd.DataFrame(columns=result_columns)

    df_sequences = pd.DataFrame(rows_list)
    df_sequences["duration"] = df_sequences["end"] - df_sequences["beginning"]

    return df_sequences
|
||||||
|
|
||||||
|
|
||||||
def time_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
|
def time_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue