Compare commits

...

4 Commits

Author SHA1 Message Date
Primoz 0a98bab78d Ignore existing data. 2023-02-10 11:04:46 +01:00
Ivan Kobe 47ecd4bc02 partially resolved the grouping issue 2021-09-22 16:34:33 +02:00
Ivan Kobe 53df652d02 implemented time_screen_sequence 2021-09-13 11:47:19 +02:00
Ivan Kobe cf7e692927 defined func identify_screen_sequences 2021-09-10 16:47:37 +02:00
7 changed files with 1706 additions and 20 deletions

1
.gitignore vendored
View File

@ -5,3 +5,4 @@ __pycache__/
/exploration/*.ipynb
/config/*.ipynb
/statistical_analysis/*.ipynb
data/*

View File

@ -19,4 +19,4 @@ dependencies:
- scikit-learn
- sqlalchemy
- statsmodels
- tabulate
- tabulate

View File

@ -356,7 +356,16 @@ class Proximity(Base, AWAREsensor):
class Screen(Base, AWAREsensor):
    """
    Contains the screen sensor information.

    Attributes
    ----------
    screen_status: int
        Screen status (0 off, 1 on, 2 locked, 3 unlocked)
    """

    # Small integer column; see the legend in the class docstring for the meaning
    # of each status value.
    screen_status = Column(SmallInteger)
class SMS(Base, AWAREsensor):

View File

@ -6,7 +6,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.11.2
# jupytext_version: 1.11.5
# kernelspec:
# display_name: straw2analysis
# language: python
@ -38,6 +38,12 @@ print(df_screen_nokia)
participants_inactive_usernames = participants.query_db.get_usernames()
df_screen_inactive = get_screen_data(participants_inactive_usernames)
# %%
df_screen_inactive.head(60)
# %%
df_screen_inactive.to_csv(r'/home/ivan/IJS/logs/screen_data_offline.csv')
# %%
df_screen_inactive["screen_status"] = (
df_screen_inactive["screen_status"]

4
features/.vscode/settings.json vendored 100644
View File

@ -0,0 +1,4 @@
{
"python.linting.enabled": true,
"python.formatting.provider": "autopep8"
}

View File

@ -1,6 +1,10 @@
from collections.abc import Collection
import pandas as pd
import numpy as np
import re
from typing import Tuple, ValuesView
from config.models import Participant, Screen
from setup import db_engine, session
@ -32,23 +36,320 @@ def get_screen_data(usernames: Collection) -> pd.DataFrame:
return df_screen
def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
    """Identify "interesting" sequences of screen statuses (not yet implemented)."""
    # TODO Implement a method that identifies "interesting" sequences of screen statuses.
    # The main ones are:
    #   - OFF -> ON -> unlocked (a true phone unlock)
    #   - OFF -> ON -> OFF/locked (no unlocking, i.e. a screen status check)
    # Consider that screen data is sometimes unreliable as shown in expl_screen.ipynb:
    # "I have also seen
    # off -> on -> unlocked (with 2 - locked missing)
    # and
    # off -> locked -> on -> off -> locked (*again*)."
    # Either clean the data beforehand or deal with these inconsistencies in this function.
    pass
def identify_screen_sequence(df_screen: pd.DataFrame, grouping: bool = False) -> pd.DataFrame:
    """
    Identify interesting sequences (unlock, status check) and return them in a dataframe.

    Transforms the grouping of screen events (by day, hour, ...) into a grouping
    of sequences.

    Parameters
    ----------
    df_screen: pd.DataFrame
        A dataframe containing screen data and, when ``grouping`` is True, a
        column "group".
        N.B.: the values in the column "group" must be of a comparable type
        (e.g. int, datetime.date, etc.).
        NOTE(review): this dataframe is modified in place (it is sorted, and a
        "group" column is added when ``grouping`` is False).
    grouping: bool
        Whether the input dataframe already contains the column "group".

    Returns
    -------
    df_sequences: pd.DataFrame
        A dataframe containing information on screen sequences, one row per
        (sequence, group) pair, with columns:
            - participant_id
            - device_id
            - seq_id: a unique id assigned to each sequence
            - sequence_type: "unlock" or "check"
            - group: the group (timespan) to which the sequence belongs; a
              sequence spanning several groups is assigned to each of them
            - beginning_abs: beginning of the unlock/check [ms since 1970]
            - end_abs: end of the unlock/check [ms since 1970]
            - duration_abs [ms]
            - beginning_rel: beginning of the sequence relative to the group
              [ms since 1970]
            - end_rel [ms since 1970]
            - duration_rel [ms]

    Raises
    ------
    TypeError
        If the values in the column "group" are not mutually comparable.

    Legend
    ------
    Screen statuses: 0 = off, 1 = on, 2 = locked, 3 = unlocked.

    Grouping
    --------
    If the screen events of the input dataframe are assigned a time structure,
    the identified sequences should be as well. If all of the screen events
    constituting a sequence are in the same group, assigning the sequence to a
    group is trivial: it is the group the events belong to. Otherwise the
    situation is trickier. Currently the procedure is as follows: the relative
    beginning (relative to a certain group, i.e. timespan) is the timestamp of
    the first event belonging to the group in question; relative end and
    relative duration are defined similarly. This is not optimal: we would
    wish, e.g., that the relative durations of a sequence sum up to its
    absolute duration, which is not yet the case.
    TODO In order to achieve this, more information on the groups is needed.
    TODO The current constraint (comparability) makes it impossible to infer
    TODO how many and which groups lie between two given groups, or when in
    TODO time a group begins and ends. To mitigate this, a complete list of
    TODO interval-shaped groups (beginning, end) would be needed. Under the
    TODO presupposition of reasonably big dataframes, the complete list can be
    TODO recovered from the data itself (see the highlighted note below).

    Heuristics
    ----------
    1) Unlock sequences:
        i)   0130(0...)2 — the paradigmatic case; status 0 (off) may be
             reported multiple times in a row.
        ii)  21302 — if the previous sequence ended with status 2, the unlock
             sequence starts with a 2 rather than a 0.
        iii) (0|2)3102 — the order of 3 and 1 may be reversed; with e.g. a
             fingerprint reader the unlock can precede the ON status.
    2) Screen-check sequences:
        i)   010 — the base case.
        ii)  210 — cf. point 1) ii).
    3) Special cases:
        i)   (2|0)102 — two consecutive "locked" events with no intermediate
             "unlocked" event is an inconsistency, but frequent enough that it
             must be handled. Since the interval between the last two events is
             commonly very short (~30 ms), the trailing 2 is interpreted as
             part of the SCREEN-CHECK sequence.
        ii)  (2|0)130102 — interpreted as a nested screen-check (010) inside an
             unlock sequence ((2|0)1302). The interval between 0 and 1 is very
             short (the device has not even had time to lock), so 010 does not
             constitute a proper check sequence and the whole sequence is
             interpreted as an UNLOCK sequence.
    """
    # If the time structure of sequences is not of interest,
    # assign all events to the same group.
    if not grouping:
        df_screen["group"] = 0

    df_screen.sort_values(["device_id", "timestamp"], inplace=True)

    # Build a df with one row per device and the following columns:
    #   - participant_id: the id of the participant the device belongs to
    #   - screen_status: the chronological sequence of screen events as a
    #     string, e.g. "01301302130202130..."
    #   - timestamp: the chronological list of event timestamps,
    #     e.g. [1581933295955, 1581933741144, ...]
    #   - group: the chronological list of the groups the events belong to
    df_sequences_timestamps_groups = (
        df_screen.groupby(["device_id", "participant_id"])
        .agg(
            {
                "screen_status": lambda statuses: "".join(str(s) for s in statuses),
                "timestamp": list,
                "group": list,
            }
        )
        .reset_index()
    )

    # Regex patterns implementing the heuristics described in the docstring.
    # Since matching sequences can overlap, lookahead is used. The first event
    # of a sequence is deliberately left OUT of the capturing group: it is also
    # the last event of the previous sequence, so the interval between the
    # first and second events is actually time the device was not in use.
    unlock_pat = re.compile(
        # Inside the lookahead, first match either a 0 or a 2.
        # (The original class [0,2] contained a stray comma; harmless on a
        # digits-only string, but fixed here.)
        r"(?=[02]"
        # First capturing group (the one we are interested in):
        # match either 13 or 31 ...
        r"((13|31)"
        # ... followed by a nonempty run of 0s or by 010, then a 2.
        r"(0+|010)2))"
    )
    check_pat = re.compile(
        # Inside the lookahead, first match either a 0 or a 2 ...
        r"(?=[02]"
        # ... then capture a 1 followed by one or more 0s.
        r"(10+))"
    )

    # Enumerate the groups in increasing order to make iteration easier.
    # //! N.B.: this is also a possible way to ease the constraint on the
    # //! groups discussed in the docstring under "Grouping": with reasonably
    # //! big dataframes, each group of interest can be expected to contain at
    # //! least one screen event, so this recovers the complete list of groups.
    def enumerate_groups(df: pd.DataFrame) -> Tuple[dict, dict]:
        """Return (id -> group, group -> id) mappings over the sorted groups."""
        groups_sorted = sorted(set(df["group"].tolist()))
        group_dict = dict(enumerate(groups_sorted))
        inv_group_dict = {grp: grp_id for grp_id, grp in group_dict.items()}
        return group_dict, inv_group_dict

    # Try to sort and enumerate groups; fail clearly if that is impossible.
    try:
        group_dict, inv_group_dict = enumerate_groups(df_screen)
    except TypeError as e:
        # BUG FIX: the original `raise e("...")` called the caught exception
        # *instance*, which itself raises "TypeError: ... is not callable" and
        # loses the intended message. Raise a new TypeError chained to `e`.
        raise TypeError(
            "Values in the column 'group' must be of a comparable type"
        ) from e

    # Iterate over rows of the merged df, then for each row over regex matches.
    # For each match, build one dict per spanned group and collect them; the
    # final dataframe is assembled from these dicts.
    seq_id = 0

    def identify_seq(regexp: re.Pattern, label: str) -> list:
        """
        Iterate over rows of df_sequences_timestamps_groups; for each row,
        iterate over regex matches (identified sequences); for each match,
        iterate over the groups the sequence spans (cf. docstring) and emit
        one output row per group.
        """
        nonlocal seq_id
        rows_list = []
        for _, row in df_sequences_timestamps_groups.iterrows():
            for match in regexp.finditer(row["screen_status"]):
                beginning_index_abs, end_index_abs = match.start(1), match.end(1)
                groups = set(row["group"][beginning_index_abs:end_index_abs])
                group_ids = {inv_group_dict[grp] for grp in groups}
                # TODO Here's part of the problem: the span of relevant groups
                # TODO consists solely of those to which at least one screen
                # TODO event is assigned (in the whole df).
                span = range(min(group_ids), max(group_ids) + 1)
                for grp_id in span:
                    grp = group_dict[grp_id]
                    grp_indices = [
                        i
                        for i in range(beginning_index_abs, end_index_abs)
                        if row["group"][i] == grp
                    ]
                    # TODO Converse problem: a sequence may in fact span a
                    # TODO group although none of its events are assigned to
                    # TODO it; then the relative beginning/end (which should be
                    # TODO the group's bounds) cannot be calculated.
                    try:
                        beginning_index_rel = min(grp_indices)
                        end_index_rel = max(grp_indices)
                        beginning_rel = row["timestamp"][beginning_index_rel]
                        end_rel = row["timestamp"][end_index_rel]
                    except ValueError:
                        # min()/max() of an empty list: no event of this
                        # sequence falls into grp.
                        beginning_rel = end_rel = pd.NA
                    beginning_abs = row["timestamp"][beginning_index_abs]
                    end_abs = row["timestamp"][end_index_abs - 1]
                    rows_list.append(
                        {
                            "participant_id": row["participant_id"],
                            "device_id": row["device_id"],
                            "seq_id": seq_id,
                            "sequence_type": label,
                            "group": grp,
                            "beginning_abs": beginning_abs,
                            "end_abs": end_abs,
                            "duration_abs": end_abs - beginning_abs,
                            "beginning_rel": beginning_rel,
                            "end_rel": end_rel,
                            "duration_rel": end_rel - beginning_rel,
                        }
                    )
                seq_id += 1
        return rows_list

    rows_unlock = identify_seq(unlock_pat, "unlock")
    rows_check = identify_seq(check_pat, "check")
    df_sequences = pd.DataFrame(rows_unlock + rows_check)
    return df_sequences
def time_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
    """Calculate time statistics related to screen transitions (not yet implemented)."""
    # TODO Use the results of identify_screen_sequence to calculate time statistics related to
    # transitions. For example, from the two main sequences outlined above, the time of "real"
    # phone usage can be calculated, i.e. how long the screen was unlocked.
    # Another example might be the average time between screen unlocks and/or screen status checks.
    pass
# def time_screen_sequence(df_screen: pd.DataFrame, groupby: str = "date") -> pd.DataFrame:
# """
# Calculates time statistics related to device usage.
# Parameters
# ----------
# df_screen: pd.DataFrame
# A dataframe containing screen data
# Returns
# -------
# A new dataframe indexed by device_id and participant_id containing the following columns:
# - total_usage_time: sum of daily timespans between the last and
# the first event reported by the screen sensor measured in milliseconds
# - real_usage_time: duration of time during which the device was actually in use,
# i.e. the total duration of sequences identified by the function identify_screen_sequence
# - real_usage_time_percentage: real_usage_time / total_usage_time
# - average_time_between_unlocks
# - average_time_between_checks
# - average_check_duration
# - average_unlock_duration
# """
# sequences_df = identify_screen_sequence(df_screen)
# # Calculate the date of the beginning and of the end of a sequence.
# # Drop those sequences which span over several days.
# sequences_df["date_beginning"] = pd.to_datetime(
# sequences_df.beginning, unit="ms").dt.date
# sequences_df["date_end"] = pd.to_datetime(
# sequences_df.end, unit="ms").dt.date
# sequences_df = (
# sequences_df
# [sequences_df["date_beginning"] == sequences_df["date_end"]]
# .drop(columns=["date_end"])
# .rename(columns={"date_beginning":"date"})
# )
# # Calculate the time the device was in use
# usage_time_df = (
# sequences_df.groupby(["sequence_type", "participant_id", "device_id", "date"])
# .agg({"duration":"sum"})
# .apply(lambda x: x//1000, "columns")
# .rename(columns={"duration":"usage_time"})
# )
# # Calculate the average time between sequences
# average_timedelta_df = (
# sequences_df.sort_values("beginning")
# .groupby(["sequence_type", "participant_id", "device_id", "date"])
# .apply(
# lambda grp:
# grp.assign(end_shifted = grp["end"].shift(1))
# )
# .drop(columns=["participant_id", "device_id", "sequence_type", "date"])
# .droplevel(-1)
# .assign(average_timedelta = lambda x: x.beginning - x.end_shifted)
# .groupby(["sequence_type", "participant_id", "device_id", "date"])
# .agg({"average_timedelta": lambda x: np.mean(x)//1000})
# )
# # Calculate the average duration of sequences
# average_duration_df = (
# sequences_df
# .groupby(["sequence_type", "participant_id", "device_id", "date"])
# .agg({"duration": (lambda x: np.mean(x)//1000)})
# .rename(columns={"duration":"average_duration"})
# )
# # Merge into a single dataframe
# merged = pd.merge(
# pd.merge(usage_time_df, average_timedelta_df, left_index=True, right_index=True),
# average_duration_df,
# left_index=True,
# right_index=True
# )
# return merged

File diff suppressed because one or more lines are too long