partially resolved the grouping issue

screen_sequences
Ivan Kobe 2021-09-22 16:34:33 +02:00
parent 53df652d02
commit 47ecd4bc02
4 changed files with 1112 additions and 761 deletions

View File

@ -356,7 +356,16 @@ class Proximity(Base, AWAREsensor):
class Screen(Base, AWAREsensor):
    """
    Contains the screen sensor information.

    Attributes
    ----------
    screen_status: int
        Screen status (0 off, 1 on, 2 locked, 3 unlocked)
    """

    # AWARE screen event code stored as a small integer; the meaning of each
    # value is given in the class docstring above.
    screen_status = Column(SmallInteger)
class SMS(Base, AWAREsensor):

4
features/.vscode/settings.json vendored 100644
View File

@ -0,0 +1,4 @@
{
"python.linting.enabled": true,
"python.formatting.provider": "autopep8"
}

View File

@ -4,7 +4,7 @@ import pandas as pd
import numpy as np
import re
from datetime import *
from typing import Tuple, ValuesView
from config.models import Participant, Screen
from setup import db_engine, session
@ -36,27 +36,67 @@ def get_screen_data(usernames: Collection) -> pd.DataFrame:
return df_screen
def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
def identify_screen_sequence(df_screen: pd.DataFrame, grouping: bool = False) -> pd.DataFrame:
"""
Identify interesting sequences (unlock, status check) and return them in a dataframe.
Identifies interesting sequences (unlock, status check) and returns them in a dataframe.
Transform the grouping of screen events (by day, hour...) into a grouping of sequences.
Parameters
----------
df_screen: pd.DataFrame
A dataframe containing screen data
A dataframe containing screen data and a column "group".
N.B.: the values in the column "group" must be of a comparable type (e.g. int, datetime.date etc.)
grouping:
A boolean value indicating whether the input df contains the column "group".
Returns
-------
df_sequences: pd.DataFrame
df_sequences: pd.DataFrame:
A dataframe containing information on screen sequences
Columns:
* participant_id
* device_id
* sequence_type: unlock/check
* beginning: beginning of unlock/check in miliseconds since 1970
* end: end of unlock/check in miliseconds since 1970
* duration
- participant_id
- device_id
- seq_id: an unique id assigned to each sequence
- sequence_type: unlock/check
- group: the group to which the sequence belongs, i.e. the timespan during which it has
occured. Note that in the case that it spans over a longer period of time,
the same sequence is assigned to multiple groups
- beginning_abs: beginning of unlock/check [ms since 1970]
- end_abs: end of unlock/check in [ms since 1970]
- duration_abs [ms]
- beginning_rel: beginning of a sequence relative to the group [ms since 1970]
- end_rel [ms since 1970]
- duration_rel [ms since 1970]
Legend
------
- 0: off
- 1: on
- 2: locked
- 3: unlocked
Grouping
--------
If the screen events of the input df are assigned a time structure, the identified sequences should also
be. If all of the screen events constituting a sequence are in the same group, assigning the sequence to
a group is trivial - it should be the group the events belong to. If, on the other hand, the events are
spread over several groups, the situation is trickier. As of the moment, the procedure is implemented as follows:
The relative beginning (relative to a certain group, i.e. timespan) is defined as the timestamp of the
first event belonging to the group in question. Relative end and relative duration are defined in a
similar fashion. This is, however, not optimal. We would namely wish, e.g., that the relative durations
of a given sequence would sum up to its absolute duration which is not yet the case.
TODO In order to achieve this, we would need to be given more information on the groups. The current
TODO constraint on the groups is only that they be comparable. This is insufficient since it is
TODO impossible to infer:
TODO - how many and which groups lie between two given groups
TODO - when in time does a certain group begin and when does it end
TODO In order to mitigate these issues, we would need to be given a complete list of groups and the
TODO groups should have the form of an interval (beginning, end).
In fact, under the presupposition that we will always be working with relatively big dataframes,
the requirement of being given a complete list of groups becomes unnecessary.
cf* the highlighted comment below.
Heuristics
----------
@ -81,57 +121,42 @@ def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
The occurrence of two consecutive "locked" events with no intermediate "unlocked" event
is an inconsistency, however due to its frequency it has to be dealt with in some way.
Since the time interval between the last two events of this sequence is commonly very
short (around 30ms), the 2 at the end should be interpreted as part of the screen-check
sequence.
short (around 30ms), the 2 at the end should be interpreted as part of the SCREEN-CHECK
SEQUENCE.
ii) (2|0)130102
This sequence is interpreted as a nested screen-check sequence (010) inside
a unlock sequence ((2|0)1302). Since the time interval between 0 and 1 is very
an unlock sequence ((2|0)1302). Since the time interval between 0 and 1 is very
short (the device hasn't even had the time to lock), we say that 010 does not constitute a
proper check sequence and we therefore interpret the whole sequence as an unlock sequence.
TODO: the function time_screen_sequence returns some weird values. For example, the average check time of
participant nr. 74 is several minutes and the real usage time percentage of participant nr. 78 is about 50%.
proper check sequence and we therefore interpret the whole sequence as an UNLOCK SEQUENCE.
"""
# If the time structure of sequences is not of interest, all events should be assigned to the same group
if not grouping:
df_screen["group"] = 0
df_screen.sort_values(["device_id", "timestamp"], inplace=True)
groups = df_screen.groupby("device_id")
# Create a df containing for each device a row with the following columns:
# - participant_id: the id of the participant the device belongs to
# - screen_status:
# a string representing the sequence of screen events
# in chronological order, e.g. "01301302130202130..."
# - timestamp:
# a list of timestamps of screen events
# in chronological order, e.g. [1581933295955, 1581933741144, ...]
# - group:
# a list of groups to which the screen events
# belong, again, in chronological order
# Create a df containing, for each device, a string representing the sequence of
# screen events in chronological order, e.g. "01301302130202130..."
df_screen_sequences = (
groups["screen_status"]
.apply(list)
.apply(lambda list_: "".join([str(x) for x in list_]))
.to_frame()
df_sequences_timestamps_groups = (
df_screen.groupby(["device_id", "participant_id"])
.agg({
"screen_status": lambda list_: "".join([str(x) for x in list_]),
"timestamp": list,
"group": list})
.reset_index()
)
# Create a df containing, for each device, a list of timestamps of screen events
# in chronological order, e.g. [1581933295955, 1581933741144, ...]
df_timestamps = (
groups["timestamp"]
.apply(list)
.to_frame()
.reset_index()
)
# Create a df containing information to which participant the devices belong
df_participants = (
df_screen[["device_id", "participant_id"]]
.drop_duplicates()
.reset_index()
.drop("index", 1)
)
df_merged = (
df_screen_sequences.merge(df_timestamps, on="device_id")
.merge(df_participants, on="device_id")
)
# Regex patterns implementing the heuristics described in the docstring.
# Since the matching sequences can overlap, lookahead is used. Note that
# the first event in a sequence isn't part of the group caught inside the
@ -139,142 +164,192 @@ def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
# event of the previous sequence, so that the time interval between the first
# and the second event in a sequence is actually the time the device is not in use.
unlock_pat = re.compile(r"(?=[0,2]((13|31)(0+|010)2))")
check_pat = re.compile(r"(?=[0,2](10+))")
unlock_pat = re.compile(
# Begin the lookahead group. Inside the lookahead group,
# first match either a 0 or a 2
r"(?=[0,2]"
# Begin the 1st capturing group, this is the one we are interested in.
# Match either a 13 or a 31.
r"((13|31)"
# Match either a (nonzero) sequence of consecutive 0s or a 010. Than, match
# a two. End the 1st capturing group. End the lookahead group.
r"(0+|010)2))"
)
check_pat = re.compile(
# Begin the lookahead group. Inside the lookahead group,
# first match either a 0 or a 2
r"(?=[0,2]"
# Begin the 1st capturing group. Capture a 1 succeeded by several 0s.
# End the 1st capturing group. End the lookahead group.
r"(10+))"
)
# Enumerate the groups based on increasing order in order to make iteration easier.
# //! N.B.: this is also a possible way to ease the constraint on the groups
# //! discussed in the docstring under "Grouping". Namely, when working with
# //! reasonably big dataframes, it can be confidently expected that for each
# //! group we are interested in there will be at least one screen event assigned
# //! to it. In this case, the following procedure will extract the complete list of groups.
def enumerate_groups(df: pd.DataFrame) -> Tuple[dict, dict]:
    """
    Enumerate the distinct values of the "group" column in increasing order.

    Parameters
    ----------
    df: pd.DataFrame
        A dataframe with a "group" column whose values are mutually
        comparable (sorting them raises TypeError otherwise).

    Returns
    -------
    (group_dict, inv_group_dict): Tuple[dict, dict]
        group_dict maps consecutive ids (0, 1, ...) to group values;
        inv_group_dict is its inverse (group value -> id).
    """
    # sorted() accepts any iterable: deduplicate with set() and sort directly,
    # without the redundant .tolist()/list() round-trips.
    groups_list = sorted(set(df["group"]))
    group_dict = dict(enumerate(groups_list))
    # Invert the enumeration with a dict comprehension instead of
    # materialising an intermediate list of swapped pairs.
    inv_group_dict = {grp: idx for idx, grp in group_dict.items()}
    return group_dict, inv_group_dict
# Try to sort and enumerate groups and raise an error if impossible
try:
group_dict, inv_group_dict = enumerate_groups(df_screen)
except TypeError as e:
raise e("Values in the column 'group' must be of a comparable type")
# Iterate over rows of the merged df and then for each row iterate over
# regex matches. For each match, create a dictionary containing information
# on the matched sequence and append it to the list of rows. Lastly, create
# a new dataframe from the list of rows and return it.
seq_id = 0
def identify_seq(regexp: re.Pattern, label: str) -> list:
"""
Iterates over rows of df_sequences_timestamps_groups, then, for each group
iterates over regex matches. For each regex match, i.e. for each identified
sequence, iterates over the groups over which the sequence spans (*cf docstring).
"""
nonlocal seq_id
rows_list = list()
for index, row in df_sequences_timestamps_groups.iterrows():
for match in regexp.finditer(row["screen_status"]):
beginning_index_abs, end_index_abs = match.start(1), match.end(1)
groups = set(row["group"][beginning_index_abs : end_index_abs])
group_ids = {inv_group_dict[grp] for grp in groups}
# TODO Here's part of the problem: the span of relevant groups consists solely
# TODO of those to which at least one screen event is assigned (in the whole df)
span = range(min(group_ids), max(group_ids) + 1)
for grp_id in span:
grp = group_dict[grp_id]
grp_indices = [
index for index
in range(beginning_index_abs, end_index_abs)
if row["group"][index] == grp
]
# TODO Here, we face the converse problem. It may happen that a sequence in fact
# TODO does span over a certain group although none of the events that constitute
# TODO it are assigned to it. In this case, there is no way to calculate the relative
# TODO beginning and end (which should just be the beginning and end of the group).
try:
beginning_index_rel, end_index_rel = min(grp_indices), max(grp_indices)
beginning_rel = row["timestamp"][beginning_index_rel]
end_rel = row["timestamp"][end_index_rel]
except ValueError:
beginning_rel = end_rel = pd.NA
beginning_abs = row["timestamp"][beginning_index_abs]
end_abs = row["timestamp"][end_index_abs - 1]
new_row_dict = {
"participant_id": row["participant_id"],
"device_id": row["device_id"],
"seq_id": seq_id,
"sequence_type": label,
"group": grp,
"beginning_abs": beginning_abs,
"end_abs": end_abs,
"duration_abs": end_abs - beginning_abs,
"beginning_rel": beginning_rel,
"end_rel": end_rel,
"duration_rel": end_rel - beginning_rel
}
rows_list.append(new_row_dict)
seq_id += 1
return rows_list
rows_list = list()
for index, row in df_merged.iterrows():
for match in unlock_pat.finditer(row["screen_status"]):
beginning = row["timestamp"][match.start(1)]
end = row["timestamp"][match.end(1) - 1]
new_row_dict = {
"participant_id": row["participant_id"],
"device_id": row["device_id"],
"sequence_type": "unlock",
"beginning": beginning,
"end": end
}
rows_list.append(new_row_dict)
for match in check_pat.finditer(row["screen_status"]):
beginning = row["timestamp"][match.start(1)]
end = row["timestamp"][match.end(1) - 1]
new_row_dict = {
"participant_id": row["participant_id"],
"device_id": row["device_id"],
"sequence_type": "check",
"beginning": beginning,
"end": end
}
rows_list.append(new_row_dict)
df_sequences = pd.DataFrame(rows_list)
df_sequences["duration"] = df_sequences["end"] - df_sequences["beginning"]
rows_unlock = identify_seq(unlock_pat, "unlock")
rows_check = identify_seq(check_pat, "check")
df_sequences = pd.DataFrame(rows_unlock + rows_check)
return df_sequences
def time_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
"""
Calculates time statistics related to device usage.
# def time_screen_sequence(df_screen: pd.DataFrame, groupby: str = "date") -> pd.DataFrame:
# """
# Calculates time statistics related to device usage.
Parameters
----------
df_screen: pd.DataFrame
A dataframe containing screen data
Returns
-------
A new dataframe indexed by device_id and participant_id containing the following columns:
* total_usage_time: sum of daily timespans between the last and
the first event reported by the screen sensor measured in milliseconds
* real_usage_time: duration of time during which the device was actually in use,
i.e. the total duration of sequences identified by the function identify_screen_sequence
* real_usage_time_percentage: real_usage_time / total_usage_time
* average_time_between_unlocks
* average_time_between_checks
* average_check_duration
* average_unlock_duration
"""
# Parameters
# ----------
# df_screen: pd.DataFrame
# A dataframe containing screen data
sequences_df = identify_screen_sequence(df_screen)
# Returns
# -------
# A new dataframe indexed by device_id and participant_id containing the following columns:
# - total_usage_time: sum of daily timespans between the last and
# the first event reported by the screen sensor measured in milliseconds
# - real_usage_time: duration of time during which the device was actually in use,
# i.e. the total duration of sequences identified by the function identify_screen_sequence
# - real_usage_time_percentage: real_usage_time / total_usage_time
# - average_time_between_unlocks
# - average_time_between_checks
# - average_check_duration
# - average_unlock_duration
# """
# Fit timestamps to dates
sequences_df["date"] = pd.to_datetime(sequences_df.beginning, unit="ms").dt.date
# sequences_df = identify_screen_sequence(df_screen)
# Calculate total_usage_time, real_usage_time and real_usage_time_percentage
usage_time_df = (
sequences_df.groupby(["device_id","participant_id", "date"])
[["beginning", "end", "duration"]]
.agg({"beginning":"min", "end":"max", "duration":"sum"})
.assign(
total_usage_time=lambda x: x.end - x.beginning
)
.drop(["beginning", "end"], axis=1)
.rename(columns={"duration":"real_usage_time"})
.groupby(["device_id", "participant_id"])
.agg({"real_usage_time":"sum","total_usage_time":"sum"})
.assign(
real_usage_time_percentage=lambda x: x.real_usage_time / x.total_usage_time
)
)
# Calculate time_between_unlocks
time_between_unlocks_df = (
sequences_df[sequences_df["sequence_type"] == "unlock"]
.sort_values(["participant_id", "device_id", "beginning"])
)
time_between_unlocks_df = (
time_between_unlocks_df
.assign(end_ = time_between_unlocks_df.groupby("device_id")["end"].shift(1))
.assign(time_between_unlocks=lambda x: x.beginning - x.end_)
.groupby(["device_id", "participant_id"])
.agg({"time_between_unlocks":"mean"})
.rename(columns={"time_between_unlocks":"average_time_between_unlocks"})
)
# Calculate time_between_checks
time_between_checks_df = (
sequences_df[sequences_df["sequence_type"] == "check"]
.sort_values(["participant_id", "device_id", "beginning"])
)
time_between_checks_df = (
time_between_checks_df
.assign(end_ = time_between_checks_df.groupby("device_id")["end"].shift(1))
.assign(time_between_checks=lambda x: x.beginning - x.end_)
.groupby(["device_id", "participant_id"])
.agg({"time_between_checks":"mean"})
.rename(columns={"time_between_checks":"average_time_between_checks"})
)
# # Calculate the date of the beginning and of the end of a sequence.
# # Drop those sequences which span over several days.
# sequences_df["date_beginning"] = pd.to_datetime(
# sequences_df.beginning, unit="ms").dt.date
# Calculate average_check_time and average_unlock_time
average_duration_df = (
sequences_df
.groupby(["device_id", "participant_id", "sequence_type"])
.agg(
{"duration": (lambda x: int(np.mean(x)))}
)
.unstack()
)
# Merge the four newly created dataframes
merged = usage_time_df.merge(
time_between_unlocks_df,
on=["device_id", "participant_id"]
).merge(
time_between_checks_df,
on=["device_id", "participant_id"]
).merge(
average_duration_df,
on=["device_id", "participant_id"]
).rename(
columns={
("duration","unlock"):"average_unlock_duration",
("duration","check"):"average_check_duration"
}
)
# sequences_df["date_end"] = pd.to_datetime(
# sequences_df.end, unit="ms").dt.date
return merged
# sequences_df = (
# sequences_df
# [sequences_df["date_beginning"] == sequences_df["date_end"]]
# .drop(columns=["date_end"])
# .rename(columns={"date_beginning":"date"})
# )
# # Calculate the time the device was in use
# usage_time_df = (
# sequences_df.groupby(["sequence_type", "participant_id", "device_id", "date"])
# .agg({"duration":"sum"})
# .apply(lambda x: x//1000, "columns")
# .rename(columns={"duration":"usage_time"})
# )
# # Calculate the average time between sequences
# average_timedelta_df = (
# sequences_df.sort_values("beginning")
# .groupby(["sequence_type", "participant_id", "device_id", "date"])
# .apply(
# lambda grp:
# grp.assign(end_shifted = grp["end"].shift(1))
# )
# .drop(columns=["participant_id", "device_id", "sequence_type", "date"])
# .droplevel(-1)
# .assign(average_timedelta = lambda x: x.beginning - x.end_shifted)
# .groupby(["sequence_type", "participant_id", "device_id", "date"])
# .agg({"average_timedelta": lambda x: np.mean(x)//1000})
# )
# # Calculate the average duration of sequences
# average_duration_df = (
# sequences_df
# .groupby(["sequence_type", "participant_id", "device_id", "date"])
# .agg({"duration": (lambda x: np.mean(x)//1000)})
# .rename(columns={"duration":"average_duration"})
# )
# # Merge into a single dataframe
# merged = pd.merge(
# pd.merge(usage_time_df, average_timedelta_df, left_index=True, right_index=True),
# average_duration_df,
# left_index=True,
# right_index=True
# )
# return merged

File diff suppressed because one or more lines are too long