partially resolved the grouping issue
parent
53df652d02
commit
47ecd4bc02
|
@ -356,9 +356,18 @@ class Proximity(Base, AWAREsensor):
|
|||
|
||||
|
||||
class Screen(Base, AWAREsensor):
|
||||
"""
|
||||
Contains the screen sensor information.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
screen_status: int
|
||||
Screen status (0 – off, 1 – on, 2 – locked, 3 – unlocked)
|
||||
"""
|
||||
screen_status = Column(SmallInteger)
|
||||
|
||||
|
||||
|
||||
class SMS(Base, AWAREsensor):
|
||||
"""
|
||||
Contains the messages sensor information.
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
{
|
||||
"python.linting.enabled": true,
|
||||
"python.formatting.provider": "autopep8"
|
||||
}
|
|
@ -4,7 +4,7 @@ import pandas as pd
|
|||
import numpy as np
|
||||
import re
|
||||
|
||||
from datetime import *
|
||||
from typing import Tuple, ValuesView
|
||||
|
||||
from config.models import Participant, Screen
|
||||
from setup import db_engine, session
|
||||
|
@ -36,27 +36,67 @@ def get_screen_data(usernames: Collection) -> pd.DataFrame:
|
|||
return df_screen
|
||||
|
||||
|
||||
def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
|
||||
def identify_screen_sequence(df_screen: pd.DataFrame, grouping: bool = False) -> pd.DataFrame:
|
||||
"""
|
||||
Identify interesting sequences (unlock, status check) and return them in a dataframe.
|
||||
Identifies interesting sequences (unlock, status check) and returns them in a dataframe.
|
||||
Transform the grouping of screen events (by day, hour...) into a grouping of sequences.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df_screen: pd.DataFrame
|
||||
A dataframe containing screen data
|
||||
A dataframe containing screen data and a column "group".
|
||||
N.B.: the values in the column "group" must be of a comparable type (e.g. int, datetime.date etc.)
|
||||
grouping:
|
||||
A boolean value indicating whether the input df contains the column "group".
|
||||
|
||||
Returns
|
||||
-------
|
||||
df_sequences: pd.DataFrame
|
||||
df_sequences: pd.DataFrame:
|
||||
A dataframe containing information on screen sequences
|
||||
|
||||
Columns:
|
||||
* participant_id
|
||||
* device_id
|
||||
* sequence_type: unlock/check
|
||||
* beginning: beginning of unlock/check in miliseconds since 1970
|
||||
* end: end of unlock/check in miliseconds since 1970
|
||||
* duration
|
||||
- participant_id
|
||||
- device_id
|
||||
- seq_id: a unique id assigned to each sequence
|
||||
- sequence_type: unlock/check
|
||||
- group: the group to which the sequence belongs, i.e. the timespan during which it has
|
||||
occurred. Note that in the case that it spans over a longer period of time,
|
||||
the same sequence is assigned to multiple groups
|
||||
- beginning_abs: beginning of unlock/check [ms since 1970]
|
||||
- end_abs: end of unlock/check [ms since 1970]
|
||||
- duration_abs [ms]
|
||||
- beginning_rel: beginning of a sequence relative to the group [ms since 1970]
|
||||
- end_rel [ms since 1970]
|
||||
- duration_rel [ms]
|
||||
|
||||
Legend
|
||||
------
|
||||
- 0: off
|
||||
- 1: on
|
||||
- 2: locked
|
||||
- 3: unlocked
|
||||
|
||||
Grouping
|
||||
--------
|
||||
If the screen events of the input df are assigned a time structure, the identified sequences should also
|
||||
be. If all of the screen events constituting a sequence are in the same group, assigning the sequence to
|
||||
a group is trivial - it should be the group the events belong to. If, on the other hand, the events are spread over several groups, the situation is
|
||||
trickier. As of the moment, the procedure is implemented as follows:
|
||||
The relative beginning (relative to a certain group, i.e. timespan) is defined as the timestamp of the
|
||||
first event belonging to the group in question. Relative end and relative duration are defined in a
|
||||
similar fashion. This is, however, not optimal. We would namely wish, e.g., that the relative durations
|
||||
of a given sequence would sum up to its absolute duration which is not yet the case.
|
||||
TODO In order to achieve this, we would need to be given more information on the groups. The current
|
||||
TODO constraint on the groups is only that they be comparable. This is insufficient since it is
|
||||
TODO impossible to infer:
|
||||
TODO - how many and which groups lie between two given groups
|
||||
TODO - when in time does a certain group begin and when does it end
|
||||
TODO In order to mitigate these issues, we would need to be given a complete list of groups and the
|
||||
TODO groups should have the form of an interval (beginning, end).
|
||||
|
||||
In fact, under the presupposition that we will always be working with relatively big dataframes,
|
||||
the requirement of being given a complete list of groups becomes unnecessary.
|
||||
Cf. the highlighted comment below.
|
||||
|
||||
Heuristics
|
||||
----------
|
||||
|
@ -81,57 +121,42 @@ def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
|
|||
The occurrence of two consecutive "locked" events with no intermediate "unlocked" event
|
||||
is an inconsistency, however due to its frequency it has to be dealt with in some way.
|
||||
Since the time interval between the last two events of this sequence is commonly very
|
||||
short (around 30ms), the 2 at the end should be interpreted as parto of the screen-check
|
||||
sequence.
|
||||
short (around 30ms), the 2 at the end should be interpreted as part of the SCREEN-CHECK
|
||||
SEQUENCE.
|
||||
ii) (2|0)130102
|
||||
This sequence is interpreted as a nested screen-check sequence (010) inside
|
||||
a unlock sequence ((2|0)1302). Since the time interval between 0 and 1 is very
|
||||
an unlock sequence ((2|0)1302). Since the time interval between 0 and 1 is very
|
||||
short (the device hasn't even had the time to lock), we say that 010 does not constitute a
|
||||
proper check sequence and we therefore interpret the whole sequence as an unlock sequence.
|
||||
|
||||
TODO: the function time_screen_sequence returns some weird values. For example, the average check time of
|
||||
participant nr. 74 is several minutes and the real usage time percentage of participant nr. 78 is about 50%.
|
||||
proper check sequence and we therefore interpret the whole sequence as an UNLOCK SEQUENCE.
|
||||
"""
|
||||
|
||||
# If the time structure of sequences is not of interest, all events should be assigned to the same group
|
||||
if not grouping:
|
||||
df_screen["group"] = 0
|
||||
|
||||
df_screen.sort_values(["device_id", "timestamp"], inplace=True)
|
||||
|
||||
groups = df_screen.groupby("device_id")
|
||||
|
||||
# Create a df containing, for each device, a string representing the sequence of
|
||||
# screen events in chronological order, e.g. "01301302130202130..."
|
||||
|
||||
df_screen_sequences = (
|
||||
groups["screen_status"]
|
||||
.apply(list)
|
||||
.apply(lambda list_: "".join([str(x) for x in list_]))
|
||||
.to_frame()
|
||||
.reset_index()
|
||||
)
|
||||
|
||||
# Create a df containing, for each device, a list of timestamps of screen events
|
||||
# Create a df containing for each device a row with the following columns:
|
||||
# - participant_id: the id of the participant the device belongs to
|
||||
# - screen_status:
|
||||
# a string representing the sequence of screen events
|
||||
# in chronological order, e.g. "01301302130202130..."
|
||||
# - timestamp:
|
||||
# a list of timestamps of screen events
|
||||
# in chronological order, e.g. [1581933295955, 1581933741144, ...]
|
||||
# - group:
|
||||
# a list of groups to which the screen events
|
||||
# belong, again, in chronological order
|
||||
|
||||
df_timestamps = (
|
||||
groups["timestamp"]
|
||||
.apply(list)
|
||||
.to_frame()
|
||||
df_sequences_timestamps_groups = (
|
||||
df_screen.groupby(["device_id", "participant_id"])
|
||||
.agg({
|
||||
"screen_status": lambda list_: "".join([str(x) for x in list_]),
|
||||
"timestamp": list,
|
||||
"group": list})
|
||||
.reset_index()
|
||||
)
|
||||
|
||||
# Create a df containing information to which participant the devices belong
|
||||
|
||||
df_participants = (
|
||||
df_screen[["device_id", "participant_id"]]
|
||||
.drop_duplicates()
|
||||
.reset_index()
|
||||
.drop("index", 1)
|
||||
)
|
||||
|
||||
df_merged = (
|
||||
df_screen_sequences.merge(df_timestamps, on="device_id")
|
||||
.merge(df_participants, on="device_id")
|
||||
)
|
||||
|
||||
# Regex patterns implementing the heuristics described in the docstring.
|
||||
# Since the matching sequences can overlap, lookahead is used. Note that
|
||||
# the first event in a sequence isn't part of the group caught inside the
|
||||
|
@ -139,142 +164,192 @@ def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
|
|||
# event of the previous sequence, so that the time interval between the first
|
||||
# and the second event in a sequence is actually the time the device is not in use.
|
||||
|
||||
unlock_pat = re.compile(r"(?=[0,2]((13|31)(0+|010)2))")
|
||||
check_pat = re.compile(r"(?=[0,2](10+))")
|
||||
unlock_pat = re.compile(
|
||||
# Begin the lookahead group. Inside the lookahead group,
|
||||
# first match either a 0 or a 2
|
||||
r"(?=[0,2]"
|
||||
# Begin the 1st capturing group, this is the one we are interested in.
|
||||
# Match either a 13 or a 31.
|
||||
r"((13|31)"
|
||||
# Match either a (nonzero) sequence of consecutive 0s or a 010. Then, match
|
||||
# a two. End the 1st capturing group. End the lookahead group.
|
||||
r"(0+|010)2))"
|
||||
)
|
||||
check_pat = re.compile(
|
||||
# Begin the lookahead group. Inside the lookahead group,
|
||||
# first match either a 0 or a 2
|
||||
r"(?=[0,2]"
|
||||
# Begin the 1st capturing group. Capture a 1 succeeded by several 0s.
|
||||
# End the 1st capturing group. End the lookahead group.
|
||||
r"(10+))"
|
||||
)
|
||||
|
||||
# Enumerate the groups based on increasing order in order to make iteration easier.
|
||||
# //! N.B.: this is also a possible way to ease the constraint on the groups
|
||||
# //! discussed in the docstring under "Grouping". Namely, when working with
|
||||
# //! reasonably big dataframes, it can be confidently expected that for each
|
||||
# //! group we are interested in there will be at least one screen event assigned
|
||||
# //! to it. In this case, the following procedure will extract the complete list of groups.
|
||||
|
||||
def enumerate_groups(df: pd.DataFrame) -> Tuple[dict, dict]:
    """
    Number the distinct values of the "group" column in ascending order.

    Parameters
    ----------
    df: pd.DataFrame
        A dataframe with a column "group" whose values are hashable
        and mutually comparable.

    Returns
    -------
    group_dict: dict
        Maps each position (0, 1, 2, ...) to the group found at that
        position when the distinct groups are sorted.
    inv_group_dict: dict
        The inverse mapping, from a group to its position.

    Raises
    ------
    TypeError
        Propagated from set()/sorted() if the group values are unhashable
        or cannot be compared with each other.
    """
    # set() drops duplicates; sorted() accepts any iterable, so no
    # intermediate list is needed.
    groups_sorted = sorted(set(df["group"].tolist()))
    group_dict = dict(enumerate(groups_sorted))
    inv_group_dict = {grp: position for position, grp in group_dict.items()}
    return group_dict, inv_group_dict
|
||||
|
||||
# Try to sort and enumerate groups and raise an error if impossible
|
||||
|
||||
try:
    group_dict, inv_group_dict = enumerate_groups(df_screen)
except TypeError as e:
    # The caught exception is an instance, not a class, so it cannot be
    # called to build a new error. Raise a fresh TypeError carrying the
    # explanation and chain the original exception as its cause.
    raise TypeError("Values in the column 'group' must be of a comparable type") from e
|
||||
|
||||
# Iterate over rows of the merged df and then for each row iterate over
|
||||
# regex matches. For each match, create a dictionary containing information
|
||||
# on the matched sequence and append it to the list of rows. Lastly, create
|
||||
# a new dataframe from the list of rows and return it.
|
||||
|
||||
seq_id = 0
|
||||
|
||||
def identify_seq(regexp: re.Pattern, label: str) -> list:
|
||||
"""
|
||||
Iterates over rows of df_sequences_timestamps_groups, then, for each row
|
||||
iterates over regex matches. For each regex match, i.e. for each identified
|
||||
sequence, iterates over the groups over which the sequence spans (*cf docstring).
|
||||
"""
|
||||
nonlocal seq_id
|
||||
rows_list = list()
|
||||
for index, row in df_merged.iterrows():
|
||||
for match in unlock_pat.finditer(row["screen_status"]):
|
||||
beginning = row["timestamp"][match.start(1)]
|
||||
end = row["timestamp"][match.end(1) - 1]
|
||||
for index, row in df_sequences_timestamps_groups.iterrows():
|
||||
for match in regexp.finditer(row["screen_status"]):
|
||||
beginning_index_abs, end_index_abs = match.start(1), match.end(1)
|
||||
groups = set(row["group"][beginning_index_abs : end_index_abs])
|
||||
group_ids = {inv_group_dict[grp] for grp in groups}
|
||||
# TODO Here's part of the problem: the span of relevant groups consists solely
|
||||
# TODO of those to which at least one screen event is assigned (in the whole df)
|
||||
span = range(min(group_ids), max(group_ids) + 1)
|
||||
for grp_id in span:
|
||||
grp = group_dict[grp_id]
|
||||
grp_indices = [
|
||||
index for index
|
||||
in range(beginning_index_abs, end_index_abs)
|
||||
if row["group"][index] == grp
|
||||
]
|
||||
# TODO Here, we face the converse problem. It may happen that a sequence in fact
|
||||
# TODO does span over a certain group although none of the events that constitute
|
||||
# TODO it are assigned to it. In this case, there is no way to calculate the relative
|
||||
# TODO beginning and end (which should just be the beginning and end of the group).
|
||||
try:
|
||||
beginning_index_rel, end_index_rel = min(grp_indices), max(grp_indices)
|
||||
beginning_rel = row["timestamp"][beginning_index_rel]
|
||||
end_rel = row["timestamp"][end_index_rel]
|
||||
except ValueError:
|
||||
beginning_rel = end_rel = pd.NA
|
||||
beginning_abs = row["timestamp"][beginning_index_abs]
|
||||
end_abs = row["timestamp"][end_index_abs - 1]
|
||||
new_row_dict = {
|
||||
"participant_id": row["participant_id"],
|
||||
"device_id": row["device_id"],
|
||||
"sequence_type": "unlock",
|
||||
"beginning": beginning,
|
||||
"end": end
|
||||
"seq_id": seq_id,
|
||||
"sequence_type": label,
|
||||
"group": grp,
|
||||
"beginning_abs": beginning_abs,
|
||||
"end_abs": end_abs,
|
||||
"duration_abs": end_abs - beginning_abs,
|
||||
"beginning_rel": beginning_rel,
|
||||
"end_rel": end_rel,
|
||||
"duration_rel": end_rel - beginning_rel
|
||||
}
|
||||
rows_list.append(new_row_dict)
|
||||
for match in check_pat.finditer(row["screen_status"]):
|
||||
beginning = row["timestamp"][match.start(1)]
|
||||
end = row["timestamp"][match.end(1) - 1]
|
||||
new_row_dict = {
|
||||
"participant_id": row["participant_id"],
|
||||
"device_id": row["device_id"],
|
||||
"sequence_type": "check",
|
||||
"beginning": beginning,
|
||||
"end": end
|
||||
}
|
||||
rows_list.append(new_row_dict)
|
||||
df_sequences = pd.DataFrame(rows_list)
|
||||
df_sequences["duration"] = df_sequences["end"] - df_sequences["beginning"]
|
||||
seq_id += 1
|
||||
return rows_list
|
||||
|
||||
rows_unlock = identify_seq(unlock_pat, "unlock")
|
||||
rows_check = identify_seq(check_pat, "check")
|
||||
|
||||
df_sequences = pd.DataFrame(rows_unlock + rows_check)
|
||||
|
||||
return df_sequences
|
||||
|
||||
|
||||
def time_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
Calculates time statistics related to device usage.
|
||||
# def time_screen_sequence(df_screen: pd.DataFrame, groupby: str = "date") -> pd.DataFrame:
|
||||
# """
|
||||
# Calculates time statistics related to device usage.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df_screen: pd.DataFrame
|
||||
A dataframe containing screen data
|
||||
# Parameters
|
||||
# ----------
|
||||
# df_screen: pd.DataFrame
|
||||
# A dataframe containing screen data
|
||||
|
||||
Returns
|
||||
-------
|
||||
A new dataframe indexed by device_id and participant_id containing the followig collumns:
|
||||
* total_usage_time: sum of daily timespans between the last and
|
||||
the first event reported by the screen sensor measured in milliseconds
|
||||
* real_usage_time: duration of time during which the device was actually in use,
|
||||
i.e. the total duration of sequences identified by the function identify_screen_sequence
|
||||
* real_usage_time_percentage: real_usage_time / total_usage_time
|
||||
* average_time_between_unlocks
|
||||
* average_time_between_checks
|
||||
* average_check_duration
|
||||
* average_unlock_duration
|
||||
"""
|
||||
# Returns
|
||||
# -------
|
||||
# A new dataframe indexed by device_id and participant_id containing the following columns:
|
||||
# - total_usage_time: sum of daily timespans between the last and
|
||||
# the first event reported by the screen sensor measured in milliseconds
|
||||
# - real_usage_time: duration of time during which the device was actually in use,
|
||||
# i.e. the total duration of sequences identified by the function identify_screen_sequence
|
||||
# - real_usage_time_percentage: real_usage_time / total_usage_time
|
||||
# - average_time_between_unlocks
|
||||
# - average_time_between_checks
|
||||
# - average_check_duration
|
||||
# - average_unlock_duration
|
||||
# """
|
||||
|
||||
sequences_df = identify_screen_sequence(df_screen)
|
||||
# sequences_df = identify_screen_sequence(df_screen)
|
||||
|
||||
# Fit timestamps to dates
|
||||
sequences_df["date"] = pd.to_datetime(sequences_df.beginning, unit="ms").dt.date
|
||||
|
||||
# Calculate total_usage_time, real_usage_time and real_usage_time_percentage
|
||||
usage_time_df = (
|
||||
sequences_df.groupby(["device_id","participant_id", "date"])
|
||||
[["beginning", "end", "duration"]]
|
||||
.agg({"beginning":"min", "end":"max", "duration":"sum"})
|
||||
.assign(
|
||||
total_usage_time=lambda x: x.end - x.beginning
|
||||
)
|
||||
.drop(["beginning", "end"], axis=1)
|
||||
.rename(columns={"duration":"real_usage_time"})
|
||||
.groupby(["device_id", "participant_id"])
|
||||
.agg({"real_usage_time":"sum","total_usage_time":"sum"})
|
||||
.assign(
|
||||
real_usage_time_percentage=lambda x: x.real_usage_time / x.total_usage_time
|
||||
)
|
||||
)
|
||||
|
||||
# Calculate time_between_unlocks
|
||||
time_between_unlocks_df = (
|
||||
sequences_df[sequences_df["sequence_type"] == "unlock"]
|
||||
.sort_values(["participant_id", "device_id", "beginning"])
|
||||
)
|
||||
time_between_unlocks_df = (
|
||||
time_between_unlocks_df
|
||||
.assign(end_ = time_between_unlocks_df.groupby("device_id")["end"].shift(1))
|
||||
.assign(time_between_unlocks=lambda x: x.beginning - x.end_)
|
||||
.groupby(["device_id", "participant_id"])
|
||||
.agg({"time_between_unlocks":"mean"})
|
||||
.rename(columns={"time_between_unlocks":"average_time_between_unlocks"})
|
||||
)
|
||||
# # Calculate the date of the beginning and of the end of a sequence.
|
||||
# # Drop those sequences which span over several days.
|
||||
# sequences_df["date_beginning"] = pd.to_datetime(
|
||||
# sequences_df.beginning, unit="ms").dt.date
|
||||
|
||||
# Calculate time_between_checks
|
||||
time_between_checks_df = (
|
||||
sequences_df[sequences_df["sequence_type"] == "check"]
|
||||
.sort_values(["participant_id", "device_id", "beginning"])
|
||||
)
|
||||
time_between_checks_df = (
|
||||
time_between_checks_df
|
||||
.assign(end_ = time_between_checks_df.groupby("device_id")["end"].shift(1))
|
||||
.assign(time_between_checks=lambda x: x.beginning - x.end_)
|
||||
.groupby(["device_id", "participant_id"])
|
||||
.agg({"time_between_checks":"mean"})
|
||||
.rename(columns={"time_between_checks":"average_time_between_checks"})
|
||||
)
|
||||
# sequences_df["date_end"] = pd.to_datetime(
|
||||
# sequences_df.end, unit="ms").dt.date
|
||||
|
||||
# Calculate average_check_time and average_unlock_time
|
||||
average_duration_df = (
|
||||
sequences_df
|
||||
.groupby(["device_id", "participant_id", "sequence_type"])
|
||||
.agg(
|
||||
{"duration": (lambda x: int(np.mean(x)))}
|
||||
)
|
||||
.unstack()
|
||||
)
|
||||
# sequences_df = (
|
||||
# sequences_df
|
||||
# [sequences_df["date_beginning"] == sequences_df["date_end"]]
|
||||
# .drop(columns=["date_end"])
|
||||
# .rename(columns={"date_beginning":"date"})
|
||||
# )
|
||||
|
||||
# Merge the four newely created dataframes
|
||||
merged = usage_time_df.merge(
|
||||
time_between_unlocks_df,
|
||||
on=["device_id", "participant_id"]
|
||||
).merge(
|
||||
time_between_checks_df,
|
||||
on=["device_id", "participant_id"]
|
||||
).merge(
|
||||
average_duration_df,
|
||||
on=["device_id", "participant_id"]
|
||||
).rename(
|
||||
columns={
|
||||
("duration","unlock"):"average_unlock_duration",
|
||||
("duration","check"):"average_check_duration"
|
||||
}
|
||||
)
|
||||
# # Calculate the time the device was in use
|
||||
# usage_time_df = (
|
||||
# sequences_df.groupby(["sequence_type", "participant_id", "device_id", "date"])
|
||||
# .agg({"duration":"sum"})
|
||||
# .apply(lambda x: x//1000, "columns")
|
||||
# .rename(columns={"duration":"usage_time"})
|
||||
# )
|
||||
|
||||
return merged
|
||||
# # Calculate the average time between sequences
|
||||
# average_timedelta_df = (
|
||||
# sequences_df.sort_values("beginning")
|
||||
# .groupby(["sequence_type", "participant_id", "device_id", "date"])
|
||||
# .apply(
|
||||
# lambda grp:
|
||||
# grp.assign(end_shifted = grp["end"].shift(1))
|
||||
# )
|
||||
# .drop(columns=["participant_id", "device_id", "sequence_type", "date"])
|
||||
# .droplevel(-1)
|
||||
# .assign(average_timedelta = lambda x: x.beginning - x.end_shifted)
|
||||
# .groupby(["sequence_type", "participant_id", "device_id", "date"])
|
||||
# .agg({"average_timedelta": lambda x: np.mean(x)//1000})
|
||||
# )
|
||||
|
||||
# # Calculate the average duration of sequences
|
||||
# average_duration_df = (
|
||||
# sequences_df
|
||||
# .groupby(["sequence_type", "participant_id", "device_id", "date"])
|
||||
# .agg({"duration": (lambda x: np.mean(x)//1000)})
|
||||
# .rename(columns={"duration":"average_duration"})
|
||||
# )
|
||||
|
||||
# # Merge into a single dataframe
|
||||
# merged = pd.merge(
|
||||
# pd.merge(usage_time_df, average_timedelta_df, left_index=True, right_index=True),
|
||||
# average_duration_df,
|
||||
# left_index=True,
|
||||
# right_index=True
|
||||
# )
|
||||
|
||||
# return merged
|
||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue