implemented time_screen_sequence

screen_sequences
Ivan Kobe 2021-09-13 11:47:19 +02:00
parent cf7e692927
commit 53df652d02
3 changed files with 1213 additions and 692 deletions

View File

@ -1,8 +1,11 @@
from collections.abc import Collection
import pandas as pd
import numpy as np
import re
from datetime import *
from config.models import Participant, Screen
from setup import db_engine, session
@ -82,7 +85,12 @@ def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
sequence.
ii) (2|0)130102
This sequence is interpreted as a nested screen-check sequence (010) inside
a unlock sequence ((2|0)1302).
an unlock sequence ((2|0)1302). Since the time interval between 0 and 1 is very
short (the device hasn't even had the time to lock), we say that 010 does not constitute a
proper check sequence and we therefore interpret the whole sequence as an unlock sequence.
TODO: the function time_screen_sequence returns some weird values. For example, the average check time of
participant nr. 74 is several minutes and the real usage time percentage of participant nr. 78 is about 50%.
"""
df_screen.sort_values(["device_id", "timestamp"], inplace=True)
@ -131,8 +139,8 @@ def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
# event of the previous sequence, so that the time interval between the first
# and the second event in a sequence is actually the time the device is not in use.
unlock_pat = re.compile(r"(?=[0,2]((13|31)0+2))")
check_pat = re.compile(r"(?=[0,2](10+2?))")
unlock_pat = re.compile(r"(?=[0,2]((13|31)(0+|010)2))")
check_pat = re.compile(r"(?=[0,2](10+))")
# Iterate over rows of the merged df and then for each row iterate over
# regex matches. For each match, create a dictionary containing information
@ -170,8 +178,103 @@ def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
def _average_time_between(sequences_df: pd.DataFrame, sequence_type: str) -> pd.DataFrame:
    """Return the mean gap between consecutive sequences of one type.

    The gap is measured from the end of one sequence to the beginning of the
    next sequence of the same type, separately for every
    (device_id, participant_id) pair. The first sequence of each pair has no
    predecessor, so its gap is NaN and is skipped by the mean.

    The result is indexed by device_id and participant_id and has a single
    column named ``average_time_between_<sequence_type>s``.
    """
    keys = ["device_id", "participant_id"]
    column = f"average_time_between_{sequence_type}s"
    selected = (
        sequences_df[sequences_df["sequence_type"] == sequence_type]
        .sort_values(keys + ["beginning"])
    )
    # Shift within each device/participant pair so that a gap never spans
    # two different devices or participants.
    previous_end = selected.groupby(keys)["end"].shift(1)
    return (
        selected
        .assign(**{column: selected["beginning"] - previous_end})
        .groupby(keys)[[column]]
        .mean()
    )


def time_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates time statistics related to device usage.

    Parameters
    ----------
    df_screen: pd.DataFrame
        A dataframe containing screen data

    Returns
    -------
    A new dataframe indexed by device_id and participant_id containing the following columns:
        * total_usage_time: sum of daily timespans between the last and
          the first event reported by the screen sensor measured in milliseconds
        * real_usage_time: duration of time during which the device was actually in use,
          i.e. the total duration of sequences identified by the function identify_screen_sequence
        * real_usage_time_percentage: real_usage_time / total_usage_time
        * average_time_between_unlocks
        * average_time_between_checks
        * average_check_duration
        * average_unlock_duration

    Only device/participant pairs that have at least one "unlock" and at least
    one "check" sequence appear in the result, because the partial results are
    combined with inner joins.
    """
    keys = ["device_id", "participant_id"]
    sequences_df = identify_screen_sequence(df_screen)

    # Fit timestamps to dates. Work on a new frame instead of adding a column
    # to the frame returned by identify_screen_sequence.
    # NOTE(review): the timestamps are interpreted as epoch milliseconds without
    # a timezone, so the day boundaries are presumably UTC - confirm this is intended.
    sequences_df = sequences_df.assign(
        date=pd.to_datetime(sequences_df["beginning"], unit="ms").dt.date
    )

    # Calculate total_usage_time, real_usage_time and real_usage_time_percentage.
    # First per day: the span between the first beginning and the last end,
    # and the summed duration of all sequences of that day.
    daily = (
        sequences_df
        .groupby(keys + ["date"])
        .agg(
            first_beginning=("beginning", "min"),
            last_end=("end", "max"),
            real_usage_time=("duration", "sum"),
        )
    )
    daily["total_usage_time"] = daily["last_end"] - daily["first_beginning"]
    # Then sum the daily values per device/participant pair.
    usage_time_df = (
        daily
        .groupby(level=keys)[["real_usage_time", "total_usage_time"]]
        .sum()
    )
    usage_time_df["real_usage_time_percentage"] = (
        usage_time_df["real_usage_time"] / usage_time_df["total_usage_time"]
    )

    # Calculate average_time_between_unlocks and average_time_between_checks
    time_between_unlocks_df = _average_time_between(sequences_df, "unlock")
    time_between_checks_df = _average_time_between(sequences_df, "check")

    # Calculate average_check_duration and average_unlock_duration.
    # Selecting the "duration" column before unstacking keeps the resulting
    # columns single-level ("check", "unlock"); two-level columns cannot be
    # merged with the single-level frames above.
    average_duration_df = (
        sequences_df
        .groupby(keys + ["sequence_type"])["duration"]
        .agg(lambda durations: int(np.mean(durations)))
        .unstack()
        # Guarantee both columns exist even if one sequence type never occurs.
        .reindex(columns=["check", "unlock"])
        .rename(
            columns={
                "check": "average_check_duration",
                "unlock": "average_unlock_duration",
            }
        )
    )

    # Merge the four newly created dataframes on their shared index levels
    merged = usage_time_df
    for partial_df in (time_between_unlocks_df, time_between_checks_df, average_duration_df):
        merged = merged.merge(partial_df, on=keys)
    return merged

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long