partially resolved the grouping issue
parent
53df652d02
commit
47ecd4bc02
|
@ -356,7 +356,16 @@ class Proximity(Base, AWAREsensor):
|
||||||
|
|
||||||
|
|
||||||
class Screen(Base, AWAREsensor):
|
class Screen(Base, AWAREsensor):
|
||||||
|
"""
|
||||||
|
Contains the screen sensor information.
|
||||||
|
|
||||||
|
Attributes
|
||||||
|
----------
|
||||||
|
screen_status: int
|
||||||
|
Screen status (0 – off, 1 – on, 2 – locked, 3 – unlocked)
|
||||||
|
"""
|
||||||
screen_status = Column(SmallInteger)
|
screen_status = Column(SmallInteger)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class SMS(Base, AWAREsensor):
|
class SMS(Base, AWAREsensor):
|
||||||
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
{
|
||||||
|
"python.linting.enabled": true,
|
||||||
|
"python.formatting.provider": "autopep8"
|
||||||
|
}
|
|
@ -4,7 +4,7 @@ import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from datetime import *
|
from typing import Tuple, ValuesView
|
||||||
|
|
||||||
from config.models import Participant, Screen
|
from config.models import Participant, Screen
|
||||||
from setup import db_engine, session
|
from setup import db_engine, session
|
||||||
|
@ -36,27 +36,67 @@ def get_screen_data(usernames: Collection) -> pd.DataFrame:
|
||||||
return df_screen
|
return df_screen
|
||||||
|
|
||||||
|
|
||||||
def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
|
def identify_screen_sequence(df_screen: pd.DataFrame, grouping: bool = False) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
Identify interesting sequences (unlock, status check) and return them in a dataframe.
|
Identifies interesting sequences (unlock, status check) and returns them in a dataframe.
|
||||||
|
Transform the grouping of screen events (by day, hour...) into a grouping of sequences.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
df_screen: pd.DataFrame
|
df_screen: pd.DataFrame
|
||||||
A dataframe containing screen data
|
A dataframe containing screen data and a column "group".
|
||||||
|
N.B.: the values in the column "group" must be of a comparable type (e.g. int, datetime.date etc.)
|
||||||
|
grouping:
|
||||||
|
A boolean value indicating whether the input df contains the column "group".
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
df_sequences: pd.DataFrame
|
df_sequences: pd.DataFrame:
|
||||||
A dataframe containing information on screen sequences
|
A dataframe containing information on screen sequences
|
||||||
|
|
||||||
Columns:
|
Columns:
|
||||||
* participant_id
|
- participant_id
|
||||||
* device_id
|
- device_id
|
||||||
* sequence_type: unlock/check
|
- seq_id: a unique id assigned to each sequence
|
||||||
* beginning: beginning of unlock/check in miliseconds since 1970
|
- sequence_type: unlock/check
|
||||||
* end: end of unlock/check in miliseconds since 1970
|
- group: the group to which the sequence belongs, i.e. the timespan during which it has
|
||||||
* duration
|
occurred. Note that in the case that it spans over a longer period of time,
|
||||||
|
the same sequence is assigned to multiple groups
|
||||||
|
- beginning_abs: beginning of unlock/check [ms since 1970]
|
||||||
|
- end_abs: end of unlock/check in [ms since 1970]
|
||||||
|
- duration_abs [ms]
|
||||||
|
- beginning_rel: beginning of a sequence relative to the group [ms since 1970]
|
||||||
|
- end_rel [ms since 1970]
|
||||||
|
- duration_rel [ms]
|
||||||
|
|
||||||
|
Legend
|
||||||
|
------
|
||||||
|
- 0: off
|
||||||
|
- 1: on
|
||||||
|
- 2: locked
|
||||||
|
- 3: unlocked
|
||||||
|
|
||||||
|
Grouping
|
||||||
|
--------
|
||||||
|
If the screen events of the input df are assigned a time structure, the identified sequences should also
|
||||||
|
be. If all of the screen events constituting a sequence are in the same group, assigning the sequence to
|
||||||
|
a group is trivial - it should be the group the events belong to. If, on the other hand, the events span several groups, the situation is
|
||||||
|
trickier. As of the moment, the procedure is implemented as follows:
|
||||||
|
The relative beginning (relative to a certain group, i.e. timespan) is defined as the timestamp of the
|
||||||
|
first event belonging to the group in question. Relative end and relative duration are defined in a
|
||||||
|
similar fashion. This is, however, not optimal. We would namely wish, e.g., that the relative durations
|
||||||
|
of a given sequence would sum up to its absolute duration which is not yet the case.
|
||||||
|
TODO In order to achieve this, we would need to be given more information on the groups. The current
|
||||||
|
TODO constraint on the groups is only that they be comparable. This is insufficient since it is
|
||||||
|
TODO impossible to infer:
|
||||||
|
TODO - how many and which groups lie between two given groups
|
||||||
|
TODO - when in time does a certain group begin and when does it end
|
||||||
|
TODO In order to mitigate these issues, we would need to be given a complete list of groups and the
|
||||||
|
TODO groups should have the form of an interval (beginning, end).
|
||||||
|
|
||||||
|
In fact, under the presupposition that we will always be working with relatively big dataframes,
|
||||||
|
the requirement of being given a complete list of groups becomes unnecessary.
|
||||||
|
cf. the highlighted comment below.
|
||||||
|
|
||||||
Heuristics
|
Heuristics
|
||||||
----------
|
----------
|
||||||
|
@ -81,57 +121,42 @@ def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
|
||||||
The occurance of two consecutive "locked" events with no intermediate "unlocked" event
|
The occurance of two consecutive "locked" events with no intermediate "unlocked" event
|
||||||
is an inconsistency, however due to its frequency it has to be dealt with in some way.
|
is an inconsistency, however due to its frequency it has to be dealt with in some way.
|
||||||
Since the time interval between the last two events of this sequence is commonly very
|
Since the time interval between the last two events of this sequence is commonly very
|
||||||
short (around 30ms), the 2 at the end should be interpreted as parto of the screen-check
|
short (around 30ms), the 2 at the end should be interpreted as part of the SCREEN-CHECK
|
||||||
sequence.
|
SEQUENCE.
|
||||||
ii) (2|0)130102
|
ii) (2|0)130102
|
||||||
This sequence is interpreted as a nested screen-check sequence (010) inside
|
This sequence is interpreted as a nested screen-check sequence (010) inside
|
||||||
a unlock sequence ((2|0)1302). Since the time interval between 0 and 1 is very
|
an unlock sequence ((2|0)1302). Since the time interval between 0 and 1 is very
|
||||||
short (the device hasn't even had the time to lock), we say that 010 does not costitute a
|
short (the device hasn't even had the time to lock), we say that 010 does not costitute a
|
||||||
proper check sequence and we therefore interpret the whole sequence as an unlock sequence.
|
proper check sequence and we therefore interpret the whole sequence as an UNLOCK SEQUENCE.
|
||||||
|
|
||||||
TODO: the function time_screen_sequence returns some weird values. For example, the average check time of
|
|
||||||
participant nr. 74 is several minutes and the real usage time percentage of participant nr. 78 is about 50%.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# If the time structure of sequences is not of interest, all events should be assigned to the same group
|
||||||
|
if not grouping:
|
||||||
|
df_screen["group"] = 0
|
||||||
|
|
||||||
df_screen.sort_values(["device_id", "timestamp"], inplace=True)
|
df_screen.sort_values(["device_id", "timestamp"], inplace=True)
|
||||||
|
|
||||||
groups = df_screen.groupby("device_id")
|
# Create a df containing for each device a row with the following columns:
|
||||||
|
# - participant_id: the id of the participant the device belongs to
|
||||||
|
# - screen_status:
|
||||||
|
# a string representing the sequence of screen events
|
||||||
|
# in chronological order, e.g. "01301302130202130..."
|
||||||
|
# - timestamp:
|
||||||
|
# a list of timestamps of screen events
|
||||||
|
# in chronological order, e.g. [1581933295955, 1581933741144, ...]
|
||||||
|
# - group:
|
||||||
|
# a list of groups to which the screen events
|
||||||
|
# belong, again, in chronological order
|
||||||
|
|
||||||
# Create a df containing, for each device, a string representing the sequence of
|
df_sequences_timestamps_groups = (
|
||||||
# screen events in chronological order, e.g. "01301302130202130..."
|
df_screen.groupby(["device_id", "participant_id"])
|
||||||
|
.agg({
|
||||||
df_screen_sequences = (
|
"screen_status": lambda list_: "".join([str(x) for x in list_]),
|
||||||
groups["screen_status"]
|
"timestamp": list,
|
||||||
.apply(list)
|
"group": list})
|
||||||
.apply(lambda list_: "".join([str(x) for x in list_]))
|
|
||||||
.to_frame()
|
|
||||||
.reset_index()
|
.reset_index()
|
||||||
)
|
)
|
||||||
|
|
||||||
# Create a df containing, for each device, a list of timestamps of screen events
|
|
||||||
# in chronological order, e.g. [1581933295955, 1581933741144, ...]
|
|
||||||
|
|
||||||
df_timestamps = (
|
|
||||||
groups["timestamp"]
|
|
||||||
.apply(list)
|
|
||||||
.to_frame()
|
|
||||||
.reset_index()
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create a df containing information to which participant the devices belong
|
|
||||||
|
|
||||||
df_participants = (
|
|
||||||
df_screen[["device_id", "participant_id"]]
|
|
||||||
.drop_duplicates()
|
|
||||||
.reset_index()
|
|
||||||
.drop("index", 1)
|
|
||||||
)
|
|
||||||
|
|
||||||
df_merged = (
|
|
||||||
df_screen_sequences.merge(df_timestamps, on="device_id")
|
|
||||||
.merge(df_participants, on="device_id")
|
|
||||||
)
|
|
||||||
|
|
||||||
# Regex patterns implementing the heuristics described in the docstring.
|
# Regex patterns implementing the heuristics described in the docstring.
|
||||||
# Since the matching sequences can overlap, lookahead is used. Note that
|
# Since the matching sequences can overlap, lookahead is used. Note that
|
||||||
# the first event in a sequence isn't part of the group caught inside the
|
# the first event in a sequence isn't part of the group caught inside the
|
||||||
|
@ -139,142 +164,192 @@ def identify_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
|
||||||
# event of the previous sequence, so that the time interval between the first
|
# event of the previous sequence, so that the time interval between the first
|
||||||
# and the second event in a sequence is actually the time the device is not in use.
|
# and the second event in a sequence is actually the time the device is not in use.
|
||||||
|
|
||||||
unlock_pat = re.compile(r"(?=[0,2]((13|31)(0+|010)2))")
|
unlock_pat = re.compile(
|
||||||
check_pat = re.compile(r"(?=[0,2](10+))")
|
# Begin the lookahead group. Inside the lookahead group,
|
||||||
|
# first match either a 0 or a 2
|
||||||
|
r"(?=[0,2]"
|
||||||
|
# Begin the 1st capturing group, this is the one we are interested in.
|
||||||
|
# Match either a 13 or a 31.
|
||||||
|
r"((13|31)"
|
||||||
|
# Match either a (nonzero) sequence of consecutive 0s or a 010. Then, match
|
||||||
|
# a two. End the 1st capturing group. End the lookahead group.
|
||||||
|
r"(0+|010)2))"
|
||||||
|
)
|
||||||
|
check_pat = re.compile(
|
||||||
|
# Begin the lookahead group. Inside the lookahead group,
|
||||||
|
# first match either a 0 or a 2
|
||||||
|
r"(?=[0,2]"
|
||||||
|
# Begin the 1st capturing group. Capture a 1 succeeded by several 0s.
|
||||||
|
# End the 1st capturing group. End the lookahead group.
|
||||||
|
r"(10+))"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Enumerate the groups based on increasing order in order to make iteration easier.
|
||||||
|
# //! N.B.: this is also a possible way to ease the constraint on the groups
|
||||||
|
# //! discussed in the docstring under "Grouping". Namely, when working with
|
||||||
|
# //! reasonably big dataframes, it can be confidently expected that for each
|
||||||
|
# //! group we are interested in there will be at least one screen event assigned
|
||||||
|
# //! to it. In this case, the following procedure will extract the complete list of groups.
|
||||||
|
|
||||||
|
def enumerate_groups(df: pd.DataFrame) -> Tuple[dict, dict]:
    """
    Enumerate the distinct values of df["group"] in increasing order.

    Parameters
    ----------
    df: pd.DataFrame
        A dataframe with a column "group".

    Returns
    -------
    group_dict: dict
        Maps the position (0, 1, ...) in the sorted order to the group value.
    inv_group_dict: dict
        The inverse mapping, from group value to its position.

    Raises
    ------
    TypeError
        Propagated from sorted() when the group values cannot be compared
        with one another.
    """
    # set() removes duplicates; sorted() accepts any iterable, so no
    # intermediate list is needed.
    groups_list = sorted(set(df["group"].tolist()))
    group_dict = dict(enumerate(groups_list))
    inv_group_dict = {grp: grp_id for grp_id, grp in group_dict.items()}
    return group_dict, inv_group_dict
|
||||||
|
|
||||||
|
# Try to sort and enumerate groups and raise an error if impossible
|
||||||
|
|
||||||
|
try:
    group_dict, inv_group_dict = enumerate_groups(df_screen)
except TypeError as e:
    # `e` is an exception *instance*, so calling it (`e(...)`) would itself
    # fail with "'TypeError' object is not callable" and hide the intended
    # message. Raise a new TypeError and chain the original as its cause.
    raise TypeError(
        "Values in the column 'group' must be of a comparable type"
    ) from e
|
||||||
|
|
||||||
# Iterate over rows of the merged df and then for each row iterate over
|
# Iterate over rows of the merged df and then for each row iterate over
|
||||||
# regex mathes. For each match, create a dictionary containing information
|
# regex mathes. For each match, create a dictionary containing information
|
||||||
# on the matched sequence and append it to the list of rows. Lastly, create
|
# on the matched sequence and append it to the list of rows. Lastly, create
|
||||||
# a new dataframe from the list of rows and return it.
|
# a new dataframe from the list of rows and return it.
|
||||||
|
|
||||||
|
seq_id = 0
|
||||||
|
|
||||||
|
def identify_seq(regexp: re.Pattern, label: str) -> list:
|
||||||
|
"""
|
||||||
|
Iterates over rows of df_sequences_timestamps_groups, then, for each group
|
||||||
|
iterates over regex matches. For each regex match, i.e. for each identyfied
|
||||||
|
sequence, iterates over the groups over which the sequence spans (*cf docstring).
|
||||||
|
"""
|
||||||
|
nonlocal seq_id
|
||||||
|
rows_list = list()
|
||||||
|
for index, row in df_sequences_timestamps_groups.iterrows():
|
||||||
|
for match in regexp.finditer(row["screen_status"]):
|
||||||
|
beginning_index_abs, end_index_abs = match.start(1), match.end(1)
|
||||||
|
groups = set(row["group"][beginning_index_abs : end_index_abs])
|
||||||
|
group_ids = {inv_group_dict[grp] for grp in groups}
|
||||||
|
# TODO Here's part of the problem: the span of relevant groups consists solely
|
||||||
|
# TODO of those to which at least one screen event is assigned (in the whole df)
|
||||||
|
span = range(min(group_ids), max(group_ids) + 1)
|
||||||
|
for grp_id in span:
|
||||||
|
grp = group_dict[grp_id]
|
||||||
|
grp_indices = [
|
||||||
|
index for index
|
||||||
|
in range(beginning_index_abs, end_index_abs)
|
||||||
|
if row["group"][index] == grp
|
||||||
|
]
|
||||||
|
# TODO Here, we face the converse problem. It may happen that a sequence in fact
|
||||||
|
# TODO does span over a certain group although none of the events that constitute
|
||||||
|
# TODO it are assigned to it. In this case, there is no way to calculate the relative
|
||||||
|
# TODO beginning and end (which should just be the beginning and end of the group).
|
||||||
|
try:
|
||||||
|
beginning_index_rel, end_index_rel = min(grp_indices), max(grp_indices)
|
||||||
|
beginning_rel = row["timestamp"][beginning_index_rel]
|
||||||
|
end_rel = row["timestamp"][end_index_rel]
|
||||||
|
except ValueError:
|
||||||
|
beginning_rel = end_rel = pd.NA
|
||||||
|
beginning_abs = row["timestamp"][beginning_index_abs]
|
||||||
|
end_abs = row["timestamp"][end_index_abs - 1]
|
||||||
|
new_row_dict = {
|
||||||
|
"participant_id": row["participant_id"],
|
||||||
|
"device_id": row["device_id"],
|
||||||
|
"seq_id": seq_id,
|
||||||
|
"sequence_type": label,
|
||||||
|
"group": grp,
|
||||||
|
"beginning_abs": beginning_abs,
|
||||||
|
"end_abs": end_abs,
|
||||||
|
"duration_abs": end_abs - beginning_abs,
|
||||||
|
"beginning_rel": beginning_rel,
|
||||||
|
"end_rel": end_rel,
|
||||||
|
"duration_rel": end_rel - beginning_rel
|
||||||
|
}
|
||||||
|
rows_list.append(new_row_dict)
|
||||||
|
seq_id += 1
|
||||||
|
return rows_list
|
||||||
|
|
||||||
rows_list = list()
|
rows_unlock = identify_seq(unlock_pat, "unlock")
|
||||||
for index, row in df_merged.iterrows():
|
rows_check = identify_seq(check_pat, "check")
|
||||||
for match in unlock_pat.finditer(row["screen_status"]):
|
|
||||||
beginning = row["timestamp"][match.start(1)]
|
df_sequences = pd.DataFrame(rows_unlock + rows_check)
|
||||||
end = row["timestamp"][match.end(1) - 1]
|
|
||||||
new_row_dict = {
|
|
||||||
"participant_id": row["participant_id"],
|
|
||||||
"device_id": row["device_id"],
|
|
||||||
"sequence_type": "unlock",
|
|
||||||
"beginning": beginning,
|
|
||||||
"end": end
|
|
||||||
}
|
|
||||||
rows_list.append(new_row_dict)
|
|
||||||
for match in check_pat.finditer(row["screen_status"]):
|
|
||||||
beginning = row["timestamp"][match.start(1)]
|
|
||||||
end = row["timestamp"][match.end(1) - 1]
|
|
||||||
new_row_dict = {
|
|
||||||
"participant_id": row["participant_id"],
|
|
||||||
"device_id": row["device_id"],
|
|
||||||
"sequence_type": "check",
|
|
||||||
"beginning": beginning,
|
|
||||||
"end": end
|
|
||||||
}
|
|
||||||
rows_list.append(new_row_dict)
|
|
||||||
df_sequences = pd.DataFrame(rows_list)
|
|
||||||
df_sequences["duration"] = df_sequences["end"] - df_sequences["beginning"]
|
|
||||||
|
|
||||||
return df_sequences
|
return df_sequences
|
||||||
|
|
||||||
|
|
||||||
def time_screen_sequence(df_screen: pd.DataFrame) -> pd.DataFrame:
|
# def time_screen_sequence(df_screen: pd.DataFrame, groupby: str = "date") -> pd.DataFrame:
|
||||||
"""
|
# """
|
||||||
Calculates time statistics related to device usage.
|
# Calculates time statistics related to device usage.
|
||||||
|
|
||||||
Parameters
|
# Parameters
|
||||||
----------
|
# ----------
|
||||||
df_screen: pd.DataFrame
|
# df_screen: pd.DataFrame
|
||||||
A dataframe containing screen data
|
# A dataframe containing screen data
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
A new dataframe indexed by device_id and participant_id containing the followig collumns:
|
|
||||||
* total_usage_time: sum of daily timespans between the last and
|
|
||||||
the first event reported by the screen sensor measured in milliseconds
|
|
||||||
* real_usage_time: duration of time during which the device was actually in use,
|
|
||||||
i.e. the total duration of sequences identified by the function identify_screen_sequence
|
|
||||||
* real_usage_time_percentage: real_usage_time / total_usage_time
|
|
||||||
* average_time_between_unlocks
|
|
||||||
* average_time_between_checks
|
|
||||||
* average_check_duration
|
|
||||||
* average_unlock_duration
|
|
||||||
"""
|
|
||||||
|
|
||||||
sequences_df = identify_screen_sequence(df_screen)
|
# Returns
|
||||||
|
# -------
|
||||||
|
# A new dataframe indexed by device_id and participant_id containing the followig collumns:
|
||||||
|
# - total_usage_time: sum of daily timespans between the last and
|
||||||
|
# the first event reported by the screen sensor measured in milliseconds
|
||||||
|
# - real_usage_time: duration of time during which the device was actually in use,
|
||||||
|
# i.e. the total duration of sequences identified by the function identify_screen_sequence
|
||||||
|
# - real_usage_time_percentage: real_usage_time / total_usage_time
|
||||||
|
# - average_time_between_unlocks
|
||||||
|
# - average_time_between_checks
|
||||||
|
# - average_check_duration
|
||||||
|
# - average_unlock_duration
|
||||||
|
# """
|
||||||
|
|
||||||
# Fit timestamps to dates
|
# sequences_df = identify_screen_sequence(df_screen)
|
||||||
sequences_df["date"] = pd.to_datetime(sequences_df.beginning, unit="ms").dt.date
|
|
||||||
|
|
||||||
# Calculate total_usage_time, real_usage_time and real_usage_time_percentage
|
|
||||||
usage_time_df = (
|
|
||||||
sequences_df.groupby(["device_id","participant_id", "date"])
|
|
||||||
[["beginning", "end", "duration"]]
|
|
||||||
.agg({"beginning":"min", "end":"max", "duration":"sum"})
|
|
||||||
.assign(
|
|
||||||
total_usage_time=lambda x: x.end - x.beginning
|
|
||||||
)
|
|
||||||
.drop(["beginning", "end"], axis=1)
|
|
||||||
.rename(columns={"duration":"real_usage_time"})
|
|
||||||
.groupby(["device_id", "participant_id"])
|
|
||||||
.agg({"real_usage_time":"sum","total_usage_time":"sum"})
|
|
||||||
.assign(
|
|
||||||
real_usage_time_percentage=lambda x: x.real_usage_time / x.total_usage_time
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Calculate time_between_unlocks
|
|
||||||
time_between_unlocks_df = (
|
|
||||||
sequences_df[sequences_df["sequence_type"] == "unlock"]
|
|
||||||
.sort_values(["participant_id", "device_id", "beginning"])
|
|
||||||
)
|
|
||||||
time_between_unlocks_df = (
|
|
||||||
time_between_unlocks_df
|
|
||||||
.assign(end_ = time_between_unlocks_df.groupby("device_id")["end"].shift(1))
|
|
||||||
.assign(time_between_unlocks=lambda x: x.beginning - x.end_)
|
|
||||||
.groupby(["device_id", "participant_id"])
|
|
||||||
.agg({"time_between_unlocks":"mean"})
|
|
||||||
.rename(columns={"time_between_unlocks":"average_time_between_unlocks"})
|
|
||||||
)
|
|
||||||
|
|
||||||
# Calculate time_between_checks
|
# # Calculate the date of the beginning and of the end of a sequence.
|
||||||
time_between_checks_df = (
|
# # Drop those sequences which span over several days.
|
||||||
sequences_df[sequences_df["sequence_type"] == "check"]
|
# sequences_df["date_beginning"] = pd.to_datetime(
|
||||||
.sort_values(["participant_id", "device_id", "beginning"])
|
# sequences_df.beginning, unit="ms").dt.date
|
||||||
)
|
|
||||||
time_between_checks_df = (
|
|
||||||
time_between_checks_df
|
|
||||||
.assign(end_ = time_between_checks_df.groupby("device_id")["end"].shift(1))
|
|
||||||
.assign(time_between_checks=lambda x: x.beginning - x.end_)
|
|
||||||
.groupby(["device_id", "participant_id"])
|
|
||||||
.agg({"time_between_checks":"mean"})
|
|
||||||
.rename(columns={"time_between_checks":"average_time_between_checks"})
|
|
||||||
)
|
|
||||||
|
|
||||||
# Calculate average_check_time and average_unlock_time
|
# sequences_df["date_end"] = pd.to_datetime(
|
||||||
average_duration_df = (
|
# sequences_df.end, unit="ms").dt.date
|
||||||
sequences_df
|
|
||||||
.groupby(["device_id", "participant_id", "sequence_type"])
|
|
||||||
.agg(
|
|
||||||
{"duration": (lambda x: int(np.mean(x)))}
|
|
||||||
)
|
|
||||||
.unstack()
|
|
||||||
)
|
|
||||||
|
|
||||||
# Merge the four newely created dataframes
|
|
||||||
merged = usage_time_df.merge(
|
|
||||||
time_between_unlocks_df,
|
|
||||||
on=["device_id", "participant_id"]
|
|
||||||
).merge(
|
|
||||||
time_between_checks_df,
|
|
||||||
on=["device_id", "participant_id"]
|
|
||||||
).merge(
|
|
||||||
average_duration_df,
|
|
||||||
on=["device_id", "participant_id"]
|
|
||||||
).rename(
|
|
||||||
columns={
|
|
||||||
("duration","unlock"):"average_unlock_duration",
|
|
||||||
("duration","check"):"average_check_duration"
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
return merged
|
# sequences_df = (
|
||||||
|
# sequences_df
|
||||||
|
# [sequences_df["date_beginning"] == sequences_df["date_end"]]
|
||||||
|
# .drop(columns=["date_end"])
|
||||||
|
# .rename(columns={"date_beginning":"date"})
|
||||||
|
# )
|
||||||
|
|
||||||
|
# # Calculate the time the device was in use
|
||||||
|
# usage_time_df = (
|
||||||
|
# sequences_df.groupby(["sequence_type", "participant_id", "device_id", "date"])
|
||||||
|
# .agg({"duration":"sum"})
|
||||||
|
# .apply(lambda x: x//1000, "columns")
|
||||||
|
# .rename(columns={"duration":"usage_time"})
|
||||||
|
# )
|
||||||
|
|
||||||
|
# # Calculate the average time between sequences
|
||||||
|
# average_timedelta_df = (
|
||||||
|
# sequences_df.sort_values("beginning")
|
||||||
|
# .groupby(["sequence_type", "participant_id", "device_id", "date"])
|
||||||
|
# .apply(
|
||||||
|
# lambda grp:
|
||||||
|
# grp.assign(end_shifted = grp["end"].shift(1))
|
||||||
|
# )
|
||||||
|
# .drop(columns=["participant_id", "device_id", "sequence_type", "date"])
|
||||||
|
# .droplevel(-1)
|
||||||
|
# .assign(average_timedelta = lambda x: x.beginning - x.end_shifted)
|
||||||
|
# .groupby(["sequence_type", "participant_id", "device_id", "date"])
|
||||||
|
# .agg({"average_timedelta": lambda x: np.mean(x)//1000})
|
||||||
|
# )
|
||||||
|
|
||||||
|
# # Calculate the average duration of sequences
|
||||||
|
# average_duration_df = (
|
||||||
|
# sequences_df
|
||||||
|
# .groupby(["sequence_type", "participant_id", "device_id", "date"])
|
||||||
|
# .agg({"duration": (lambda x: np.mean(x)//1000)})
|
||||||
|
# .rename(columns={"duration":"average_duration"})
|
||||||
|
# )
|
||||||
|
|
||||||
|
# # Merge into a single dataframe
|
||||||
|
# merged = pd.merge(
|
||||||
|
# pd.merge(usage_time_df, average_timedelta_df, left_index=True, right_index=True),
|
||||||
|
# average_duration_df,
|
||||||
|
# left_index=True,
|
||||||
|
# right_index=True
|
||||||
|
# )
|
||||||
|
|
||||||
|
# return merged
|
||||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue