stress_at_work_analysis/exploration/debug_heatmap.py

338 lines
10 KiB
Python

# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %%
import pandas as pd
from rapids.src.features.utils.utils import chunk_episodes
# %%
phone_data_yield = pd.read_csv(
"../rapids/data/interim/p011/phone_yielded_timestamps_with_datetime.csv",
parse_dates=["local_date_time"],
)
time_segments_labels = pd.read_csv(
"../rapids/data/interim/time_segments/p011_time_segments_labels.csv"
)
# %%
phone_data_yield["assigned_segments"] = phone_data_yield[
"assigned_segments"
].str.replace(r"_RR\d+SS#", "#", regex=True)
time_segments_labels["label"] = time_segments_labels["label"].str.replace(
r"_RR\d+SS$", "", regex=True
)
# %% tags=[]
def filter_data_by_segment(data, time_segment_current):
data.dropna(subset=["assigned_segments"], inplace=True)
if data.shape[0] == 0: # data is empty
data["local_segment"] = data["timestamps_segment"] = None
return data
datetime_regex = (
r"[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
)
timestamps_regex = r"[0-9]{13}"
segment_regex = r"\[({}#{},{};{},{})\]".format(
time_segment_current,
datetime_regex,
datetime_regex,
timestamps_regex,
timestamps_regex,
)
data["local_segment"] = data["assigned_segments"].str.extract(
segment_regex, expand=True
)
data = data.drop(columns=["assigned_segments"])
data = data.dropna(subset=["local_segment"])
if (
data.shape[0] == 0
): # there are no rows belonging to time_segment after droping na
data["timestamps_segment"] = None
else:
data[["local_segment", "timestamps_segment"]] = data["local_segment"].str.split(
pat=";", n=1, expand=True
)
# chunk episodes
if (
(not data.empty)
and ("start_timestamp" in data.columns)
and ("end_timestamp" in data.columns)
):
data = chunk_episodes(data)
return data
# %% tags=[]
time_segment = "daily"
phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment)
# %%
phone_data_yield.tail()
# %%
phone_data_yield_per_segment.tail()
# %%
def getDataForPlot(phone_data_yield_per_segment):
# calculate the length (in minute) of per segment instance
phone_data_yield_per_segment["length"] = (
phone_data_yield_per_segment["timestamps_segment"]
.str.split(",")
.apply(lambda x: int((int(x[1]) - int(x[0])) / (1000 * 60)))
)
# calculate the number of sensors logged at least one row of data per minute.
phone_data_yield_per_segment = (
phone_data_yield_per_segment.groupby(
["local_segment", "length", "local_date", "local_hour", "local_minute"]
)[["sensor", "local_date_time"]]
.max()
.reset_index()
)
# extract local start datetime of the segment from "local_segment" column
phone_data_yield_per_segment["local_segment_start_datetimes"] = pd.to_datetime(
phone_data_yield_per_segment["local_segment"].apply(
lambda x: x.split("#")[1].split(",")[0]
)
)
# calculate the number of minutes after local start datetime of the segment
phone_data_yield_per_segment["minutes_after_segment_start"] = (
(
phone_data_yield_per_segment["local_date_time"]
- phone_data_yield_per_segment["local_segment_start_datetimes"]
)
/ pd.Timedelta(minutes=1)
).astype("int")
# impute missing rows with 0
columns_for_full_index = phone_data_yield_per_segment[
["local_segment_start_datetimes", "length"]
].drop_duplicates(keep="first")
columns_for_full_index = columns_for_full_index.apply(
lambda row: [
[row["local_segment_start_datetimes"], x] for x in range(row["length"] + 1)
],
axis=1,
)
full_index = []
for columns in columns_for_full_index:
full_index = full_index + columns
full_index = pd.MultiIndex.from_tuples(
full_index,
names=("local_segment_start_datetimes", "minutes_after_segment_start"),
)
phone_data_yield_per_segment = (
phone_data_yield_per_segment.set_index(
["local_segment_start_datetimes", "minutes_after_segment_start"]
)
.reindex(full_index)
.reset_index()
.fillna(0)
)
# transpose the dataframe per local start datetime of the segment
# and discard the useless index layer
phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
"local_segment_start_datetimes"
)[["minutes_after_segment_start", "sensor"]].apply(
lambda x: x.set_index("minutes_after_segment_start").transpose()
)
phone_data_yield_per_segment.index = (
phone_data_yield_per_segment.index.get_level_values(
"local_segment_start_datetimes"
)
)
return phone_data_yield_per_segment
# %%
data_for_plot_per_segment = getDataForPlot(phone_data_yield_per_segment)
# %%
# calculate the length (in minute) of per segment instance
phone_data_yield_per_segment["length"] = (
phone_data_yield_per_segment["timestamps_segment"]
.str.split(",")
.apply(lambda x: int((int(x[1]) - int(x[0])) / (1000 * 60)))
)
# %%
phone_data_yield_per_segment.tail()
# %%
# calculate the number of sensors logged at least one row of data per minute.
phone_data_yield_per_segment = (
phone_data_yield_per_segment.groupby(
["local_segment", "length", "local_date", "local_hour", "local_minute"]
)[["sensor", "local_date_time"]]
.max()
.reset_index()
)
# %%
# extract local start datetime of the segment from "local_segment" column
phone_data_yield_per_segment["local_segment_start_datetimes"] = pd.to_datetime(
phone_data_yield_per_segment["local_segment"].apply(
lambda x: x.split("#")[1].split(",")[0]
)
)
# %%
# calculate the number of minutes after local start datetime of the segment
phone_data_yield_per_segment["minutes_after_segment_start"] = (
(
phone_data_yield_per_segment["local_date_time"]
- phone_data_yield_per_segment["local_segment_start_datetimes"]
)
/ pd.Timedelta(minutes=1)
).astype("int")
# %%
columns_for_full_index = phone_data_yield_per_segment[
["local_segment_start_datetimes", "length"]
].drop_duplicates(keep="first")
columns_for_full_index = columns_for_full_index.apply(
lambda row: [
[row["local_segment_start_datetimes"], x] for x in range(row["length"] + 1)
],
axis=1,
)
# %%
full_index = []
for columns in columns_for_full_index:
full_index = full_index + columns
full_index = pd.MultiIndex.from_tuples(
full_index, names=("local_segment_start_datetimes", "minutes_after_segment_start")
)
# %%
phone_data_yield_per_segment.tail()
# %% [markdown]
# # A workaround
# %%
phone_data_yield_per_segment[
"local_segment_start_datetimes", "minutes_after_segment_start"
] = phone_data_yield_per_segment[
["local_segment_start_datetimes", "minutes_after_segment_start"]
].drop_duplicates(
keep="first"
)
# %%
phone_data_yield_per_segment.set_index(
["local_segment_start_datetimes", "minutes_after_segment_start"],
verify_integrity=True,
).reindex(full_index)
# %%
phone_data_yield_per_segment.head()
# %% [markdown]
# # Retry
# %%
def get_data_for_plot(phone_data_yield_per_segment):
# calculate the length (in minute) of per segment instance
phone_data_yield_per_segment["length"] = (
phone_data_yield_per_segment["timestamps_segment"]
.str.split(",")
.apply(lambda x: int((int(x[1]) - int(x[0])) / (1000 * 60)))
)
# calculate the number of sensors logged at least one row of data per minute.
phone_data_yield_per_segment = (
phone_data_yield_per_segment.groupby(
["local_segment", "length", "local_date", "local_hour", "local_minute"]
)[["sensor", "local_date_time"]]
.max()
.reset_index()
)
# extract local start datetime of the segment from "local_segment" column
phone_data_yield_per_segment["local_segment_start_datetimes"] = pd.to_datetime(
phone_data_yield_per_segment["local_segment"].apply(
lambda x: x.split("#")[1].split(",")[0]
)
)
# calculate the number of minutes after local start datetime of the segment
phone_data_yield_per_segment["minutes_after_segment_start"] = (
(
phone_data_yield_per_segment["local_date_time"]
- phone_data_yield_per_segment["local_segment_start_datetimes"]
)
/ pd.Timedelta(minutes=1)
).astype("int")
# impute missing rows with 0
columns_for_full_index = phone_data_yield_per_segment[
["local_segment_start_datetimes", "length"]
].drop_duplicates(keep="first")
columns_for_full_index = columns_for_full_index.apply(
lambda row: [
[row["local_segment_start_datetimes"], x] for x in range(row["length"] + 1)
],
axis=1,
)
full_index = []
for columns in columns_for_full_index:
full_index = full_index + columns
full_index = pd.MultiIndex.from_tuples(
full_index,
names=("local_segment_start_datetimes", "minutes_after_segment_start"),
)
phone_data_yield_per_segment = phone_data_yield_per_segment.drop_duplicates(
subset=["local_segment_start_datetimes", "minutes_after_segment_start"],
keep="first",
)
phone_data_yield_per_segment = (
phone_data_yield_per_segment.set_index(
["local_segment_start_datetimes", "minutes_after_segment_start"]
)
.reindex(full_index)
.reset_index()
.fillna(0)
)
# transpose the dataframe per local start datetime of the segment
# and discard the useless index layer
phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
"local_segment_start_datetimes"
)[["minutes_after_segment_start", "sensor"]].apply(
lambda x: x.set_index("minutes_after_segment_start").transpose()
)
phone_data_yield_per_segment.index = (
phone_data_yield_per_segment.index.get_level_values(
"local_segment_start_datetimes"
)
)
return phone_data_yield_per_segment
# %%
phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment)
# %%
data_for_plot_per_segment = get_data_for_plot(phone_data_yield_per_segment)
# %%