From c4aacfffe103029511edd13051f1ebe9dcfaa32f Mon Sep 17 00:00:00 2001
From: junos
Date: Wed, 15 Dec 2021 16:03:49 +0100
Subject: [PATCH] Debug a ValueError in RAPIDS and add demo.

---
 exploration/debug_heatmap.py | 243 +++++++++++++++++++++++++++++++++++
 rapids                       |   2 +-
 2 files changed, 244 insertions(+), 1 deletion(-)
 create mode 100644 exploration/debug_heatmap.py

diff --git a/exploration/debug_heatmap.py b/exploration/debug_heatmap.py
new file mode 100644
index 0000000..e7df82f
--- /dev/null
+++ b/exploration/debug_heatmap.py
@@ -0,0 +1,243 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#     jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %%
+import pandas as pd
+
+# import plotly.graph_objects as go
+
+# %%
+phone_data_yield = pd.read_csv(
+    "../rapids/data/interim/p011/phone_yielded_timestamps_with_datetime.csv",
+    parse_dates=["local_date_time"],
+)
+time_segments_labels = pd.read_csv(
+    "../rapids/data/interim/time_segments/p011_time_segments_labels.csv"
+)
+
+# %%
+phone_data_yield["assigned_segments"] = phone_data_yield[
+    "assigned_segments"
+].str.replace(r"_RR\d+SS#", "#", regex=True)
+time_segments_labels["label"] = time_segments_labels["label"].str.replace(
+    r"_RR\d+SS$", "", regex=True
+)
+
+
+# %% tags=[]
+def filter_data_by_segment(data, time_segment):
+    # adapted from RAPIDS' feature utilities
+    data.dropna(subset=["assigned_segments"], inplace=True)
+    if data.shape[0] == 0:  # data is empty
+        data["local_segment"] = data["timestamps_segment"] = None
+        return data
+
+    datetime_regex = r"[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
+    timestamps_regex = r"[0-9]{13}"
+    segment_regex = r"\[({}#{},{};{},{})\]".format(
+        time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex
+    )
+    data["local_segment"] = data["assigned_segments"].str.extract(
+        segment_regex, expand=True
+    )
+    data = data.drop(columns=["assigned_segments"])
+    data = data.dropna(subset=["local_segment"])
+    if (
+        data.shape[0] == 0
+    ):  # there are no rows belonging to time_segment after dropping NAs
+        data["timestamps_segment"] = None
+    else:
+        data[["local_segment", "timestamps_segment"]] = data["local_segment"].str.split(
+            pat=";", n=1, expand=True
+        )
+
+    # chunk episodes (chunk_episodes is defined in RAPIDS; the yield data here
+    # has no start/end timestamp columns, so this branch is never taken)
+    if (
+        (not data.empty)
+        and ("start_timestamp" in data.columns)
+        and ("end_timestamp" in data.columns)
+    ):
+        data = chunk_episodes(data)
+
+    return data
+
+
+# %% tags=[]
+time_segment = "daily"
+phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment)
+
+# %%
+phone_data_yield.tail()
+
+# %%
+phone_data_yield_per_segment.tail()
+
+
+# %%
+def getDataForPlot(phone_data_yield_per_segment):
+    # calculate the length (in minutes) of each segment instance
+    phone_data_yield_per_segment["length"] = (
+        phone_data_yield_per_segment["timestamps_segment"]
+        .str.split(",")
+        .apply(lambda x: int((int(x[1]) - int(x[0])) / (1000 * 60)))
+    )
+    # calculate the number of sensors that logged at least one row of data per minute
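+    # (.max() collapses the data to a single row per segment-minute: the latest
+    # "local_date_time" within the minute and the largest "sensor" value, which
+    # upstream RAPIDS preprocessing has presumably set to a per-minute count)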
+    phone_data_yield_per_segment = (
+        phone_data_yield_per_segment.groupby(
+            ["local_segment", "length", "local_date", "local_hour", "local_minute"]
+        )[["sensor", "local_date_time"]]
+        .max()
+        .reset_index()
+    )
+    # extract the local start datetime of the segment from the "local_segment" column
+    phone_data_yield_per_segment["local_segment_start_datetimes"] = pd.to_datetime(
+        phone_data_yield_per_segment["local_segment"].apply(
+            lambda x: x.split("#")[1].split(",")[0]
+        )
+    )
+    # calculate the number of minutes after the local start datetime of the segment
+    phone_data_yield_per_segment["minutes_after_segment_start"] = (
+        (
+            phone_data_yield_per_segment["local_date_time"]
+            - phone_data_yield_per_segment["local_segment_start_datetimes"]
+        )
+        / pd.Timedelta(minutes=1)
+    ).astype("int")
+
+    # impute missing rows with 0
+    columns_for_full_index = phone_data_yield_per_segment[
+        ["local_segment_start_datetimes", "length"]
+    ].drop_duplicates(keep="first")
+    columns_for_full_index = columns_for_full_index.apply(
+        lambda row: [
+            [row["local_segment_start_datetimes"], x] for x in range(row["length"] + 1)
+        ],
+        axis=1,
+    )
+    full_index = []
+    for columns in columns_for_full_index:
+        full_index = full_index + columns
+    full_index = pd.MultiIndex.from_tuples(
+        full_index,
+        names=("local_segment_start_datetimes", "minutes_after_segment_start"),
+    )
+    phone_data_yield_per_segment = (
+        phone_data_yield_per_segment.set_index(
+            ["local_segment_start_datetimes", "minutes_after_segment_start"]
+        )
+        .reindex(full_index)
+        .reset_index()
+        .fillna(0)
+    )
+
+    # transpose the dataframe per local start datetime of the segment
+    # and drop the redundant index level
+    phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
+        "local_segment_start_datetimes"
+    )[["minutes_after_segment_start", "sensor"]].apply(
+        lambda x: x.set_index("minutes_after_segment_start").transpose()
+    )
+    phone_data_yield_per_segment.index = phone_data_yield_per_segment.index.get_level_values(
+        "local_segment_start_datetimes"
+    )
+    return phone_data_yield_per_segment
+
+
+# %%
+# running the whole function reproduces the ValueError under investigation
+# (raised in the reindex step; see the step-by-step replay below)
+data_for_plot_per_segment = getDataForPlot(phone_data_yield_per_segment)
+
+# %%
+# calculate the length (in minutes) of each segment instance
+phone_data_yield_per_segment["length"] = (
+    phone_data_yield_per_segment["timestamps_segment"]
+    .str.split(",")
+    .apply(lambda x: int((int(x[1]) - int(x[0])) / (1000 * 60)))
+)
+
+# %%
+phone_data_yield_per_segment.tail()
+
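+# %% [markdown]
+# The surrounding cells replay the body of `getDataForPlot` step by step to
+# locate the failure. `reindex` is the culprit: pandas raises a `ValueError`
+# when asked to reindex a DataFrame whose MultiIndex contains duplicate
+# entries. A minimal sketch with made-up toy data (not RAPIDS output) that
+# reproduces the error:
+
+# %%
+toy = pd.DataFrame(
+    {"start": ["2021-12-15", "2021-12-15"], "minute": [0, 0], "sensor": [1, 2]}
+).set_index(["start", "minute"])  # note the duplicated ("2021-12-15", 0) index
+toy_full_index = pd.MultiIndex.from_tuples(
+    [("2021-12-15", 0), ("2021-12-15", 1)], names=("start", "minute")
+)
+try:
+    toy.reindex(toy_full_index)
+except ValueError as e:
+    print(e)  # e.g. "cannot handle a non-unique multi-index!"
+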
+# %%
+# calculate the number of sensors that logged at least one row of data per minute
+phone_data_yield_per_segment = (
+    phone_data_yield_per_segment.groupby(
+        ["local_segment", "length", "local_date", "local_hour", "local_minute"]
+    )[["sensor", "local_date_time"]]
+    .max()
+    .reset_index()
+)
+
+# %%
+# extract the local start datetime of the segment from the "local_segment" column
+phone_data_yield_per_segment["local_segment_start_datetimes"] = pd.to_datetime(
+    phone_data_yield_per_segment["local_segment"].apply(
+        lambda x: x.split("#")[1].split(",")[0]
+    )
+)
+
+# %%
+# calculate the number of minutes after the local start datetime of the segment
+phone_data_yield_per_segment["minutes_after_segment_start"] = (
+    (
+        phone_data_yield_per_segment["local_date_time"]
+        - phone_data_yield_per_segment["local_segment_start_datetimes"]
+    )
+    / pd.Timedelta(minutes=1)
+).astype("int")
+
+# %%
+columns_for_full_index = phone_data_yield_per_segment[
+    ["local_segment_start_datetimes", "length"]
+].drop_duplicates(keep="first")
+columns_for_full_index = columns_for_full_index.apply(
+    lambda row: [
+        [row["local_segment_start_datetimes"], x] for x in range(row["length"] + 1)
+    ],
+    axis=1,
+)
+
+# %%
+full_index = []
+for columns in columns_for_full_index:
+    full_index = full_index + columns
+full_index = pd.MultiIndex.from_tuples(
+    full_index, names=("local_segment_start_datetimes", "minutes_after_segment_start")
+)
+
+# %%
+phone_data_yield_per_segment.tail()
+
+# %% [markdown]
+# # A workaround
+
+# %%
+# keeping only the two index columns and dropping duplicate pairs makes the
+# index unique, so set_index(verify_integrity=True) and reindex both succeed
+phone_data_yield_per_segment = phone_data_yield_per_segment[
+    ["local_segment_start_datetimes", "minutes_after_segment_start"]
+].drop_duplicates(keep="first")
+
+# %%
+phone_data_yield_per_segment.set_index(
+    ["local_segment_start_datetimes", "minutes_after_segment_start"],
+    verify_integrity=True,
+).reindex(full_index)
+
+# %%
+phone_data_yield_per_segment.head()
+
+# %%
diff --git a/rapids b/rapids
index 4485c4c..d2ed73d 160000
--- a/rapids
+++ b/rapids
@@ -1 +1 @@
-Subproject commit 4485c4c95e6eddad00fb6b5221d2946930394970
+Subproject commit d2ed73dccfac65ce503c1b510182fe5ef1516508