70 lines
3.9 KiB
Python
70 lines
3.9 KiB
Python
"""This file is TEMPORARY and intended for testing main.py
|
|
"""
|
|
|
|
def filter_data_by_segment(data, time_segment):
    """Keep only the rows of ``data`` assigned to ``time_segment``.

    Each row's "assigned_segments" column is expected to contain zero or more
    entries of the form ``[label#start_datetime,end_datetime;start_ts,end_ts]``
    (timestamps in milliseconds). The entry whose label equals ``time_segment``
    is extracted and split into two new columns:

    - "local_segment": ``label#start_datetime,end_datetime``
    - "timestamps_segment": ``start_ts,end_ts``

    Rows with no matching segment are dropped and "assigned_segments" is
    removed. If the result has both "start_timestamp" and "end_timestamp"
    columns, episodes are additionally clipped/merged via ``chunk_episodes``.

    NOTE: the initial NaN filter uses ``inplace=True``, so the caller's
    DataFrame loses its NaN "assigned_segments" rows as a side effect.
    """
    data.dropna(subset=["assigned_segments"], inplace=True)
    if data.empty:
        # Keep the expected output schema even when there is nothing to match.
        data["local_segment"] = data["timestamps_segment"] = None
        return data

    # Raw strings: "\-", "\/" and "\[" are invalid escape sequences in
    # regular string literals (SyntaxWarning since Python 3.12). The regex
    # content itself is unchanged.
    datetime_regex = r"[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
    timestamps_regex = r"[0-9]{13}"  # unix timestamp in milliseconds
    segment_regex = r"\[({}#{},{};{},{})\]".format(time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex)

    data["local_segment"] = data["assigned_segments"].str.extract(segment_regex, expand=True)
    data = data.drop(columns=["assigned_segments"])
    data = data.dropna(subset=["local_segment"])

    if data.empty:  # no rows belong to time_segment after dropping non-matches
        data["timestamps_segment"] = None
    else:
        # "label#datetimes;timestamps" -> ("label#datetimes", "timestamps")
        data[["local_segment", "timestamps_segment"]] = data["local_segment"].str.split(pat=";", n=1, expand=True)

    # Chunk episodes: clip every episode to its segment window and merge.
    if (not data.empty) and ("start_timestamp" in data.columns) and ("end_timestamp" in data.columns):
        data = chunk_episodes(data)

    return data
|
|
|
|
def chunk_episodes(sensor_episodes):
    """Clip sensor episodes to their time-segment windows and merge them.

    For every row, the episode ``[start_timestamp, end_timestamp]`` is
    intersected with its segment window (parsed from "timestamps_segment",
    millisecond unix timestamps). Rows that agree on every non-timing column
    are then merged: their clipped durations are summed and the merged episode
    spans from the first chunk's start to the last chunk's end. Finally,
    naive local datetimes ("local_start_date_time"/"local_end_date_time",
    second precision) are derived per "local_timezone" group.

    Returns a new DataFrame; the input is not modified.
    """
    import pandas as pd

    # Deduplicate: drop repeated (start, end, segment) episodes.
    sensor_episodes = sensor_episodes.drop_duplicates(subset=["start_timestamp", "end_timestamp", "local_segment"], keep="first")

    # Delete local-datetime helper columns. errors="ignore" tolerates inputs
    # that never had them (the previous `del` raised KeyError in that case).
    # drop() also returns a fresh frame, so no extra copy() is needed to
    # avoid SettingWithCopyWarning below.
    sensor_episodes = sensor_episodes.drop(
        columns=["local_date_time", "local_date", "local_time", "local_hour", "local_minute"],
        errors="ignore")

    # Unix timestamps (milliseconds) delimiting the current segment.
    sensor_episodes[["segment_start_timestamp", "segment_end_timestamp"]] = sensor_episodes["timestamps_segment"].str.split(",", expand=True).astype(int)

    # Clip each episode to its segment: the chunk is the intersection of the
    # episode interval and the segment interval.
    sensor_episodes["chunked_start_timestamp"] = sensor_episodes[["start_timestamp", "segment_start_timestamp"]].max(axis=1)
    sensor_episodes["chunked_end_timestamp"] = sensor_episodes[["end_timestamp", "segment_end_timestamp"]].min(axis=1)

    # Duration of the intersection, in minutes.
    sensor_episodes["duration"] = (sensor_episodes["chunked_end_timestamp"] - sensor_episodes["chunked_start_timestamp"]) / (1000 * 60)

    # Merge episodes: group by every column except the per-row timing ones
    # and sum the chunked durations.
    cols_for_groupby = [col for col in sensor_episodes.columns if col not in ["timestamps_segment", "timestamp", "assigned_segments", "start_datetime", "end_datetime", "start_timestamp", "end_timestamp", "duration", "chunked_start_timestamp", "chunked_end_timestamp"]]

    sensor_episodes_grouped = sensor_episodes.groupby(by=cols_for_groupby, sort=False, dropna=False)
    merged_sensor_episodes = sensor_episodes_grouped[["duration"]].sum()

    # A merged episode runs from its first chunk's start to its last chunk's end.
    merged_sensor_episodes["start_timestamp"] = sensor_episodes_grouped["chunked_start_timestamp"].first()
    merged_sensor_episodes["end_timestamp"] = sensor_episodes_grouped["chunked_end_timestamp"].last()

    merged_sensor_episodes.reset_index(inplace=True)

    # Convert to naive local datetimes per timezone group, truncating to
    # whole seconds. Mixed timezones make the concatenated result object
    # dtype, hence the element-wise apply instead of .dt accessors.
    merged_sensor_episodes["local_start_date_time"] = pd.to_datetime(merged_sensor_episodes["start_timestamp"], unit="ms", utc=True)
    merged_sensor_episodes["local_start_date_time"] = pd.concat([data["local_start_date_time"].dt.tz_convert(tz) for tz, data in merged_sensor_episodes.groupby("local_timezone")]).apply(lambda x: x.tz_localize(None).replace(microsecond=0))

    merged_sensor_episodes["local_end_date_time"] = pd.to_datetime(merged_sensor_episodes["end_timestamp"], unit="ms", utc=True)
    merged_sensor_episodes["local_end_date_time"] = pd.concat([data["local_end_date_time"].dt.tz_convert(tz) for tz, data in merged_sensor_episodes.groupby("local_timezone")]).apply(lambda x: x.tz_localize(None).replace(microsecond=0))

    return merged_sensor_episodes