# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %%
import pandas as pd

from rapids.src.features.utils.utils import chunk_episodes

# %%
phone_data_yield = pd.read_csv(
    "../rapids/data/interim/p011/phone_yielded_timestamps_with_datetime.csv",
    parse_dates=["local_date_time"],
)
time_segments_labels = pd.read_csv(
    "../rapids/data/interim/time_segments/p011_time_segments_labels.csv"
)

# %%
# Strip the "_RR<digits>SS" suffix from the segment labels, both inside the
# assigned segments strings and in the labels table.
phone_data_yield["assigned_segments"] = phone_data_yield[
    "assigned_segments"
].str.replace(r"_RR\d+SS#", "#", regex=True)
time_segments_labels["label"] = time_segments_labels["label"].str.replace(
    r"_RR\d+SS$", "", regex=True
)


# %% tags=[]
def filter_data_by_segment(data, time_segment_current):
    """Return the rows of ``data`` that belong to ``time_segment_current``.

    Rows with a missing ``assigned_segments`` value are dropped.  For the
    remaining rows, the first entry of the form
    ``[<label>#<datetime>,<datetime>;<timestamp>,<timestamp>]`` whose label is
    ``time_segment_current`` is extracted; rows without such an entry are
    dropped.  The entry is then split at its first ``;`` into the
    ``local_segment`` and ``timestamps_segment`` columns, and
    ``assigned_segments`` is removed.  If the result is not empty and has both
    ``start_timestamp`` and ``end_timestamp`` columns, it is passed through
    ``chunk_episodes``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a string column ``assigned_segments``.
    time_segment_current : str
        Segment label.  It is inserted into a regular expression unescaped.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is left untouched.  When no row has an
        assigned segment, the (empty) frame is returned early with
        ``local_segment`` and ``timestamps_segment`` set to ``None`` and with
        ``assigned_segments`` still present.
    """
    # Take an explicit copy instead of dropping rows in place, so that neither
    # the row removal nor the column assignments below leak into the caller's
    # frame.
    data = data.dropna(subset=["assigned_segments"]).copy()
    if data.shape[0] == 0:  # data is empty
        data["local_segment"] = data["timestamps_segment"] = None
        return data

    datetime_regex = (
        r"[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
    )
    timestamps_regex = r"[0-9]{13}"
    segment_regex = r"\[({}#{},{};{},{})\]".format(
        time_segment_current,
        datetime_regex,
        datetime_regex,
        timestamps_regex,
        timestamps_regex,
    )
    data["local_segment"] = data["assigned_segments"].str.extract(
        segment_regex, expand=True
    )
    data = data.drop(columns=["assigned_segments"])
    data = data.dropna(subset=["local_segment"])
    if (
        data.shape[0] == 0
    ):  # there are no rows belonging to time_segment after dropping na
        data["timestamps_segment"] = None
    else:
        # "<label>#<start>,<end>;<start_ts>,<end_ts>" -> two columns
        data[["local_segment", "timestamps_segment"]] = data["local_segment"].str.split(
            pat=";", n=1, expand=True
        )

    # chunk episodes
    if (
        (not data.empty)
        and ("start_timestamp" in data.columns)
        and ("end_timestamp" in data.columns)
    ):
        data = chunk_episodes(data)

    return data


# %% tags=[]
time_segment = "daily"
phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment)

# %%
phone_data_yield.tail()

# %%
phone_data_yield_per_segment.tail()


# %%
def _build_plot_data(segment_data):
    """Reshape per-row segment data into one row per segment, one column per minute.

    Shared implementation behind ``getDataForPlot`` and ``get_data_for_plot``.

    Parameters
    ----------
    segment_data : pandas.DataFrame
        Needs the columns ``timestamps_segment``, ``local_segment``,
        ``local_date``, ``local_hour``, ``local_minute``, ``sensor`` and
        ``local_date_time`` (the latter as datetimes).

    Returns
    -------
    pandas.DataFrame
        Indexed by the segment start datetime, with one column per value of
        ``minutes_after_segment_start`` holding the ``sensor`` value for that
        minute (0 where the minute had no data).  ``segment_data`` is not
        modified.
    """
    # Copy first: adding "length" directly would alter the caller's frame.
    segment_data = segment_data.copy()

    # calculate the length (in minute) of per segment instance
    # NOTE(review): dividing by 1000 * 60 presumes millisecond timestamps.
    segment_data["length"] = (
        segment_data["timestamps_segment"]
        .str.split(",")
        .apply(lambda bounds: int((int(bounds[1]) - int(bounds[0])) / (1000 * 60)))
    )

    # Collapse to one row per segment and minute, keeping the maximum "sensor"
    # and "local_date_time" seen in that minute.
    # NOTE(review): "sensor" presumably counts the sensors that logged data in
    # the minute - confirm against the upstream file.
    per_minute = (
        segment_data.groupby(
            ["local_segment", "length", "local_date", "local_hour", "local_minute"]
        )[["sensor", "local_date_time"]]
        .max()
        .reset_index()
    )

    # extract local start datetime of the segment from "local_segment" column
    per_minute["local_segment_start_datetimes"] = pd.to_datetime(
        per_minute["local_segment"].apply(lambda x: x.split("#")[1].split(",")[0])
    )

    # calculate the number of minutes after local start datetime of the segment
    per_minute["minutes_after_segment_start"] = (
        (per_minute["local_date_time"] - per_minute["local_segment_start_datetimes"])
        / pd.Timedelta(minutes=1)
    ).astype("int")

    # Build the complete (segment start, minute) index: minutes 0..length for
    # every segment instance.
    segment_lengths = per_minute[
        ["local_segment_start_datetimes", "length"]
    ].drop_duplicates(keep="first")
    full_index = pd.MultiIndex.from_tuples(
        [
            (segment_start, minute)
            for segment_start, length in segment_lengths.itertuples(
                index=False, name=None
            )
            for minute in range(length + 1)
        ],
        names=("local_segment_start_datetimes", "minutes_after_segment_start"),
    )

    # reindex() refuses a non-unique MultiIndex, so keep only the first row for
    # each (segment start, minute) pair before reindexing.
    per_minute = per_minute.drop_duplicates(
        subset=["local_segment_start_datetimes", "minutes_after_segment_start"],
        keep="first",
    )

    # impute missing rows with 0
    per_minute = (
        per_minute.set_index(
            ["local_segment_start_datetimes", "minutes_after_segment_start"]
        )
        .reindex(full_index)
        .reset_index()
        .fillna(0)
    )

    # transpose the dataframe per local start datetime of the segment
    # and discard the useless index layer
    plot_data = per_minute.groupby("local_segment_start_datetimes")[
        ["minutes_after_segment_start", "sensor"]
    ].apply(lambda x: x.set_index("minutes_after_segment_start").transpose())
    plot_data.index = plot_data.index.get_level_values("local_segment_start_datetimes")
    return plot_data


def getDataForPlot(phone_data_yield_per_segment):
    """Return the per-segment, per-minute ``sensor`` matrix used for plotting.

    First attempt at this function, kept under its original camelCase name.
    It now de-duplicates (segment start, minute) pairs before reindexing and
    no longer adds a ``length`` column to its argument.

    Parameters
    ----------
    phone_data_yield_per_segment : pandas.DataFrame
        Output of ``filter_data_by_segment``.

    Returns
    -------
    pandas.DataFrame
        One row per segment start datetime, one column per minute after it.
    """
    return _build_plot_data(phone_data_yield_per_segment)


# %%
data_for_plot_per_segment = getDataForPlot(phone_data_yield_per_segment)

# %%
# The following cells repeat the steps of the function above one at a time.
# calculate the length (in minute) of per segment instance
phone_data_yield_per_segment["length"] = (
    phone_data_yield_per_segment["timestamps_segment"]
    .str.split(",")
    .apply(lambda x: int((int(x[1]) - int(x[0])) / (1000 * 60)))
)

# %%
phone_data_yield_per_segment.tail()

# %%
# keep, per segment and minute, the maximum "sensor" and "local_date_time"
phone_data_yield_per_segment = (
    phone_data_yield_per_segment.groupby(
        ["local_segment", "length", "local_date", "local_hour", "local_minute"]
    )[["sensor", "local_date_time"]]
    .max()
    .reset_index()
)

# %%
# extract local start datetime of the segment from "local_segment" column
phone_data_yield_per_segment["local_segment_start_datetimes"] = pd.to_datetime(
    phone_data_yield_per_segment["local_segment"].apply(
        lambda x: x.split("#")[1].split(",")[0]
    )
)

# %%
# calculate the number of minutes after local start datetime of the segment
phone_data_yield_per_segment["minutes_after_segment_start"] = (
    (
        phone_data_yield_per_segment["local_date_time"]
        - phone_data_yield_per_segment["local_segment_start_datetimes"]
    )
    / pd.Timedelta(minutes=1)
).astype("int")

# %%
# one list of [segment start, minute] pairs per segment instance
columns_for_full_index = phone_data_yield_per_segment[
    ["local_segment_start_datetimes", "length"]
].drop_duplicates(keep="first")
columns_for_full_index = columns_for_full_index.apply(
    lambda row: [
        [row["local_segment_start_datetimes"], x] for x in range(row["length"] + 1)
    ],
    axis=1,
)

# %%
# flatten the per-segment lists into a single MultiIndex
index_pairs = []
for columns in columns_for_full_index:
    index_pairs.extend(columns)
full_index = pd.MultiIndex.from_tuples(
    index_pairs, names=("local_segment_start_datetimes", "minutes_after_segment_start")
)

# %%
phone_data_yield_per_segment.tail()

# %% [markdown]
# # A workaround

# %%
# Keep only the first row for every (segment start, minute) pair so that the
# pair can serve as a unique index.
phone_data_yield_per_segment = phone_data_yield_per_segment.drop_duplicates(
    subset=["local_segment_start_datetimes", "minutes_after_segment_start"],
    keep="first",
)

# %%
# verify_integrity=True makes set_index raise if any duplicate pair is left
phone_data_yield_per_segment.set_index(
    ["local_segment_start_datetimes", "minutes_after_segment_start"],
    verify_integrity=True,
).reindex(full_index)

# %%
phone_data_yield_per_segment.head()

# %% [markdown]
# # Retry


# %%
def get_data_for_plot(phone_data_yield_per_segment):
    """Return the per-segment, per-minute ``sensor`` matrix used for plotting.

    Duplicate (segment start, minute) pairs are reduced to their first row
    before the data is reindexed to the full range of minutes; minutes without
    data are filled with 0.  The argument is not modified.

    Parameters
    ----------
    phone_data_yield_per_segment : pandas.DataFrame
        Output of ``filter_data_by_segment``.

    Returns
    -------
    pandas.DataFrame
        One row per segment start datetime, one column per minute after it.
    """
    return _build_plot_data(phone_data_yield_per_segment)


# %%
phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment)

# %%
data_for_plot_per_segment = get_data_for_plot(phone_data_yield_per_segment)

# %%