Compare commits: 2e1e771b3d ... de10269d36 (3 commits)

Author | SHA1 | Date
---|---|---
junos | de10269d36 |
junos | 7e8e922d71 |
junos | c4aacfffe1 |
@@ -4,4 +4,17 @@
   <component name="PyCharmProfessionalAdvertiser">
     <option name="shown" value="true" />
   </component>
+  <component name="RMarkdownSettings">
+    <option name="renderProfiles">
+      <map>
+        <entry key="file://$PROJECT_DIR$/rapids/src/visualization/merge_heatmap_sensors_per_minute_per_time_segment.Rmd">
+          <value>
+            <RMarkdownRenderProfile>
+              <option name="outputDirectoryUrl" value="file://$PROJECT_DIR$/rapids/src/visualization" />
+            </RMarkdownRenderProfile>
+          </value>
+        </entry>
+      </map>
+    </option>
+  </component>
 </project>
File diff suppressed because it is too large
@@ -0,0 +1,323 @@ (new file, shown without "+" prefixes)
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %%
import os, sys
import importlib
import pandas as pd
import numpy as np

# import plotly.graph_objects as go
from importlib import util
from pathlib import Path
import yaml

# %%
phone_data_yield = pd.read_csv(
    "../rapids/data/interim/p011/phone_yielded_timestamps_with_datetime.csv",
    parse_dates=["local_date_time"],
)
time_segments_labels = pd.read_csv(
    "../rapids/data/interim/time_segments/p011_time_segments_labels.csv"
)

# %%
phone_data_yield["assigned_segments"] = phone_data_yield[
    "assigned_segments"
].str.replace(r"_RR\d+SS#", "#")
time_segments_labels["label"] = time_segments_labels["label"].str.replace(
    r"_RR\d+SS$", ""
)
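
# %%
# (Illustrative aside, with a made-up label.) The two replacements above strip
# the recurrence suffix that RAPIDS appends to repeated time segments, e.g.
# "daily_RR3SS" -> "daily". Note that newer pandas versions default
# str.replace to regex=False, so there the pattern must be passed explicitly:
pd.Series(["daily_RR3SS"]).str.replace(r"_RR\d+SS$", "", regex=True)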


# %% tags=[]
def filter_data_by_segment(data, time_segment):
    data.dropna(subset=["assigned_segments"], inplace=True)
    if data.shape[0] == 0:  # data is empty
        data["local_segment"] = data["timestamps_segment"] = None
        return data

    datetime_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
    timestamps_regex = "[0-9]{13}"
    segment_regex = "\[({}#{},{};{},{})\]".format(
        time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex
    )
    data["local_segment"] = data["assigned_segments"].str.extract(
        segment_regex, expand=True
    )
    data = data.drop(columns=["assigned_segments"])
    data = data.dropna(subset=["local_segment"])
    if (
        data.shape[0] == 0
    ):  # there are no rows belonging to time_segment after dropping NAs
        data["timestamps_segment"] = None
    else:
        data[["local_segment", "timestamps_segment"]] = data["local_segment"].str.split(
            pat=";", n=1, expand=True
        )

    # chunk episodes (chunk_episodes is expected to be defined elsewhere;
    # this helper is adapted from RAPIDS' feature utils)
    if (
        (not data.empty)
        and ("start_timestamp" in data.columns)
        and ("end_timestamp" in data.columns)
    ):
        data = chunk_episodes(data)

    return data
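

# %%
# (Illustrative aside, with one fabricated row.) The assigned_segments strings
# parsed above follow the pattern
# "[<label>#<start datetime>,<end datetime>;<start ms>,<end ms>]",
# which segment_regex captures into local_segment:
demo = pd.DataFrame(
    {
        "assigned_segments": [
            "[daily#2021-06-01 00:00:00,2021-06-01 23:59:59;1622498400000,1622584799000]"
        ]
    }
)
demo_datetime_regex = r"[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
demo_segment_regex = r"\[({}#{},{};{},{})\]".format(
    "daily", demo_datetime_regex, demo_datetime_regex, "[0-9]{13}", "[0-9]{13}"
)
demo["assigned_segments"].str.extract(demo_segment_regex, expand=True)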


# %% tags=[]
time_segment = "daily"
phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment)

# %%
phone_data_yield.tail()

# %%
phone_data_yield_per_segment.tail()


# %%
def getDataForPlot(phone_data_yield_per_segment):
    # calculate the length (in minutes) of each segment instance
    phone_data_yield_per_segment["length"] = (
        phone_data_yield_per_segment["timestamps_segment"]
        .str.split(",")
        .apply(lambda x: int((int(x[1]) - int(x[0])) / (1000 * 60)))
    )
    # calculate the number of sensors that logged at least one row of data per minute
    phone_data_yield_per_segment = (
        phone_data_yield_per_segment.groupby(
            ["local_segment", "length", "local_date", "local_hour", "local_minute"]
        )[["sensor", "local_date_time"]]
        .max()
        .reset_index()
    )
    # extract the local start datetime of the segment from the "local_segment" column
    phone_data_yield_per_segment["local_segment_start_datetimes"] = pd.to_datetime(
        phone_data_yield_per_segment["local_segment"].apply(
            lambda x: x.split("#")[1].split(",")[0]
        )
    )
    # calculate the number of minutes after the local start datetime of the segment
    phone_data_yield_per_segment["minutes_after_segment_start"] = (
        (
            phone_data_yield_per_segment["local_date_time"]
            - phone_data_yield_per_segment["local_segment_start_datetimes"]
        )
        / pd.Timedelta(minutes=1)
    ).astype("int")

    # impute missing rows with 0
    columns_for_full_index = phone_data_yield_per_segment[
        ["local_segment_start_datetimes", "length"]
    ].drop_duplicates(keep="first")
    columns_for_full_index = columns_for_full_index.apply(
        lambda row: [
            [row["local_segment_start_datetimes"], x] for x in range(row["length"] + 1)
        ],
        axis=1,
    )
    full_index = []
    for columns in columns_for_full_index:
        full_index = full_index + columns
    full_index = pd.MultiIndex.from_tuples(
        full_index,
        names=("local_segment_start_datetimes", "minutes_after_segment_start"),
    )
    phone_data_yield_per_segment = (
        phone_data_yield_per_segment.set_index(
            ["local_segment_start_datetimes", "minutes_after_segment_start"]
        )
        .reindex(full_index)
        .reset_index()
        .fillna(0)
    )

    # transpose the dataframe per local start datetime of the segment
    # and discard the useless index layer
    phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
        "local_segment_start_datetimes"
    )[["minutes_after_segment_start", "sensor"]].apply(
        lambda x: x.set_index("minutes_after_segment_start").transpose()
    )
    phone_data_yield_per_segment.index = phone_data_yield_per_segment.index.get_level_values(
        "local_segment_start_datetimes"
    )
    return phone_data_yield_per_segment
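

# %%
# (Illustrative aside, with a fabricated 3-minute segment.) The core of the
# imputation above: reindexing against a complete index of minutes fills the
# minutes without any logged rows with 0.
demo = pd.DataFrame({"minutes_after_segment_start": [0, 2], "sensor": [3, 1]})
demo_full = pd.Index(range(3), name="minutes_after_segment_start")
demo.set_index("minutes_after_segment_start").reindex(demo_full).fillna(0)
# minute 0 -> 3.0, minute 1 -> 0.0 (imputed), minute 2 -> 1.0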


# %%
data_for_plot_per_segment = getDataForPlot(phone_data_yield_per_segment)

# %%
# calculate the length (in minutes) of each segment instance
phone_data_yield_per_segment["length"] = (
    phone_data_yield_per_segment["timestamps_segment"]
    .str.split(",")
    .apply(lambda x: int((int(x[1]) - int(x[0])) / (1000 * 60)))
)

# %%
phone_data_yield_per_segment.tail()

# %%
# calculate the number of sensors that logged at least one row of data per minute
phone_data_yield_per_segment = (
    phone_data_yield_per_segment.groupby(
        ["local_segment", "length", "local_date", "local_hour", "local_minute"]
    )[["sensor", "local_date_time"]]
    .max()
    .reset_index()
)

# %%
# extract the local start datetime of the segment from the "local_segment" column
phone_data_yield_per_segment["local_segment_start_datetimes"] = pd.to_datetime(
    phone_data_yield_per_segment["local_segment"].apply(
        lambda x: x.split("#")[1].split(",")[0]
    )
)

# %%
# calculate the number of minutes after the local start datetime of the segment
phone_data_yield_per_segment["minutes_after_segment_start"] = (
    (
        phone_data_yield_per_segment["local_date_time"]
        - phone_data_yield_per_segment["local_segment_start_datetimes"]
    )
    / pd.Timedelta(minutes=1)
).astype("int")

# %%
columns_for_full_index = phone_data_yield_per_segment[
    ["local_segment_start_datetimes", "length"]
].drop_duplicates(keep="first")
columns_for_full_index = columns_for_full_index.apply(
    lambda row: [
        [row["local_segment_start_datetimes"], x] for x in range(row["length"] + 1)
    ],
    axis=1,
)

# %%
full_index = []
for columns in columns_for_full_index:
    full_index = full_index + columns
full_index = pd.MultiIndex.from_tuples(
    full_index, names=("local_segment_start_datetimes", "minutes_after_segment_start")
)

# %%
phone_data_yield_per_segment.tail()

# %% [markdown]
# # A workaround

# %%
# Note: a tuple key like df["a", "b"] does not select two columns, so this
# attempt does not deduplicate the index; see the retry below.
phone_data_yield_per_segment["local_segment_start_datetimes", "minutes_after_segment_start"] = phone_data_yield_per_segment[
    ["local_segment_start_datetimes", "minutes_after_segment_start"]
].drop_duplicates(keep="first")

# %%
phone_data_yield_per_segment.set_index(
    ["local_segment_start_datetimes", "minutes_after_segment_start"],
    verify_integrity=True,
).reindex(full_index)
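
# %%
# (Illustrative aside.) set_index(verify_integrity=True) raises a ValueError
# when the chosen keys contain duplicates, which is how the duplicate
# (segment start, minute) pairs surface here:
try:
    pd.DataFrame({"k": [1, 1], "v": [10, 20]}).set_index("k", verify_integrity=True)
except ValueError as e:
    print(e)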


# %%
phone_data_yield_per_segment.head()


# %% [markdown]
# # Retry

# %%
def getDataForPlot(phone_data_yield_per_segment):
    # calculate the length (in minutes) of each segment instance
    phone_data_yield_per_segment["length"] = (
        phone_data_yield_per_segment["timestamps_segment"]
        .str.split(",")
        .apply(lambda x: int((int(x[1]) - int(x[0])) / (1000 * 60)))
    )
    # calculate the number of sensors that logged at least one row of data per minute
    phone_data_yield_per_segment = (
        phone_data_yield_per_segment.groupby(
            ["local_segment", "length", "local_date", "local_hour", "local_minute"]
        )[["sensor", "local_date_time"]]
        .max()
        .reset_index()
    )
    # extract the local start datetime of the segment from the "local_segment" column
    phone_data_yield_per_segment["local_segment_start_datetimes"] = pd.to_datetime(
        phone_data_yield_per_segment["local_segment"].apply(
            lambda x: x.split("#")[1].split(",")[0]
        )
    )
    # calculate the number of minutes after the local start datetime of the segment
    phone_data_yield_per_segment["minutes_after_segment_start"] = (
        (
            phone_data_yield_per_segment["local_date_time"]
            - phone_data_yield_per_segment["local_segment_start_datetimes"]
        )
        / pd.Timedelta(minutes=1)
    ).astype("int")

    # impute missing rows with 0
    columns_for_full_index = phone_data_yield_per_segment[
        ["local_segment_start_datetimes", "length"]
    ].drop_duplicates(keep="first")
    columns_for_full_index = columns_for_full_index.apply(
        lambda row: [
            [row["local_segment_start_datetimes"], x] for x in range(row["length"] + 1)
        ],
        axis=1,
    )
    full_index = []
    for columns in columns_for_full_index:
        full_index = full_index + columns
    full_index = pd.MultiIndex.from_tuples(
        full_index,
        names=("local_segment_start_datetimes", "minutes_after_segment_start"),
    )
    # keep only the first row per (segment start, minute) pair, so that
    # reindexing against the full index cannot hit duplicate index labels
    phone_data_yield_per_segment = phone_data_yield_per_segment.drop_duplicates(
        subset=["local_segment_start_datetimes", "minutes_after_segment_start"],
        keep="first",
    )
    phone_data_yield_per_segment = (
        phone_data_yield_per_segment.set_index(
            ["local_segment_start_datetimes", "minutes_after_segment_start"]
        )
        .reindex(full_index)
        .reset_index()
        .fillna(0)
    )

    # transpose the dataframe per local start datetime of the segment
    # and discard the useless index layer
    phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
        "local_segment_start_datetimes"
    )[["minutes_after_segment_start", "sensor"]].apply(
        lambda x: x.set_index("minutes_after_segment_start").transpose()
    )
    phone_data_yield_per_segment.index = phone_data_yield_per_segment.index.get_level_values(
        "local_segment_start_datetimes"
    )
    return phone_data_yield_per_segment


# %%
phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment)

# %%
data_for_plot_per_segment = getDataForPlot(phone_data_yield_per_segment)

# %%
@@ -6,7 +6,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.11.4
+#       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
@@ -74,3 +74,29 @@ rows_os_manufacturer = df_category_not_found["package_name"].str.contains(
 # %%
 with pd.option_context("display.max_rows", None, "display.max_columns", None):
     display(df_category_not_found.loc[~rows_os_manufacturer])
+
+# %% [markdown]
+# # Export categories
+
+# %% [markdown]
+# Rename all of "not_found" to "System" or "Other".
+
+# %%
+df_app_categories_to_export = df_app_categories.copy()
+rows_os_manufacturer_full = (df_app_categories_to_export["package_name"].str.contains(
+    "|".join(manufacturers + custom_rom + other), case=False
+)) & (df_app_categories_to_export["play_store_genre"] == "not_found")
+df_app_categories_to_export.loc[rows_os_manufacturer_full, "play_store_genre"] = "System"
+
+# %%
+rows_not_found = (df_app_categories_to_export["play_store_genre"] == "not_found")
+df_app_categories_to_export.loc[rows_not_found, "play_store_genre"] = "Other"
+
+# %%
+df_app_categories_to_export["play_store_genre"].value_counts()
+
+# %%
+df_app_categories_to_export.rename(columns={"play_store_genre": "genre"}, inplace=True)
+df_app_categories_to_export.to_csv("../data/app_categories.csv", columns=["package_hash", "genre"], index=False)
+
+# %%
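For reference, the mask-and-assign pattern used by the export cells above, shown on two fabricated rows (the package names and vendor pattern are made up; df_app_categories, manufacturers, custom_rom, and other are defined earlier in that notebook, outside this hunk):

# %%
demo = pd.DataFrame(
    {
        "package_name": ["com.samsung.clock", "com.example.game"],
        "play_store_genre": ["not_found", "not_found"],
    }
)
demo_mask = demo["package_name"].str.contains("samsung|huawei", case=False) & (
    demo["play_store_genre"] == "not_found"
)
demo.loc[demo_mask, "play_store_genre"] = "System"  # the rest would become "Other"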
rapids (submodule)
@@ -1 +1 @@
-Subproject commit 4485c4c95e6eddad00fb6b5221d2946930394970
+Subproject commit e5cc02501f629c96641dfd1bcd1f7fcfd0d55462