import itertools

import pandas as pd
from scipy.stats import entropy


def compute_metrics(filtered_data, apps_type, day_segment, metrics, apps_features):
    # Each requested metric becomes one column per day ("local_date"),
    # named "apps_<day_segment>_<metric><apps_type>".
    if "count" in metrics:
        apps_features["apps_" + day_segment + "_count" + apps_type] = filtered_data.groupby(["local_date"]).count()["timestamp"]
    if "timeoffirstuse" in metrics:
        # First event of the day, expressed as minutes since midnight
        time_first_event = filtered_data.sort_values(by="timestamp", ascending=True).drop_duplicates(subset="local_date", keep="first").set_index("local_date")
        apps_features["apps_" + day_segment + "_timeoffirstuse" + apps_type] = time_first_event["local_hour"] * 60 + time_first_event["local_minute"]
    if "timeoflastuse" in metrics:
        # Last event of the day, expressed as minutes since midnight
        time_last_event = filtered_data.sort_values(by="timestamp", ascending=False).drop_duplicates(subset="local_date", keep="first").set_index("local_date")
        apps_features["apps_" + day_segment + "_timeoflastuse" + apps_type] = time_last_event["local_hour"] * 60 + time_last_event["local_minute"]
    if "frequencyentropy" in metrics:
        # Shannon entropy of the per-app event counts within each day
        apps_with_count = filtered_data.groupby(["local_date", "application_name"]).count().sort_values(by="timestamp", ascending=False).reset_index()
        apps_features["apps_" + day_segment + "_frequencyentropy" + apps_type] = apps_with_count.groupby("local_date")["timestamp"].agg(entropy)
    return apps_features
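

# Illustrative only, never called by the pipeline: a minimal sketch of how
# the "frequencyentropy" feature behaves. scipy.stats.entropy normalizes
# raw per-app event counts into a probability distribution before taking
# the Shannon entropy (in nats). The toy numbers below are made up.
def _example_frequency_entropy():
    launches_per_app = pd.Series([3, 1])  # e.g. 3 events for app A, 1 for app B
    # entropy([3, 1]) == entropy([0.75, 0.25]) ~= 0.562 nats;
    # a day spent entirely in a single app yields 0.0
    return entropy(launches_per_app)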
apps_data = pd.read_csv(snakemake.input[0], parse_dates=["local_date_time", "local_date"])
day_segment = snakemake.params["day_segment"]
single_categories = snakemake.params["single_categories"]
multiple_categories_with_genres = snakemake.params["multiple_categories"]
single_apps = snakemake.params["single_apps"]
excluded_categories = snakemake.params["excluded_categories"]
excluded_apps = snakemake.params["excluded_apps"]
metrics = snakemake.params["metrics"]

single_categories = list(set(single_categories) - set(excluded_categories))
multiple_categories = list(multiple_categories_with_genres.keys() - set(excluded_categories))
apps = list(set(single_apps) - set(excluded_apps))

# Exclude categories in the excluded_categories list
if "system_apps" in excluded_categories:
    apps_data = apps_data[apps_data["is_system_app"] == 0]
apps_data = apps_data[~apps_data["genre"].isin(excluded_categories)]
# Exclude apps in the excluded_apps list
apps_data = apps_data[~apps_data["application_name"].isin(excluded_apps)]
# Keep a copy, before any day-segment filtering, for the top1global computation
apps_data_global = apps_data.copy()

# Pre-build the full column set so an empty input still yields a file with
# the expected header; indexing by local_date keeps the final reset_index
# consistent with the non-empty branch below
column_names = ["apps_" + day_segment + "_" + metric + target
                for metric, target in itertools.product(metrics, single_categories + multiple_categories + apps)]
apps_features = pd.DataFrame(columns=["local_date"] + column_names).set_index("local_date")

if not apps_data.empty:
    apps_features = pd.DataFrame()
    if day_segment != "daily":
        apps_data = apps_data[apps_data["local_day_segment"] == day_segment]

    # Single categories ("all" means every remaining event)
    for sc in single_categories:
        if sc == "all":
            apps_features = compute_metrics(apps_data, "all", day_segment, metrics, apps_features)
        else:
            filtered_data = apps_data[apps_data["genre"].isin([sc])]
            apps_features = compute_metrics(filtered_data, sc, day_segment, metrics, apps_features)

    # Multiple categories: each maps to a user-defined list of genres
    for mc in multiple_categories:
        filtered_data = apps_data[apps_data["genre"].isin(multiple_categories_with_genres[mc])]
        apps_features = compute_metrics(filtered_data, mc, day_segment, metrics, apps_features)

    # Single apps; "top1global" resolves to the participant's most used app
    for app in apps:
        col_name = app
        if app == "top1global":
            apps_with_count = apps_data_global.groupby(["local_date", "package_name"]).count().sort_values(by="timestamp", ascending=False).reset_index()
            app = apps_with_count.iloc[0]["package_name"]
            col_name = "top1global"
        filtered_data = apps_data[apps_data["package_name"].isin([app])]
        apps_features = compute_metrics(filtered_data, col_name, day_segment, metrics, apps_features)

apps_features = apps_features.reset_index()
apps_features.to_csv(snakemake.output[0], index=False)
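
# A minimal sketch of the kind of Snakemake rule assumed to drive this
# script. The param names match what the script reads above; the rule name,
# file paths, and example values are hypothetical, not from the source:
#
#   rule app_features:
#       input: "data/processed/{pid}/applications_foreground_with_datetime.csv"
#       params:
#           day_segment = "{day_segment}",
#           single_categories = ["all", "email"],
#           multiple_categories = {"social": ["Communication", "Social"]},
#           single_apps = ["top1global", "com.google.android.youtube"],
#           excluded_categories = ["system_apps"],
#           excluded_apps = [],
#           metrics = ["count", "timeoffirstuse", "timeoflastuse", "frequencyentropy"]
#       output: "data/processed/{pid}/apps_{day_segment}.csv"
#       script: "app_features.py"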