# rapids/src/features/screen_features.py

# 25 lines
# 1.6 KiB
# Python

import pandas as pd
import itertools
from screen.screen_base import base_screen_features
# Script body: reads the screen deltas and phone-sensed-bins inputs, asks
# base_screen_features() for the feature table of one day segment, checks the
# column count against the requested features and writes the result as CSV.
# NOTE: `snakemake` is never imported in this file; it is expected to be
# injected into the namespace by whatever executes this script.

# Screen deltas input; the four local start/end columns are parsed as dates.
screen_data = pd.read_csv(
    snakemake.input["screen_deltas"],
    parse_dates=["local_start_date_time", "local_end_date_time", "local_start_date", "local_end_date"],
)

# Sensed-bins input, indexed by local_date. Every strictly positive cell is
# overwritten with 1 (presumably a sensed / not-sensed flag per bin -- confirm
# against the producer of this file).
phone_sensed_bins = pd.read_csv(
    snakemake.input["phone_sensed_bins"],
    parse_dates=["local_date"],
    index_col="local_date",
)
phone_sensed_bins[phone_sensed_bins > 0] = 1

day_segment = snakemake.params["day_segment"]

# Start from an empty frame that only has the merge key, so the outer merge
# below always yields a frame containing a "local_date" column.
screen_features = pd.DataFrame(columns=["local_date"])

# Parameters forwarded verbatim to base_screen_features().
params = {
    "reference_hour_first_use": snakemake.params["reference_hour_first_use"],
    "bin_size": snakemake.params["bin_size"],
    "requested_features_deltas": snakemake.params["features_deltas"],
    "requested_episode_types": snakemake.params["episode_types"],
}

# "firstuseafter" is expanded with the zero-padded two-digit reference hour
# (e.g. reference hour 3 -> "firstuseafter03"); other names are kept as-is.
first_use_feature = "firstuseafter" + "{0:0=2d}".format(params["reference_hour_first_use"])
requested_features_deltas = [
    first_use_feature if feature_name == "firstuseafter" else feature_name
    for feature_name in params["requested_features_deltas"]
]

# One expected feature name per (delta feature, episode type) combination.
requested_features = [
    "".join(feature)
    for feature in itertools.product(requested_features_deltas, params["requested_episode_types"])
]

computed_features = base_screen_features(screen_data, phone_sensed_bins, day_segment, params)
screen_features = screen_features.merge(computed_features, on="local_date", how="outer")

# Expected width: one column per requested feature plus "local_date".
# An explicit raise is used instead of `assert` so the check still runs when
# Python is started with -O (which removes assert statements).
if len(requested_features) + 1 != screen_features.shape[1]:
    raise ValueError(
        "The number of features in the output dataframe (={0}) does not match the expected value (={1} + 1). Verify your screen feature extraction functions".format(
            screen_features.shape[1], len(requested_features)
        )
    )

screen_features.to_csv(snakemake.output[0], index=False)