Select target columns (no parsing necessary).
parent
ac86221662
commit
50c0defca7
|
@ -422,7 +422,7 @@ if config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["COMPUTE"]:
|
||||||
|
|
||||||
# Targets (labels)
|
# Targets (labels)
|
||||||
if config["PARAMS_FOR_ANALYSIS"]["TARGET"]["COMPUTE"]:
|
if config["PARAMS_FOR_ANALYSIS"]["TARGET"]["COMPUTE"]:
|
||||||
files_to_compute.extend(expand("data/processed/targets/{pid}/parsed_targets.csv", pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/input.csv", pid=config["PIDS"]))
|
||||||
|
|
||||||
rule all:
|
rule all:
|
||||||
input:
|
input:
|
||||||
|
|
|
@ -650,5 +650,5 @@ PARAMS_FOR_ANALYSIS:
|
||||||
|
|
||||||
TARGET:
|
TARGET:
|
||||||
COMPUTE: True
|
COMPUTE: True
|
||||||
SCALE: [positive_affect, negative_affect]
|
LABEL: PANAS_negative_affect_mean
|
||||||
|
|
||||||
|
|
|
@ -28,11 +28,12 @@ rule baseline_features:
|
||||||
script:
|
script:
|
||||||
"../src/data/baseline_features.py"
|
"../src/data/baseline_features.py"
|
||||||
|
|
||||||
rule parse_targets:
|
rule select_target:
|
||||||
input:
|
input:
|
||||||
targets = "data/processed/features/{pid}/phone_esm.csv",
|
cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned_rapids.csv"
|
||||||
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
|
params:
|
||||||
|
target_variable = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"]
|
||||||
output:
|
output:
|
||||||
"data/processed/targets/{pid}/parsed_targets.csv"
|
"data/processed/models/individual_model/{pid}/input.csv"
|
||||||
script:
|
script:
|
||||||
"../src/models/parse_targets.py"
|
"../src/models/select_targets.py"
|
||||||
|
|
|
@ -1,6 +0,0 @@
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
|
|
||||||
# Pass-through step: load the table given as the rule's "targets" input and
# write it to the rule's first output without the index column.
# `snakemake` is expected to be provided by the Snakemake script runner.
parsed_targets = pd.read_csv(snakemake.input["targets"])
parsed_targets.to_csv(snakemake.output[0], index=False)
|
|
|
@ -0,0 +1,20 @@
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
# Build the model input table: keep every non-ESM column of the cleaned sensor
# features plus the phone_esm column(s) matching the requested target variable.
# `snakemake` is expected to be provided by the Snakemake script runner.
cleaned_sensor_features = pd.read_csv(snakemake.input["cleaned_sensor_features"])
target_variable = snakemake.params["target_variable"]

# Find all columns coming from phone_esm, since these are not features for our
# purposes and we will drop them.
column_names = cleaned_sensor_features.columns
esm_names = column_names[column_names.str.startswith("phone_esm_straw")]

# Substring match against the ESM column names; note that pandas interprets
# the pattern as a regular expression by default.
is_target = esm_names.str.contains(target_variable)
if not is_target.any():
    # Build ONE message string: passing several positional arguments to
    # ValueError would render the message as a tuple.
    raise ValueError(
        f"The requested target ({target_variable}) cannot be found in the dataset. "
        "Please check the names of phone_esm_ columns in "
        "all_sensor_features_cleaned_rapids.csv"
    )

# Of the phone_esm columns, only the one(s) matching the target variable are
# kept; every other phone_esm column is dropped.
esm_names_to_drop = esm_names[~is_target]
model_input = cleaned_sensor_features.drop(esm_names_to_drop, axis=1)

model_input.to_csv(snakemake.output[0], index=False)
|
Loading…
Reference in New Issue