Try a different approach for preprocessing ESMs.

It is important that this follows the generic RAPIDS pattern.
The subsequent step, which calculates features,
expects a specific file and folder structure under data/interim.
See find_features_files() in rules/common.smk.
labels
junos 2022-04-05 18:02:31 +02:00
parent ed298a9479
commit 99245afca3
3 changed files with 20 additions and 16 deletions

View File

@ -168,8 +168,7 @@ for provider in config["PHONE_ESM"]["PROVIDERS"].keys():
if config["PHONE_ESM"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_esm_raw.csv",pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_esm_with_datetime.csv",pid=config["PIDS"]))
for scale in config["PHONE_ESM"]["PROVIDERS"][provider]["SCALES"]:
files_to_compute.extend(expand("data/interim/{pid}/phone_esm_{scale}_clean.csv",pid=config["PIDS"],scale=scale))
files_to_compute.extend(expand("data/interim/{pid}/phone_esm_features/phone_esm_clean_{provider_key}.csv",pid=config["PIDS"],provider_key=provider.lower()))
#files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv",pid=config["PIDS"]))
#files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")

View File

@ -327,8 +327,10 @@ rule conversation_r_features:
rule preprocess_esm:
input: "data/raw/{pid}/phone_esm_with_datetime.csv"
params:
questionnaire_name = "{scale}"
output: "data/interim/{pid}/phone_esm_{scale}_clean.csv"
provider=lambda wildcards: config["PHONE_ESM"]["PROVIDERS"][wildcards.provider_key.upper()],
provider_key="{provider_key}",
sensor_key="phone_esm"
output: "data/interim/{pid}/phone_esm_features/phone_esm_clean_{provider_key}.csv"
script:
"../src/features/phone_esm/straw/preprocess.py"

View File

@ -1,21 +1,24 @@
from esm_preprocess import *
from esm_JCQ import reverse_jcq_demand_control_scoring
questionnaire_name = snakemake.params["questionnaire_name"]
requested_scales = provider["SCALES"]
df_esm = pd.read_csv(snakemake.input[0])
df_esm_preprocessed = preprocess_esm(df_esm)
try:
questionnaire_id = QUESTIONNAIRE_IDS[questionnaire_name]
except ValueError:
if not all([scale in QUESTIONNAIRE_IDS for scale in requested_scales]):
unknown_scales = set(requested_scales) - set(QUESTIONNAIRE_IDS.keys())
print("The requested questionnaire name should be one of the following:")
print(QUESTIONNAIRE_IDS.keys())
else:
df_esm_selected = df_esm_preprocessed[df_esm_preprocessed["questionnaire_id"] == questionnaire_id]
df_esm_clean = clean_up_esm(df_esm_selected)
if questionnaire_name.startswith("JCQ"):
df_esm_reversed = reverse_jcq_demand_control_scoring(df_esm_clean)
df_esm_reversed.to_csv(snakemake.output[0])
else:
df_esm_clean.to_csv(snakemake.output[0])
raise ValueError("You requested scales not collected: ", unknown_scales)
df_esm_clean = clean_up_esm(df_esm_preprocessed)
df_esm_clean["esm_user_score"] = df_esm_clean["esm_user_answer_numeric"]
for scale in requested_scales:
questionnaire_id = QUESTIONNAIRE_IDS[scale]
mask = df_esm_clean["questionnaire_id"] == questionnaire_id
if scale.startswith("JCQ"):
df_esm_clean.loc[mask] = reverse_jcq_demand_control_scoring(df_esm_clean.loc[mask])
df_esm_clean.to_csv(snakemake.output[0])