Rename baseline features AGAIN.

Correct other mistakes.
labels
junos 2022-04-12 16:55:01 +02:00
parent 9ab0c8f289
commit 4ad261fae5
3 changed files with 18 additions and 16 deletions

View File

@@ -415,7 +415,7 @@ for provider in config["ALL_CLEANING_OVERALL"]["PROVIDERS"].keys():
# Baseline features # Baseline features
if config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["COMPUTE"]: if config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["COMPUTE"]:
files_to_compute.extend("data/raw/baseline_merged.csv") files_to_compute.extend(expand("data/raw/baseline_merged.csv"))
files_to_compute.extend(expand("data/raw/{pid}/participant_baseline_raw.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/raw/{pid}/participant_baseline_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/baseline_questionnaires.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/baseline_questionnaires.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/baseline_features.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/processed/features/{pid}/baseline_features.csv", pid=config["PIDS"]))
@@ -423,7 +423,9 @@ if config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["COMPUTE"]:
# Targets (labels) # Targets (labels)
if config["PARAMS_FOR_ANALYSIS"]["TARGET"]["COMPUTE"]: if config["PARAMS_FOR_ANALYSIS"]["TARGET"]["COMPUTE"]:
files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/input.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/input.csv", pid=config["PIDS"]))
files_to_compute.extend("data/processed/models/population_model/input.csv") files_to_compute.extend(expand("data/processed/models/population_model/input.csv"))
#files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/output_{cv_method}/baselines.csv", pid=config["PIDS"], cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"]))
rule all: rule all:
input: input:

View File

@@ -54,7 +54,7 @@ rule model_individual_baselines:
"data/processed/models/individual_model/{pid}/input.csv" "data/processed/models/individual_model/{pid}/input.csv"
params: params:
cv_method = "{cv_method}", cv_method = "{cv_method}",
colnames_demographic_features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["FEATURES"], colnames_demographic_features = config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["FEATURES"],
output: output:
"data/processed/models/individual_model/{pid}/output_{cv_method}/baselines.csv" "data/processed/models/individual_model/{pid}/output_{cv_method}/baselines.csv"
log: log:

View File

@@ -60,15 +60,15 @@ if not participant_info.empty:
0, "startlanguage" 0, "startlanguage"
] ]
if ( if (
("demand" in requested_features) ("limesurvey_demand" in requested_features)
or ("control" in requested_features) or ("limesurvey_control" in requested_features)
or ("demand_control_ratio" in requested_features) or ("limesurvey_demand_control_ratio" in requested_features)
): ):
participant_info_t = participant_info.T participant_info_t = participant_info.T
rows_baseline = participant_info_t.index rows_baseline = participant_info_t.index
if ("demand" in requested_features) or ( if ("limesurvey_demand" in requested_features) or (
"demand_control_ratio" in requested_features "limesurvey_demand_control_ratio" in requested_features
): ):
# Find questions about demand, but disregard time (duration of filling in questionnaire) # Find questions about demand, but disregard time (duration of filling in questionnaire)
rows_demand = rows_baseline.str.startswith( rows_demand = rows_baseline.str.startswith(
@@ -96,12 +96,12 @@ if not participant_info.empty:
) )
baseline_interim = pd.concat([baseline_interim, limesurvey_demand], axis=0, ignore_index=True) baseline_interim = pd.concat([baseline_interim, limesurvey_demand], axis=0, ignore_index=True)
if "demand" in requested_features: if "demand" in requested_features:
baseline_features.loc[0, "demand"] = limesurvey_demand[ baseline_features.loc[0, "limesurvey_demand"] = limesurvey_demand[
"score" "score"
].sum() ].sum()
if ("control" in requested_features) or ( if ("limesurvey_control" in requested_features) or (
"demand_control_ratio" in requested_features "limesurvey_demand_control_ratio" in requested_features
): ):
# Find questions about control, but disregard time (duration of filling in questionnaire) # Find questions about control, but disregard time (duration of filling in questionnaire)
rows_control = rows_baseline.str.startswith( rows_control = rows_baseline.str.startswith(
@@ -130,12 +130,12 @@ if not participant_info.empty:
baseline_interim = pd.concat([baseline_interim, limesurvey_control], axis=0, ignore_index=True) baseline_interim = pd.concat([baseline_interim, limesurvey_control], axis=0, ignore_index=True)
if "control" in requested_features: if "limesurvey_control" in requested_features:
baseline_features.loc[0, "control"] = limesurvey_control[ baseline_features.loc[0, "limesurvey_control"] = limesurvey_control[
"score" "score"
].sum() ].sum()
if "demand_control_ratio" in requested_features: if "limesurvey_demand_control_ratio" in requested_features:
limesurvey_demand_control_ratio = ( limesurvey_demand_control_ratio = (
limesurvey_demand["score"].sum() / limesurvey_control["score"].sum() limesurvey_demand["score"].sum() / limesurvey_control["score"].sum()
) )
@@ -167,10 +167,10 @@ if not participant_info.empty:
limesurvey_quartile = np.nan limesurvey_quartile = np.nan
baseline_features.loc[ baseline_features.loc[
0, "demand_control_ratio" 0, "limesurvey_demand_control_ratio"
] = limesurvey_demand_control_ratio ] = limesurvey_demand_control_ratio
baseline_features.loc[ baseline_features.loc[
0, "demand_control_ratio_quartile" 0, "limesurvey_demand_control_ratio_quartile"
] = limesurvey_quartile ] = limesurvey_quartile
if not baseline_interim.empty: if not baseline_interim.empty: