diff --git a/exploration/ml_pipeline_classification.py b/exploration/ml_pipeline_classification.py
index 53607ad..3deae61 100644
--- a/exploration/ml_pipeline_classification.py
+++ b/exploration/ml_pipeline_classification.py
@@ -36,6 +36,8 @@ nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
     sys.path.append(nb_dir)
 
+import machine_learning.helper
+
 # %% [markdown]
 # # RAPIDS models
 
@@ -70,14 +72,20 @@ model_input['target'].value_counts()
 
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 # UnderSampling
 if undersampling:
-    model_input.groupby("pid").count()
-    no_stress = model_input[model_input['target'] == 0]
-    stress = model_input[model_input['target'] == 1]
+    model_input_new = pd.DataFrame(columns=model_input.columns)
+    for pid in model_input["pid"].unique():
+        stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 1)]
+        no_stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 0)]
+        if (len(stress) == 0):
+            continue
+        if (len(no_stress) == 0):
+            continue
+        model_input_new = pd.concat([model_input_new, stress], axis=0)
-    no_stress = no_stress.sample(n=len(stress))
-    model_input = pd.concat([stress,no_stress], axis=0)
-
-    model_input["target"].value_counts()
+        no_stress = no_stress.sample(n=min(len(stress), len(no_stress)))
+        # In case there are more stress samples than no_stress, take all instances of no_stress.
+        model_input_new = pd.concat([model_input_new, no_stress], axis=0)
+    model_input = model_input_new
 
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 
@@ -152,6 +160,18 @@ print("F1", np.mean(dummy_classifier['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dummy_classifier['test_accuracy'], n_sl)[:n_sl]))
 
+# %% [markdown] nteract={"transient": {"deleting": false}}
+# ### All models
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
+final_scores = machine_learning.helper.run_all_classification_models(imputer.fit_transform(train_x), data_y, data_groups, cv_method)
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
+# %%
+final_scores.index.name = "metric"
+final_scores = final_scores.set_index(["method", final_scores.index])
+final_scores.to_csv("../presentation/event_stressful_detection_5fold.csv")
+
 # %% [markdown]
 # ### Logistic Regression