Use stratified downsampling.
Also run all models with a method from machine_learning.helper.ml_pipeline.
parent
b0b9edccc4
commit
72fdd9c5ec
|
@ -36,6 +36,8 @@ nb_dir = os.path.split(os.getcwd())[0]
|
||||||
if nb_dir not in sys.path:
|
if nb_dir not in sys.path:
|
||||||
sys.path.append(nb_dir)
|
sys.path.append(nb_dir)
|
||||||
|
|
||||||
|
import machine_learning.helper
|
||||||
|
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# # RAPIDS models
|
# # RAPIDS models
|
||||||
|
|
||||||
|
@ -70,14 +72,20 @@ model_input['target'].value_counts()
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
# UnderSampling
|
# UnderSampling
|
||||||
if undersampling:
|
if undersampling:
|
||||||
model_input.groupby("pid").count()
|
model_input_new = pd.DataFrame(columns=model_input.columns)
|
||||||
no_stress = model_input[model_input['target'] == 0]
|
for pid in model_input["pid"].unique():
|
||||||
stress = model_input[model_input['target'] == 1]
|
stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 1)]
|
||||||
|
no_stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 0)]
|
||||||
|
if (len(stress) == 0):
|
||||||
|
continue
|
||||||
|
if (len(no_stress) == 0):
|
||||||
|
continue
|
||||||
|
model_input_new = pd.concat([model_input_new, stress], axis=0)
|
||||||
|
|
||||||
no_stress = no_stress.sample(n=len(stress))
|
no_stress = no_stress.sample(n=min(len(stress), len(no_stress)))
|
||||||
model_input = pd.concat([stress,no_stress], axis=0)
|
# In case there are more stress samples than no_stress, take all instances of no_stress.
|
||||||
|
model_input_new = pd.concat([model_input_new, no_stress], axis=0)
|
||||||
model_input["target"].value_counts()
|
model_input = model_input_new
|
||||||
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
@ -152,6 +160,18 @@ print("F1", np.mean(dummy_classifier['test_f1']))
|
||||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
|
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dummy_classifier['test_accuracy'], n_sl)[:n_sl]))
|
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dummy_classifier['test_accuracy'], n_sl)[:n_sl]))
|
||||||
|
|
||||||
|
# %% [markdown] nteract={"transient": {"deleting": false}}
|
||||||
|
# ### All models
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
||||||
|
final_scores = machine_learning.helper.run_all_classification_models(imputer.fit_transform(train_x), data_y, data_groups, cv_method)
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
||||||
|
# %%
|
||||||
|
final_scores.index.name = "metric"
|
||||||
|
final_scores = final_scores.set_index(["method", final_scores.index])
|
||||||
|
final_scores.to_csv("../presentation/event_stressful_detection_5fold.csv")
|
||||||
|
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# ### Logistic Regression
|
# ### Logistic Regression
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue