Cluster by demand_control_ratio_quartile.

parent 2bb95657d8
commit 1516d1c000
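Context for the quartile column this commit switches to: a *_quartile feature is usually produced by binning the underlying ratio into four equal-frequency groups. A minimal sketch with pandas' qcut, on made-up values; the actual derivation in this repository may differ:

    import pandas as pd

    # Hypothetical per-participant ratios; illustrative only.
    df = pd.DataFrame({"limesurvey_demand_control_ratio": [0.6, 0.8, 0.9, 1.1, 1.2, 1.4, 1.5, 1.7]})

    # Bin into four equal-frequency groups (quartiles), labelled 1-4.
    df["limesurvey_demand_control_ratio_quartile"] = pd.qcut(
        df["limesurvey_demand_control_ratio"], q=4, labels=[1, 2, 3, 4]
    )
    print(df)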
@@ -12,3 +12,6 @@ __pycache__/
 /data/*input*.csv
 /data/daily*
 /data/intradaily*
+/data/30min*
+/presentation/*scores.csv
+/presentation/Results.ods
@@ -9,6 +9,7 @@ dependencies:
 - flake8
 - jupyterlab
 - jupytext
+- lightgbm
 - mypy
 - nodejs
 - pandas
@@ -31,7 +31,6 @@ from sklearn.impute import SimpleImputer
 
 from sklearn.dummy import DummyClassifier
 from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
-from lightgbm import LGBMClassifier
 import xgboost as xg
 
 from sklearn.cluster import KMeans
@@ -52,12 +51,12 @@ from machine_learning.classification_models import ClassificationModels
 
 # %% [markdown]
 # ## Set script's parameters
-n_clusters = 5 # Number of clusters (could be regarded as a hyperparameter)
+n_clusters = 4 # Number of clusters (could be regarded as a hyperparameter)
 cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
 n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
 
 # %% jupyter={"source_hidden": true}
-model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
+model_input = pd.read_csv("../data/30min_all_target_inputs/input_JCQ_job_demand_mean.csv")
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 
 clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
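For context on cv_method_str = 'logo' above: leave-one-group-out cross-validation is available in scikit-learn as LeaveOneGroupOut, with each participant forming one group ('pid' in this script). A minimal sketch on toy data; the variable names here are illustrative only:

    import numpy as np
    from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
    from sklearn.dummy import DummyClassifier

    # Toy data: six samples from three participants; 'pid' plays the group role.
    X = np.arange(12).reshape(6, 2)
    y = np.array([0, 1, 0, 1, 0, 1])
    groups = np.array([1, 1, 2, 2, 3, 3])

    # Each fold holds out all samples of exactly one participant.
    scores = cross_val_score(DummyClassifier(strategy="most_frequent"), X, y,
                             groups=groups, cv=LeaveOneGroupOut())
    print(scores)  # one accuracy per held-out participant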
@@ -66,7 +65,7 @@ model_input.columns[list(model_input.columns).index('age'):-1]
 
 lime_cols = [col for col in model_input if col.startswith('limesurvey')]
 lime_cols
-lime_col = 'limesurvey_demand_control_ratio'
+lime_col = 'limesurvey_demand_control_ratio_quartile'
 clust_col = lime_col
 
 model_input[clust_col].describe()
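Given the KMeans import and the n_clusters parameter above, clustering on a single column like clust_col presumably looks something like the following sketch; the data is made up, and this is only one plausible reading of how the script uses it:

    import numpy as np
    from sklearn.cluster import KMeans

    # Made-up per-row quartile values (1-4); KMeans expects a 2-D array.
    values = np.array([1, 1, 2, 2, 3, 3, 4, 4], dtype=float).reshape(-1, 1)

    # With four clusters on four distinct quartile values, the clusters
    # essentially recover the quartile labels.
    km = KMeans(n_clusters=4, n_init=10, random_state=0).fit(values)
    print(km.labels_)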
@@ -75,9 +74,11 @@ model_input[clust_col].describe()
 # %% jupyter={"source_hidden": true}
 
 # Filter-out outlier rows by clust_col
-model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
+#model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
 
+#print(model_input)
 uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
+uniq = uniq.dropna()
 plt.bar(uniq['pid'], uniq[clust_col])
 
 # %% jupyter={"source_hidden": true}
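A note on the z-score filter commented out above: scipy.stats.zscore propagates NaN by default, so on a column with missing values the mask is False everywhere and every row gets dropped. A minimal sketch of the pattern on made-up data, using nan_policy="omit" (available in newer SciPy) to score the non-missing values instead:

    import numpy as np
    import pandas as pd
    from scipy import stats

    # Made-up column: eleven ordinary values, one extreme value, one NaN.
    df = pd.DataFrame({"ratio": [1.0, 1.1, 0.9, 1.2, 1.0, 0.95,
                                 1.05, 1.1, 0.9, 1.0, 50.0, np.nan]})

    # Plain zscore() would return NaN for every row here and the mask
    # would drop everything; nan_policy="omit" avoids that.
    z = stats.zscore(df["ratio"], nan_policy="omit")
    filtered = df[np.abs(z) < 3]  # keeps ordinary rows, drops 50.0 and the NaN
    print(filtered)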
@@ -182,3 +183,14 @@ for k in range(n_clusters):
 # %% jupyter={"source_hidden": true}
 # Get overall results
 cm.get_total_models_scores(n_clusters=n_clusters)
+
+# %%
+final_scores = pd.DataFrame()
+for model in cmodels:
+    final_scores = pd.concat([final_scores,pd.DataFrame.from_dict(cmodels[model])])
+
+# %%
+final_scores
+
+# %%
+
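An aside on the aggregation loop added above: concatenating inside a loop re-copies the accumulated frame on every iteration; the usual pandas idiom is to collect the per-model frames first and concatenate once. A minimal sketch with a stand-in cmodels dict; the real structure produced by ClassificationModels may differ:

    import pandas as pd

    # Stand-in for cmodels: model name -> dict of per-fold score lists.
    cmodels = {
        "logistic_reg": {"test_accuracy": [0.71, 0.68], "test_f1": [0.70, 0.66]},
        "svc": {"test_accuracy": [0.74, 0.69], "test_f1": [0.73, 0.67]},
    }

    # Build one frame per model, then concatenate once, keeping the
    # model name as an index level.
    final_scores = pd.concat(
        [pd.DataFrame.from_dict(scores) for scores in cmodels.values()],
        keys=cmodels.keys(),
    )
    print(final_scores)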