Clean features across participants.

Explore the best linear regression feature.
rapids
junos 2022-01-19 13:41:09 +01:00
parent 832eb6137e
commit e177b15058
1 changed files with 75 additions and 0 deletions

View File

@ -21,12 +21,14 @@ import os
import sys import sys
import numpy as np import numpy as np
import matplotlib.pyplot as plt
import pandas as pd import pandas as pd
import seaborn as sns import seaborn as sns
import yaml import yaml
from pyprojroot import here from pyprojroot import here
from sklearn import linear_model from sklearn import linear_model
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer from sklearn.impute import SimpleImputer
nb_dir = os.path.split(os.getcwd())[0] nb_dir = os.path.split(os.getcwd())[0]
@ -378,3 +380,76 @@ sns.heatmap(features_labels[feature_columns].isna(), cbar=False)
# ``` # ```
# %% # %%
# Load the cleaned RAPIDS sensor features for all participants and index them
# by (participant_id, date_lj).
features_rapids_cleaned = pd.read_csv(
    here("rapids/data/processed/features/all_participants/all_sensor_features_cleaned_rapids.csv"),
    parse_dates=["local_segment_start_datetime", "local_segment_end_datetime"],
)
# date_lj is the calendar date on which each segment starts.
features_rapids_cleaned = features_rapids_cleaned.assign(
    date_lj=lambda x: x.local_segment_start_datetime.dt.date
)
# Take the first run of digits in `pid` as a numeric participant id.
# The pattern is a raw string so that "\d" is not treated as an (invalid)
# string escape, and expand=False makes str.extract return a Series rather
# than a one-column DataFrame.
features_rapids_cleaned["participant_id"] = pd.to_numeric(
    features_rapids_cleaned["pid"].str.extract(r"(\d+)", expand=False)
)
features_rapids_cleaned.set_index(["participant_id", "date_lj"], inplace=True)
# %%
# Keep only the rows that have both cleaned features and labels. The join
# matches on the index; features_rapids_cleaned is indexed by
# (participant_id, date_lj).
# NOTE(review): presumably labels_read carries the same index - confirm.
features_with_labels = features_rapids_cleaned.join(labels_read, how="inner")
features_cleaned_labels = features_with_labels.reset_index()
# Feature columns are selected by position: the first 6 and the last 3
# columns are skipped.
# NOTE(review): presumably those are identifier and label columns - confirm
# against the actual column layout.
feature_clean_columns = features_cleaned_labels.columns[slice(6, -3)]
# %%
# Print the number of feature columns in the earlier selection
# (feature_columns) and in the cleaned selection (feature_clean_columns).
for candidate_columns in (feature_columns, feature_clean_columns):
    print(candidate_columns.shape)
# %%
# Draw a heatmap of the missing-value mask of the cleaned feature columns.
sns.set(rc={"figure.figsize": (16, 8)})
missing_mask = features_cleaned_labels[feature_clean_columns].isna()
sns.heatmap(missing_mask, cbar=False)
# %%
# Linear regression on the cleaned features, with a leave-one-group-out
# splitter grouped by group_column.
lin_reg_rapids_clean = linear_model.LinearRegression()
logo = LeaveOneGroupOut()
# Number of splits the splitter will produce for this data.
logo.get_n_splits(
    X=features_cleaned_labels[feature_clean_columns],
    y=features_cleaned_labels[label_column],
    groups=features_cleaned_labels[group_column],
)
# %%
# Fill missing values in the cleaned feature columns.
# NOTE(review): `imputer` is defined earlier in the file; fit_transform re-fits
# it on this data. If it is a SimpleImputer with default settings, columns
# that are entirely NaN may be dropped from the output - confirm.
raw_clean_features = features_cleaned_labels[feature_clean_columns]
features_clean_imputed = imputer.fit_transform(raw_clean_features)
# %%
# R^2 of the linear model for each leave-one-group-out split.
# NOTE(review): the features were imputed on the full data set before the
# split, so imputation statistics include the held-out group - confirm this
# is acceptable for the reported scores.
cross_val_score(
    estimator=lin_reg_rapids_clean,
    X=features_clean_imputed,
    y=features_cleaned_labels[label_column],
    groups=features_cleaned_labels[group_column],
    scoring="r2",
    cv=logo,
    n_jobs=-1,
)
# %%
# Fit a single linear model on all rows of the imputed, cleaned features.
lin_reg_full = linear_model.LinearRegression()
lin_reg_full.fit(
    X=features_clean_imputed,
    y=features_cleaned_labels[label_column],
)
# %%
# In-sample predictions: the model predicts the same rows it was fitted on.
NA_pred = lin_reg_full.predict(X=features_clean_imputed)
# %%
# Report the fitted coefficients, followed by the error metrics of NA_pred
# against the observed labels.
observed_labels = features_cleaned_labels[label_column]
print("Coefficients: \n", lin_reg_full.coef_)
# Mean squared error between observed labels and predictions.
prediction_mse = mean_squared_error(observed_labels, NA_pred)
print("Mean squared error: %.2f" % prediction_mse)
# Coefficient of determination; 1 would be a perfect prediction.
prediction_r2 = r2_score(observed_labels, NA_pred)
print("Coefficient of determination: %.2f" % prediction_r2)
# %%
# Name of the feature whose fitted coefficient is the largest (signed) value.
# NOTE(review): this assumes the imputed matrix kept every column of
# feature_clean_columns in the same order - confirm the imputer dropped none.
largest_coef_position = np.argmax(lin_reg_full.coef_)
feature_clean_columns[largest_coef_position]
# %% [markdown]
# Ratio between stationary time and total location sensed time. A lat/long coordinate pair is labeled as stationary if its speed (distance/time) to the next coordinate pair is less than 1km/hr. A higher value represents a more stationary routine.
# %%
# Scatter the feature with the largest (signed) coefficient against the
# observed labels (black) and against the full-model predictions (red).
# The column is selected once and reused for both scatter calls instead of
# recomputing the argmax each time.
best_feature_values = features_clean_imputed[:, np.argmax(lin_reg_full.coef_)]
plt.scatter(best_feature_values, features_cleaned_labels[label_column], color="black")
plt.scatter(best_feature_values, NA_pred, color="red", linewidth=3)
# The argument-less plt.xticks() / plt.yticks() calls were dropped: without
# arguments they only return the current ticks and change nothing.
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()