Fill in NaN values in the Doryab bluetooth and location features.

notes
Primoz 2022-09-15 10:48:59 +00:00
parent 3cf7ca41aa
commit 18002f59e1
3 changed files with 30 additions and 9 deletions

View File

@ -15,8 +15,8 @@ def deviceFeatures(devices, ownership, common_devices, features_to_compute, feat
if "meanscans" in features_to_compute: if "meanscans" in features_to_compute:
features = features.join(device_value_counts.groupby("local_segment")["scans"].mean().to_frame("meanscans" + ownership), how="outer") features = features.join(device_value_counts.groupby("local_segment")["scans"].mean().to_frame("meanscans" + ownership), how="outer")
if "stdscans" in features_to_compute: if "stdscans" in features_to_compute:
# TODO: std scans # TODO: check if std scans implementation works
features = features.join(device_value_counts.groupby("local_segment")["scans"].std().to_frame("stdscans" + ownership), how="outer") features = features.join(device_value_counts.groupby("local_segment")["scans"].std().to_frame("stdscans" + ownership).fillna(0), how="outer")
# Most frequent device within segments, across segments, and across dataset # Most frequent device within segments, across segments, and across dataset
if "countscansmostfrequentdevicewithinsegments" in features_to_compute: if "countscansmostfrequentdevicewithinsegments" in features_to_compute:
features = features.join(device_value_counts.groupby("local_segment")["scans"].max().to_frame("countscansmostfrequentdevicewithinsegments" + ownership), how="outer") features = features.join(device_value_counts.groupby("local_segment")["scans"].max().to_frame("countscansmostfrequentdevicewithinsegments" + ownership), how="outer")

View File

@ -36,9 +36,10 @@ def variance_and_logvariance_features(location_data, location_features):
location_data["latitude_for_wvar"] = (location_data["double_latitude"] - location_data["latitude_wavg"]) ** 2 * location_data["duration"] * 60 location_data["latitude_for_wvar"] = (location_data["double_latitude"] - location_data["latitude_wavg"]) ** 2 * location_data["duration"] * 60
location_data["longitude_for_wvar"] = (location_data["double_longitude"] - location_data["longitude_wavg"]) ** 2 * location_data["duration"] * 60 location_data["longitude_for_wvar"] = (location_data["double_longitude"] - location_data["longitude_wavg"]) ** 2 * location_data["duration"] * 60
# TODO: location variance
location_features["locationvariance"] = ((location_data_grouped["latitude_for_wvar"].sum() + location_data_grouped["longitude_for_wvar"].sum()) / (location_data_grouped["duration"].sum() * 60 - 1)).fillna(0) location_features["locationvariance"] = ((location_data_grouped["latitude_for_wvar"].sum() + location_data_grouped["longitude_for_wvar"].sum()) / (location_data_grouped["duration"].sum() * 60 - 1)).fillna(0)
location_features["loglocationvariance"] = np.log10(location_features["locationvariance"]).replace(-np.inf, np.nan)
# TODO: loglocationvariance - check if the implementation works
location_features["loglocationvariance"] = np.log10(location_features["locationvariance"]).replace(-np.inf, -1000000)
return location_features return location_features
@ -113,8 +114,6 @@ def location_entropy(location_data):
entropy = -1 * location_data.groupby(["local_segment"])[["plogp"]].sum().rename(columns={"plogp": "locationentropy"}) entropy = -1 * location_data.groupby(["local_segment"])[["plogp"]].sum().rename(columns={"plogp": "locationentropy"})
entropy["num_clusters"] = location_data.groupby(["local_segment"])["cluster_label"].nunique() entropy["num_clusters"] = location_data.groupby(["local_segment"])["cluster_label"].nunique()
# TODO: normalizedlocationentropy
entropy["normalizedlocationentropy"] = entropy["locationentropy"] / entropy["num_clusters"] entropy["normalizedlocationentropy"] = entropy["locationentropy"] / entropy["num_clusters"]
return entropy return entropy
@ -156,7 +155,6 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
# distance and speed features # distance and speed features
moving_data = location_data[location_data["is_stationary"] == 0].copy() moving_data = location_data[location_data["is_stationary"] == 0].copy()
location_features = location_features.merge(distance_and_speed_features(moving_data), how="outer", left_index=True, right_index=True) location_features = location_features.merge(distance_and_speed_features(moving_data), how="outer", left_index=True, right_index=True)
# TODO: why is varspeed not filled with 0?
location_features[["totaldistance", "avgspeed", "varspeed"]] = location_features[["totaldistance", "avgspeed", "varspeed"]].fillna(0) location_features[["totaldistance", "avgspeed", "varspeed"]] = location_features[["totaldistance", "avgspeed", "varspeed"]].fillna(0)
# stationary features # stationary features

View File

@ -13,9 +13,18 @@ df_bt = df[doryab_cols_bt]
print(len(doryab_cols_bt)) print(len(doryab_cols_bt))
print(df_bt) print(df_bt)
sns.heatmap(df_bt, xticklabels=1) df_bt = df_bt.dropna(axis=0, how="all")
sns.heatmap(df_bt.isna(), xticklabels=1)
plt.savefig(f'bluetooth_doryab_values', bbox_inches='tight') plt.savefig(f'bluetooth_doryab_values', bbox_inches='tight')
df_q = pd.DataFrame()
for col in df_bt:
df_q[col] = pd.to_numeric(pd.cut(df_bt[col], bins=[-1,0,0.000000000001,1000], labels=[-1,0,1], right=False))
sns.heatmap(df_q, cbar=False, xticklabels=1)
plt.savefig(f'cut_bluetooth_doryab_values', bbox_inches='tight')
plt.close()
# Location # Location
doryab_cols_loc = [col for col in df.columns if "locations_doryab" in col] doryab_cols_loc = [col for col in df.columns if "locations_doryab" in col]
df_loc = df[doryab_cols_loc] df_loc = df[doryab_cols_loc]
@ -23,6 +32,20 @@ df_loc = df[doryab_cols_loc]
print(len(doryab_cols_loc)) print(len(doryab_cols_loc))
print(df_loc) print(df_loc)
sns.heatmap(df_loc, xticklabels=1) df_loc = df_loc.dropna(axis=0, how="all").reset_index(drop=True)
print(df_loc)
sns.heatmap(df_loc.isna())
plt.savefig(f'locations_doryab_values', bbox_inches='tight') plt.savefig(f'locations_doryab_values', bbox_inches='tight')
df_q = pd.DataFrame()
for col in df_loc:
df_q[col] = pd.to_numeric(pd.cut(df_loc[col], bins=[-1,0,0.000000000001,1000], labels=[-1,0,1], right=False))
sns.heatmap(df_q, cbar=False, xticklabels=1)
plt.savefig(f'cut_location_doryab_values', bbox_inches='tight')
plt.close()
plt.plot(df_loc['phone_locations_doryab_loglocationvariance'])
plt.savefig(f'phone_locations_doryab_loglocationvariance', bbox_inches='tight')
plt.close()