From 18002f59e1c0e161f68b2a4cc217e62a6a86e467 Mon Sep 17 00:00:00 2001 From: Primoz Date: Thu, 15 Sep 2022 10:48:59 +0000 Subject: [PATCH] Doryab bluetooth and locations features fill in NaN values. --- src/features/phone_bluetooth/doryab/main.py | 4 +-- src/features/phone_locations/doryab/main.py | 8 +++--- tests/scripts/doryab_values.py | 27 +++++++++++++++++++-- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/features/phone_bluetooth/doryab/main.py b/src/features/phone_bluetooth/doryab/main.py index cb272ebc..f83fc457 100644 --- a/src/features/phone_bluetooth/doryab/main.py +++ b/src/features/phone_bluetooth/doryab/main.py @@ -15,8 +15,8 @@ def deviceFeatures(devices, ownership, common_devices, features_to_compute, feat if "meanscans" in features_to_compute: features = features.join(device_value_counts.groupby("local_segment")["scans"].mean().to_frame("meanscans" + ownership), how="outer") if "stdscans" in features_to_compute: - # TODO: std scans - features = features.join(device_value_counts.groupby("local_segment")["scans"].std().to_frame("stdscans" + ownership), how="outer") + # TODO: check if std scans implementation works + features = features.join(device_value_counts.groupby("local_segment")["scans"].std().to_frame("stdscans" + ownership).fillna(0), how="outer") # Most frequent device within segments, across segments, and across dataset if "countscansmostfrequentdevicewithinsegments" in features_to_compute: features = features.join(device_value_counts.groupby("local_segment")["scans"].max().to_frame("countscansmostfrequentdevicewithinsegments" + ownership), how="outer") diff --git a/src/features/phone_locations/doryab/main.py b/src/features/phone_locations/doryab/main.py index 36969032..6be0d1ad 100644 --- a/src/features/phone_locations/doryab/main.py +++ b/src/features/phone_locations/doryab/main.py @@ -36,9 +36,10 @@ def variance_and_logvariance_features(location_data, location_features): location_data["latitude_for_wvar"] = (location_data["double_latitude"] - location_data["latitude_wavg"]) ** 2 * location_data["duration"] * 60 location_data["longitude_for_wvar"] = (location_data["double_longitude"] - location_data["longitude_wavg"]) ** 2 * location_data["duration"] * 60 - # TODO: location variance location_features["locationvariance"] = ((location_data_grouped["latitude_for_wvar"].sum() + location_data_grouped["longitude_for_wvar"].sum()) / (location_data_grouped["duration"].sum() * 60 - 1)).fillna(0) - location_features["loglocationvariance"] = np.log10(location_features["locationvariance"]).replace(-np.inf, np.nan) + + # TODO: loglocationvariance - check if the implementation works + location_features["loglocationvariance"] = np.log10(location_features["locationvariance"]).replace(-np.inf, -1000000) return location_features @@ -113,8 +114,6 @@ def location_entropy(location_data): entropy = -1 * location_data.groupby(["local_segment"])[["plogp"]].sum().rename(columns={"plogp": "locationentropy"}) entropy["num_clusters"] = location_data.groupby(["local_segment"])["cluster_label"].nunique() - - # TODO: normalizedlocationentropy entropy["normalizedlocationentropy"] = entropy["locationentropy"] / entropy["num_clusters"] return entropy @@ -156,7 +155,6 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se # distance and speed features moving_data = location_data[location_data["is_stationary"] == 0].copy() location_features = location_features.merge(distance_and_speed_features(moving_data), how="outer", left_index=True, right_index=True) - # TODO: zakaj se ne zapolni varspeed z 0? location_features[["totaldistance", "avgspeed", "varspeed"]] = location_features[["totaldistance", "avgspeed", "varspeed"]].fillna(0) # stationary features diff --git a/tests/scripts/doryab_values.py b/tests/scripts/doryab_values.py index 2f171999..d51ceaa0 100644 --- a/tests/scripts/doryab_values.py +++ b/tests/scripts/doryab_values.py @@ -13,9 +13,18 @@ df_bt = df[doryab_cols_bt] print(len(doryab_cols_bt)) print(df_bt) -sns.heatmap(df_bt, xticklabels=1) +df_bt = df_bt.dropna(axis=0, how="all") +sns.heatmap(df_bt.isna(), xticklabels=1) plt.savefig(f'bluetooth_doryab_values', bbox_inches='tight') +df_q = pd.DataFrame() +for col in df_bt: + df_q[col] = pd.to_numeric(pd.cut(df_bt[col], bins=[-1,0,0.000000000001,1000], labels=[-1,0,1], right=False)) + +sns.heatmap(df_q, cbar=False, xticklabels=1) +plt.savefig(f'cut_bluetooth_doryab_values', bbox_inches='tight') +plt.close() + # Location doryab_cols_loc = [col for col in df.columns if "locations_doryab" in col] df_loc = df[doryab_cols_loc] @@ -23,6 +32,20 @@ df_loc = df[doryab_cols_loc] print(len(doryab_cols_loc)) print(df_loc) -sns.heatmap(df_loc, xticklabels=1) +df_loc = df_loc.dropna(axis=0, how="all").reset_index(drop=True) +print(df_loc) +sns.heatmap(df_loc.isna()) plt.savefig(f'locations_doryab_values', bbox_inches='tight') +df_q = pd.DataFrame() +for col in df_loc: + df_q[col] = pd.to_numeric(pd.cut(df_loc[col], bins=[-1,0,0.000000000001,1000], labels=[-1,0,1], right=False)) + +sns.heatmap(df_q, cbar=False, xticklabels=1) +plt.savefig(f'cut_location_doryab_values', bbox_inches='tight') +plt.close() + +plt.plot(df_loc['phone_locations_doryab_loglocationvariance']) +plt.savefig(f'phone_locations_doryab_loglocationvariance', bbox_inches='tight') +plt.close() +