Fixed the features dependent on time duration.

pull/114/head
nikunjgoel95 2021-01-18 17:41:41 -05:00
parent 4746d7ab6c
commit 0bbf15f52e
1 changed file with 42 additions and 89 deletions


@@ -45,7 +45,6 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
    if cluster_on == "PARTICIPANT_DATASET":
        location_data = cluster_and_label(location_data,clustering_algorithm,threshold_static,**hyperparameters)
        location_data = filter_data_by_segment(location_data, time_segment)
    elif cluster_on == "TIME_SEGMENT":
        location_data = filter_data_by_segment(location_data, time_segment)
        location_data = cluster_and_label(location_data,clustering_algorithm,threshold_static,**hyperparameters)
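Context for this hunk: the two branches differ only in the order of clustering and segment filtering. PARTICIPANT_DATASET labels places once over the participant's full history and then slices by time segment, while TIME_SEGMENT slices first and re-clusters inside each segment. A minimal runnable sketch of that dispatch (stub functions and toy data, not the project's implementations):

import pandas as pd

def cluster_and_label(df, **kwargs):
    # stub: the real function assigns a location_label per GPS row
    return df.assign(location_label=1)

def filter_data_by_segment(df, time_segment):
    # stub: the real function keeps only rows inside the time segment
    return df

location_data = pd.DataFrame({"double_latitude": [40.0], "double_longitude": [-79.9]})
for cluster_on in ("PARTICIPANT_DATASET", "TIME_SEGMENT"):
    df = location_data.copy()
    if cluster_on == "PARTICIPANT_DATASET":
        df = cluster_and_label(df)                  # cluster on all of the data first
        df = filter_data_by_segment(df, "daily")    # then slice out the segment
    elif cluster_on == "TIME_SEGMENT":
        df = filter_data_by_segment(df, "daily")    # slice out the segment first
        df = cluster_and_label(df)                  # then cluster within it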
@@ -73,6 +72,7 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
        location_features = location_features.reset_index(drop=True)
        return location_features

    location_data['timeInSeconds'] = (location_data.timestamp.diff(-1)* -1)/1000
    if "locationvariance" in features_to_compute:
        location_features["locationvariance"] = location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()
@@ -118,49 +118,52 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
        for localDate in stationaryLocations['local_segment'].unique():
            location_features.loc[localDate,"radiusgyration"] = radius_of_gyration(stationaryLocations[stationaryLocations['local_segment']==localDate],sampling_frequency)

    preComputedTimeArray = pd.DataFrame()
    for localDate in stationaryLocations["local_segment"].unique():
        top1,top2,top3,smax,smin,sstd,smean = len_stay_timeattopn(stationaryLocations[stationaryLocations["local_segment"]==localDate])
        preComputedTimeArray.loc[localDate,"timeattop1"] = top1
        preComputedTimeArray.loc[localDate,"timeattop2"] = top2
        preComputedTimeArray.loc[localDate,"timeattop3"] = top3
        preComputedTimeArray.loc[localDate,"maxlengthstayatclusters"] = smax
        preComputedTimeArray.loc[localDate,"minlengthstayatclusters"] = smin
        preComputedTimeArray.loc[localDate,"stdlengthstayatclusters"] = sstd
        preComputedTimeArray.loc[localDate,"meanlengthstayatclusters"] = smean

    if "timeattop1location" in features_to_compute:
        for localDate in stationaryLocations['local_segment'].unique():
            location_features.loc[localDate,"timeattop1"] = time_at_topn_clusters_in_group(stationaryLocations[stationaryLocations['local_segment']==localDate],1,sampling_frequency)
            location_features.loc[localDate,"timeattop1"] = preComputedTimeArray.loc[localDate,"timeattop1"]

    if "timeattop2location" in features_to_compute:
        for localDate in stationaryLocations['local_segment'].unique():
            location_features.loc[localDate,"timeattop2"] = time_at_topn_clusters_in_group(stationaryLocations[stationaryLocations['local_segment']==localDate],2,sampling_frequency)
            location_features.loc[localDate,"timeattop2"] = preComputedTimeArray.loc[localDate,"timeattop2"]

    if "timeattop3location" in features_to_compute:
        for localDate in stationaryLocations['local_segment'].unique():
            location_features.loc[localDate,"timeattop3"] = time_at_topn_clusters_in_group(stationaryLocations[stationaryLocations['local_segment']==localDate],3,sampling_frequency)
            location_features.loc[localDate,"timeattop3"] = preComputedTimeArray.loc[localDate,"timeattop3"]

    if "movingtostaticratio" in features_to_compute:
        for localDate in stationaryLocations['local_segment'].unique():
            location_features.loc[localDate,"movingtostaticratio"] = (stationaryLocations[stationaryLocations['local_segment']==localDate].shape[0]*sampling_frequency) / (location_data[location_data['local_segment']==localDate].shape[0] * sampling_frequency)
            location_features.loc[localDate,"movingtostaticratio"] = (stationaryLocations[stationaryLocations['local_segment']==localDate]['timeInSeconds'].sum()) / (location_data[location_data['local_segment']==localDate]['timeInSeconds'].sum())

    if "outlierstimepercent" in features_to_compute:
        for localDate in stationaryLocations['local_segment'].unique():
            location_features.loc[localDate,"outlierstimepercent"] = outliers_time_percent(stationaryLocations[stationaryLocations['local_segment']==localDate],sampling_frequency)
    preComputedmaxminCluster = pd.DataFrame()
    for localDate in stationaryLocations['local_segment'].unique():
        smax, smin, sstd,smean = len_stay_at_clusters_in_minutes(stationaryLocations[stationaryLocations['local_segment']==localDate],sampling_frequency)
        preComputedmaxminCluster.loc[localDate,"maxlengthstayatclusters"] = smax
        preComputedmaxminCluster.loc[localDate,"minlengthstayatclusters"] = smin
        preComputedmaxminCluster.loc[localDate,"stdlengthstayatclusters"] = sstd
        preComputedmaxminCluster.loc[localDate,"meanlengthstayatclusters"] = smean
            location_features.loc[localDate,"outlierstimepercent"] = outlier_time_percent_new(stationaryLocations[stationaryLocations['local_segment']==localDate])

    if "maxlengthstayatclusters" in features_to_compute:
        for localDate in stationaryLocations['local_segment'].unique():
            location_features.loc[localDate,"maxlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"maxlengthstayatclusters"]
            location_features.loc[localDate,"maxlengthstayatclusters"] = preComputedTimeArray.loc[localDate,"maxlengthstayatclusters"]

    if "minlengthstayatclusters" in features_to_compute:
        for localDate in stationaryLocations['local_segment'].unique():
            location_features.loc[localDate,"minlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"minlengthstayatclusters"]
            location_features.loc[localDate,"minlengthstayatclusters"] = preComputedTimeArray.loc[localDate,"minlengthstayatclusters"]

    if "stdlengthstayatclusters" in features_to_compute:
        for localDate in stationaryLocations['local_segment'].unique():
            location_features.loc[localDate,"stdlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"stdlengthstayatclusters"]
            location_features.loc[localDate,"stdlengthstayatclusters"] = preComputedTimeArray.loc[localDate,"stdlengthstayatclusters"]

    if "meanlengthstayatclusters" in features_to_compute:
        for localDate in stationaryLocations['local_segment'].unique():
            location_features.loc[localDate,"meanlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"meanlengthstayatclusters"]
            location_features.loc[localDate,"meanlengthstayatclusters"] = preComputedTimeArray.loc[localDate,"meanlengthstayatclusters"]

    if "locationentropy" in features_to_compute:
        for localDate in stationaryLocations['local_segment'].unique():
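Two things are happening in this hunk. The seven stay-length statistics are now computed once per segment into preComputedTimeArray instead of being re-derived inside each feature's loop, and several features switch from row counts to summed durations. The movingtostaticratio change is the clearest win: in the old expression sampling_frequency multiplies both numerator and denominator and cancels out, so it was really just a row-count ratio; the new expression weights each row by how long it lasted. A toy comparison (hypothetical values, with location_label >= 1 standing in for the static subset):

import pandas as pd

location_data = pd.DataFrame({
    "location_label": [1, -1, 1, 1],        # -1 = moving / cluster noise
    "timeInSeconds":  [60.0, 600.0, 60.0, 60.0],
})
stationaryLocations = location_data[location_data["location_label"] >= 1]

row_ratio = stationaryLocations.shape[0] / location_data.shape[0]
time_ratio = stationaryLocations["timeInSeconds"].sum() / location_data["timeInSeconds"].sum()
print(row_ratio, time_ratio)  # 0.75 vs ~0.23 - one long moving gap dominates the segment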
@@ -174,6 +177,21 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
    return location_features

def len_stay_timeattopn(locationData):
    if locationData is None or len(locationData) == 0:
        return (None, None, None,None, None, None, None)
    calculationDf = locationData[locationData["location_label"] >= 1][['location_label','timeInSeconds']]
    calculationDf[calculationDf['timeInSeconds'] >= 300]['timeInSeconds'] = 60
    timeArray = calculationDf.groupby('location_label')['timeInSeconds'].sum().reset_index()['timeInSeconds'].sort_values(ascending=False)/60
    if len(timeArray) > 2:
        return (timeArray[0],timeArray[1],timeArray[2],timeArray.max(),timeArray.min(),timeArray.std(),timeArray.mean())
    elif len(timeArray)==2:
        return (timeArray[0],timeArray[1],None,timeArray.max(),timeArray.min(),timeArray.std(),timeArray.mean())
    else:
        return (timeArray[0],None,None,timeArray.max(),timeArray.min(),timeArray.std(),timeArray.mean())

def getMinutesData(locationData):
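One caveat on len_stay_timeattopn as added here: the capping line uses chained indexing, which pandas applies to a temporary copy, so the original frame comes out unchanged (and a SettingWithCopyWarning is raised). A minimal reproduction, plus the .loc form that would actually apply the cap — whether capping stays of 300 s or more down to 60 s is the intended rule is for the author to confirm:

import pandas as pd

calculationDf = pd.DataFrame({"timeInSeconds": [30.0, 400.0]})

# Chained indexing: the mask produces a copy, the assignment lands on the copy.
calculationDf[calculationDf["timeInSeconds"] >= 300]["timeInSeconds"] = 60
print(calculationDf["timeInSeconds"].tolist())  # [30.0, 400.0] - unchanged

# Single .loc call: row selection and column assignment happen on the frame itself.
calculationDf.loc[calculationDf["timeInSeconds"] >= 300, "timeInSeconds"] = 60
print(calculationDf["timeInSeconds"].tolist())  # [30.0, 60.0]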
@@ -188,7 +206,6 @@ def distance_to_degrees(d):
def get_all_travel_distances_meters_speed(locationData,threshold,maximum_gap_allowed):
    locationData['timeInSeconds'] = (locationData.timestamp.diff(-1)* -1)/1000
    lat_lon_temp = locationData[locationData['timeInSeconds'] <= maximum_gap_allowed][['double_latitude','double_longitude','timeInSeconds']]
    if lat_lon_temp.empty:
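For context on the surviving filter line: rows whose gap to the next fix exceeds maximum_gap_allowed are dropped before distances and speeds are derived, presumably so a long recording outage is not counted as one huge displacement. The removed line above it is the now-redundant per-call recomputation of timeInSeconds, which this commit hoists into doryab_features. A toy run of the filter (hypothetical values):

import pandas as pd

locationData = pd.DataFrame({
    "double_latitude":  [40.000, 40.001, 40.200],
    "double_longitude": [-79.900, -79.901, -80.100],
    "timeInSeconds":    [60.0, 7200.0, None],   # two-hour outage after the 2nd fix
})
maximum_gap_allowed = 300
lat_lon_temp = locationData[locationData["timeInSeconds"] <= maximum_gap_allowed][["double_latitude", "double_longitude", "timeInSeconds"]]
print(len(lat_lon_temp))  # 1 - the 2nd fix (huge gap) and the last fix (NaN) are dropped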
@@ -314,7 +331,6 @@ def rank_count_map(clusters):
    labels, counts = tuple(np.unique(clusters, return_counts = True))
    sorted_by_count = [x for (y,x) in sorted(zip(counts, labels), reverse = True)]
    label_to_rank = {label : rank + 1 for (label, rank) in [(sorted_by_count[i],i) for i in range(len(sorted_by_count))]}
    return lambda x: label_to_rank.get(x, -1)
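rank_count_map, shown here for context, is why location_label values double as popularity ranks: the most frequent cluster maps to 1, the next to 2, and labels absent from the map (such as noise) fall back to -1. The same logic as a standalone snippet:

import numpy as np

clusters = np.array([7, 7, 7, 3, 3, 9])
labels, counts = np.unique(clusters, return_counts=True)
# Sort labels by descending count, then rank them 1, 2, 3, ...
sorted_by_count = [x for (y, x) in sorted(zip(counts, labels), reverse=True)]
label_to_rank = {label: rank + 1 for rank, label in enumerate(sorted_by_count)}
rank = lambda x: label_to_rank.get(x, -1)
print(rank(7), rank(3), rank(9), rank(42))  # 1 2 3 -1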
@@ -367,78 +383,15 @@ def radius_of_gyration(locationData,sampling_frequency):
    return np.sqrt(final_rog)

def time_at_topn_clusters_in_group(locationData,n,sampling_frequency): # relevant only for global location features since, top3_clusters = top3_clusters_in_group for local
    if locationData is None or len(locationData) == 0:
def outlier_time_percent_new(locationData):
    if locationData is None or len(locationData)==0:
        return None
    locationData = locationData[locationData["location_label"] >= 1] # remove outliers/ cluster noise
    valcounts = locationData["location_label"].value_counts().to_dict()
    sorted_valcounts = sorted(valcounts.items(), key=lambda kv: (-kv[1], kv[0]))
    if len(sorted_valcounts) >= n:
        topn = sorted_valcounts[n-1]
        topn_time = topn[1] * sampling_frequency
    else:
        topn_time = None
    return topn_time

def outliers_time_percent(locationData,sampling_frequency):
    if locationData is None or len(locationData) == 0:
        return None
    clusters = locationData["location_label"]
    numoutliers = clusters[(clusters == -1)].sum() * sampling_frequency
    numtotal = len(clusters) * sampling_frequency
    return (float(-1*numoutliers) / numtotal)

def moving_time_percent(locationData):
    if locationData is None or len(locationData) == 0:
        return None
    lbls = locationData["location_label"]
    nummoving = lbls.isnull().sum()
    numtotal = len(lbls)
    return (float(nummoving) / numtotal)

def len_stay_at_clusters_in_minutes(locationData,sampling_frequency):
    if locationData is None or len(locationData) == 0:
        return (None, None, None,None)
    lenstays = []
    count = 0
    prev_loc_label = None
    for row in locationData.iterrows():
        cur_loc_label = row[1]["location_label"]
        if np.isnan(cur_loc_label):
            continue
        elif prev_loc_label == None:
            prev_loc_label = int(cur_loc_label)
            count += 1
        else:
            if prev_loc_label == int(cur_loc_label):
                count += 1
            else:
                lenstays.append(count)
                prev_loc_label = int(cur_loc_label)
                count = 0 + 1
    if count > 0: # in case of no transition
        lenstays.append(count)
    lenstays = np.array(lenstays) * sampling_frequency
    if len(lenstays) > 0:
        smax = np.max(lenstays)
        smin = np.min(lenstays)
        sstd = np.std(lenstays)
        smean = np.mean(lenstays)
    else:
        smax = None
        smin = None
        sstd = None
        smean = None
    return (smax, smin, sstd, smean)

    clustersDf = locationData[["location_label","timeInSeconds"]]
    numoutliers = clustersDf[clustersDf["location_label"]== -1]["timeInSeconds"].sum()
    numtotal = clustersDf.timeInSeconds.sum()
    return numoutliers/numtotal

def location_entropy(locationData):
    if locationData is None or len(locationData) == 0:
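Finally, the new outlier_time_percent_new: it reports the share of time, not the share of rows, spent in cluster noise (location_label == -1), and it drops the old sign juggling that came from summing -1 labels and multiplying by sampling_frequency. A toy run (hypothetical values):

import pandas as pd

locationData = pd.DataFrame({
    "location_label": [1, -1, 2],
    "timeInSeconds":  [60.0, 540.0, 600.0],
})
clustersDf = locationData[["location_label", "timeInSeconds"]]
numoutliers = clustersDf[clustersDf["location_label"] == -1]["timeInSeconds"].sum()
numtotal = clustersDf.timeInSeconds.sum()
print(numoutliers / numtotal)  # 0.45 - 540 s of noise out of 1200 s total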