Merge branch 'feature/doryab_location_empty_df_fix' into develop

Merges the branch that fixes home-location inference and the Doryab location features when the filtered dataframe is empty.
2021-03-19 11:17:43 -04:00 · 2021-03-19 11:17:43 -04:00 · 7815c380a2
parent 294d84277d cfc5039918
commit 7815c380a2
2 changed files with 24 additions and 19 deletions
--- a/src/data/infer_home_location.py
+++ b/src/data/infer_home_location.py
@ -111,27 +111,30 @@ def haversine(lon1,lat1,lon2,lat2):

 origDf = pd.read_csv(snakemake.input[0])
 filteredDf = filterDatafromDf(origDf)
-dbscan_eps = snakemake.params["dbscan_eps"]
-dbscan_minsamples = snakemake.params["dbscan_minsamples"]
-threshold_static = snakemake.params["threshold_static"]
-clustering_algorithm = snakemake.params["clustering_algorithm"]
-
-if clustering_algorithm == "DBSCAN":
-    hyperparameters = {'eps' : distance_to_degrees(dbscan_eps), 'min_samples': dbscan_minsamples}
-elif clustering_algorithm == "OPTICS":
-    hyperparameters = {'max_eps': distance_to_degrees(dbscan_eps), 'min_samples': 2, 'metric':'euclidean', 'cluster_method' : 'dbscan'} 
+if filteredDf.empty:
+    filteredDf.to_csv(snakemake.output[0])
 else:
-    raise ValueError("config[PHONE_LOCATIONS][HOME_INFERENCE][CLUSTERING ALGORITHM] only accepts DBSCAN or OPTICS but you provided ",clustering_algorithm)
+    dbscan_eps = snakemake.params["dbscan_eps"]
+    dbscan_minsamples = snakemake.params["dbscan_minsamples"]
+    threshold_static = snakemake.params["threshold_static"]
+    clustering_algorithm = snakemake.params["clustering_algorithm"]

-filteredDf = cluster_and_label(filteredDf,clustering_algorithm,threshold_static,**hyperparameters)
+    if clustering_algorithm == "DBSCAN":
+        hyperparameters = {'eps' : distance_to_degrees(dbscan_eps), 'min_samples': dbscan_minsamples}
+    elif clustering_algorithm == "OPTICS":
+        hyperparameters = {'max_eps': distance_to_degrees(dbscan_eps), 'min_samples': 2, 'metric':'euclidean', 'cluster_method' : 'dbscan'} 
+    else:
+        raise ValueError("config[PHONE_LOCATIONS][HOME_INFERENCE][CLUSTERING ALGORITHM] only accepts DBSCAN or OPTICS but you provided ",clustering_algorithm)

-origDf['home_latitude'] = filteredDf[filteredDf['location_label']==1][['double_latitude','double_longitude']].mean()['double_latitude']
-origDf['home_longitude'] = filteredDf[filteredDf['location_label']==1][['double_latitude','double_longitude']].mean()['double_longitude']
+    filteredDf = cluster_and_label(filteredDf,clustering_algorithm,threshold_static,**hyperparameters)

-distanceFromHome = haversine(origDf.double_longitude,origDf.double_latitude,origDf.home_longitude,origDf.home_latitude)
+    origDf['home_latitude'] = filteredDf[filteredDf['location_label']==1][['double_latitude','double_longitude']].mean()['double_latitude']
+    origDf['home_longitude'] = filteredDf[filteredDf['location_label']==1][['double_latitude','double_longitude']].mean()['double_longitude']

-finalDf = origDf.drop(['home_latitude','home_longitude'], axis=1)
-finalDf.insert(len(finalDf.columns)-1,'distancefromhome',distanceFromHome)
-finalDf.to_csv(snakemake.output[0], index=False)
+    distanceFromHome = haversine(origDf.double_longitude,origDf.double_latitude,origDf.home_longitude,origDf.home_latitude)
+
+    finalDf = origDf.drop(['home_latitude','home_longitude'], axis=1)
+    finalDf.insert(len(finalDf.columns)-1,'distancefromhome',distanceFromHome)
+    finalDf.to_csv(snakemake.output[0], index=False)


--- a/src/features/phone_locations/doryab/main.py
+++ b/src/features/phone_locations/doryab/main.py
@ -188,12 +188,14 @@ def len_stay_timeattopn(locationData,maximum_gap_allowed,maximum_row_duration):
    calculationDf.loc[calculationDf.timeInSeconds >= maximum_gap_allowed,'timeInSeconds'] = maximum_row_duration
    timeArray = calculationDf.groupby('location_label')['timeInSeconds'].sum().reset_index()['timeInSeconds'].sort_values(ascending=False)/60
    
-    if len(timeArray) > 2:
+    if len(timeArray) == 3:
        return (timeArray[0],timeArray[1],timeArray[2],timeArray.max(),timeArray.min(),timeArray.std(),timeArray.mean())
    elif len(timeArray)==2:
        return (timeArray[0],timeArray[1],None,timeArray.max(),timeArray.min(),timeArray.std(),timeArray.mean())
-    else:
+    elif len(timeArray)==1:
        return (timeArray[0],None,None,timeArray.max(),timeArray.min(),timeArray.std(),timeArray.mean())
+    else:
+        return (None,None,None,timeArray.max(),timeArray.min(),timeArray.std(),timeArray.mean())
    

 def getMinutesData(locationData):