Fixed the features dependent on time duration.

pull/114/head
nikunjgoel95 2021-01-18 17:41:41 -05:00
parent 4746d7ab6c
commit 0bbf15f52e
1 changed file with 42 additions and 89 deletions


@@ -45,7 +45,6 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
    if cluster_on == "PARTICIPANT_DATASET":
        location_data = cluster_and_label(location_data,clustering_algorithm,threshold_static,**hyperparameters)
        location_data = filter_data_by_segment(location_data, time_segment)
    elif cluster_on == "TIME_SEGMENT":
        location_data = filter_data_by_segment(location_data, time_segment)
        location_data = cluster_and_label(location_data,clustering_algorithm,threshold_static,**hyperparameters)
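Context for this hunk: the two branches differ only in the order of clustering and segment filtering. PARTICIPANT_DATASET labels places once over the participant's full history and then slices by time segment, while TIME_SEGMENT slices first and re-clusters inside each segment. A minimal runnable sketch of that dispatch (stub functions and toy data, not the project's implementations):

import pandas as pd

def cluster_and_label(df, **kwargs):
    # stub: the real function assigns a location_label per GPS row
    return df.assign(location_label=1)

def filter_data_by_segment(df, time_segment):
    # stub: the real function keeps only rows inside the time segment
    return df

location_data = pd.DataFrame({"double_latitude": [40.0], "double_longitude": [-79.9]})
for cluster_on in ("PARTICIPANT_DATASET", "TIME_SEGMENT"):
    df = location_data.copy()
    if cluster_on == "PARTICIPANT_DATASET":
        df = cluster_and_label(df)                  # cluster on all of the data first
        df = filter_data_by_segment(df, "daily")    # then slice out the segment
    elif cluster_on == "TIME_SEGMENT":
        df = filter_data_by_segment(df, "daily")    # slice out the segment first
        df = cluster_and_label(df)                  # then cluster within it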
@@ -73,6 +72,7 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
        location_features = location_features.reset_index(drop=True)
        return location_features

    location_data['timeInSeconds'] = (location_data.timestamp.diff(-1)* -1)/1000
    if "locationvariance" in features_to_compute:
        location_features["locationvariance"] = location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()
@@ -118,49 +118,52 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
        for localDate in stationaryLocations['local_segment'].unique():
            location_features.loc[localDate,"radiusgyration"] = radius_of_gyration(stationaryLocations[stationaryLocations['local_segment']==localDate],sampling_frequency)

    preComputedTimeArray = pd.DataFrame()
    for localDate in stationaryLocations["local_segment"].unique():
        top1,top2,top3,smax,smin,sstd,smean = len_stay_timeattopn(stationaryLocations[stationaryLocations["local_segment"]==localDate])
        preComputedTimeArray.loc[localDate,"timeattop1"] = top1
        preComputedTimeArray.loc[localDate,"timeattop2"] = top2
        preComputedTimeArray.loc[localDate,"timeattop3"] = top3
        preComputedTimeArray.loc[localDate,"maxlengthstayatclusters"] = smax
        preComputedTimeArray.loc[localDate,"minlengthstayatclusters"] = smin
        preComputedTimeArray.loc[localDate,"stdlengthstayatclusters"] = sstd
        preComputedTimeArray.loc[localDate,"meanlengthstayatclusters"] = smean

    if "timeattop1location" in features_to_compute:
        for localDate in stationaryLocations['local_segment'].unique():
            location_features.loc[localDate,"timeattop1"] = time_at_topn_clusters_in_group(stationaryLocations[stationaryLocations['local_segment']==localDate],1,sampling_frequency)
            location_features.loc[localDate,"timeattop1"] = preComputedTimeArray.loc[localDate,"timeattop1"]

    if "timeattop2location" in features_to_compute:
        for localDate in stationaryLocations['local_segment'].unique():
            location_features.loc[localDate,"timeattop2"] = time_at_topn_clusters_in_group(stationaryLocations[stationaryLocations['local_segment']==localDate],2,sampling_frequency)
            location_features.loc[localDate,"timeattop2"] = preComputedTimeArray.loc[localDate,"timeattop2"]

    if "timeattop3location" in features_to_compute:
        for localDate in stationaryLocations['local_segment'].unique():
            location_features.loc[localDate,"timeattop3"] = time_at_topn_clusters_in_group(stationaryLocations[stationaryLocations['local_segment']==localDate],3,sampling_frequency)
            location_features.loc[localDate,"timeattop3"] = preComputedTimeArray.loc[localDate,"timeattop3"]

    if "movingtostaticratio" in features_to_compute:
        for localDate in stationaryLocations['local_segment'].unique():
            location_features.loc[localDate,"movingtostaticratio"] = (stationaryLocations[stationaryLocations['local_segment']==localDate].shape[0]*sampling_frequency) / (location_data[location_data['local_segment']==localDate].shape[0] * sampling_frequency)
            location_features.loc[localDate,"movingtostaticratio"] = (stationaryLocations[stationaryLocations['local_segment']==localDate]['timeInSeconds'].sum()) / (location_data[location_data['local_segment']==localDate]['timeInSeconds'].sum())

    if "outlierstimepercent" in features_to_compute:
        for localDate in stationaryLocations['local_segment'].unique():
            location_features.loc[localDate,"outlierstimepercent"] = outliers_time_percent(stationaryLocations[stationaryLocations['local_segment']==localDate],sampling_frequency)
    preComputedmaxminCluster = pd.DataFrame()
    for localDate in stationaryLocations['local_segment'].unique():
        smax, smin, sstd,smean = len_stay_at_clusters_in_minutes(stationaryLocations[stationaryLocations['local_segment']==localDate],sampling_frequency)
        preComputedmaxminCluster.loc[localDate,"maxlengthstayatclusters"] = smax
        preComputedmaxminCluster.loc[localDate,"minlengthstayatclusters"] = smin
        preComputedmaxminCluster.loc[localDate,"stdlengthstayatclusters"] = sstd
        preComputedmaxminCluster.loc[localDate,"meanlengthstayatclusters"] = smean
            location_features.loc[localDate,"outlierstimepercent"] = outlier_time_percent_new(stationaryLocations[stationaryLocations['local_segment']==localDate])

    if "maxlengthstayatclusters" in features_to_compute:
        for localDate in stationaryLocations['local_segment'].unique():
            location_features.loc[localDate,"maxlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"maxlengthstayatclusters"]
            location_features.loc[localDate,"maxlengthstayatclusters"] = preComputedTimeArray.loc[localDate,"maxlengthstayatclusters"]

    if "minlengthstayatclusters" in features_to_compute:
        for localDate in stationaryLocations['local_segment'].unique():
            location_features.loc[localDate,"minlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"minlengthstayatclusters"]
            location_features.loc[localDate,"minlengthstayatclusters"] = preComputedTimeArray.loc[localDate,"minlengthstayatclusters"]

    if "stdlengthstayatclusters" in features_to_compute:
        for localDate in stationaryLocations['local_segment'].unique():
            location_features.loc[localDate,"stdlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"stdlengthstayatclusters"]
            location_features.loc[localDate,"stdlengthstayatclusters"] = preComputedTimeArray.loc[localDate,"stdlengthstayatclusters"]

    if "meanlengthstayatclusters" in features_to_compute:
        for localDate in stationaryLocations['local_segment'].unique():
            location_features.loc[localDate,"meanlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"meanlengthstayatclusters"]
            location_features.loc[localDate,"meanlengthstayatclusters"] = preComputedTimeArray.loc[localDate,"meanlengthstayatclusters"]

    if "locationentropy" in features_to_compute:
        for localDate in stationaryLocations['local_segment'].unique():
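Two things are happening in this hunk. The seven stay-length statistics are now computed once per segment into preComputedTimeArray instead of being re-derived inside each feature's loop, and several features switch from row counts to summed durations. The movingtostaticratio change is the clearest win: in the old expression sampling_frequency multiplies both numerator and denominator and cancels out, so it was really just a row-count ratio; the new expression weights each row by how long it lasted. A toy comparison (hypothetical values, with location_label >= 1 standing in for the static subset):

import pandas as pd

location_data = pd.DataFrame({
    "location_label": [1, -1, 1, 1],        # -1 = moving / cluster noise
    "timeInSeconds":  [60.0, 600.0, 60.0, 60.0],
})
stationaryLocations = location_data[location_data["location_label"] >= 1]

row_ratio = stationaryLocations.shape[0] / location_data.shape[0]
time_ratio = stationaryLocations["timeInSeconds"].sum() / location_data["timeInSeconds"].sum()
print(row_ratio, time_ratio)  # 0.75 vs ~0.23 - one long moving gap dominates the segment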
@@ -174,6 +177,21 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
    return location_features

def len_stay_timeattopn(locationData):
    if locationData is None or len(locationData) == 0:
        return (None, None, None,None, None, None, None)
    calculationDf = locationData[locationData["location_label"] >= 1][['location_label','timeInSeconds']]
    calculationDf[calculationDf['timeInSeconds'] >= 300]['timeInSeconds'] = 60
    timeArray = calculationDf.groupby('location_label')['timeInSeconds'].sum().reset_index()['timeInSeconds'].sort_values(ascending=False)/60
    if len(timeArray) > 2:
        return (timeArray[0],timeArray[1],timeArray[2],timeArray.max(),timeArray.min(),timeArray.std(),timeArray.mean())
    elif len(timeArray)==2:
        return (timeArray[0],timeArray[1],None,timeArray.max(),timeArray.min(),timeArray.std(),timeArray.mean())
    else:
        return (timeArray[0],None,None,timeArray.max(),timeArray.min(),timeArray.std(),timeArray.mean())

def getMinutesData(locationData):
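One caveat on len_stay_timeattopn as added here: the capping line uses chained indexing, which pandas applies to a temporary copy, so the original frame comes out unchanged (and a SettingWithCopyWarning is raised). A minimal reproduction, plus the .loc form that would actually apply the cap — whether capping stays of 300 s or more down to 60 s is the intended rule is for the author to confirm:

import pandas as pd

calculationDf = pd.DataFrame({"timeInSeconds": [30.0, 400.0]})

# Chained indexing: the mask produces a copy, the assignment lands on the copy.
calculationDf[calculationDf["timeInSeconds"] >= 300]["timeInSeconds"] = 60
print(calculationDf["timeInSeconds"].tolist())  # [30.0, 400.0] - unchanged

# Single .loc call: row selection and column assignment happen on the frame itself.
calculationDf.loc[calculationDf["timeInSeconds"] >= 300, "timeInSeconds"] = 60
print(calculationDf["timeInSeconds"].tolist())  # [30.0, 60.0]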
@@ -188,7 +206,6 @@ def distance_to_degrees(d):
def get_all_travel_distances_meters_speed(locationData,threshold,maximum_gap_allowed):
    locationData['timeInSeconds'] = (locationData.timestamp.diff(-1)* -1)/1000
    lat_lon_temp = locationData[locationData['timeInSeconds'] <= maximum_gap_allowed][['double_latitude','double_longitude','timeInSeconds']]
    if lat_lon_temp.empty:
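For context on the surviving filter line: rows whose gap to the next fix exceeds maximum_gap_allowed are dropped before distances and speeds are derived, presumably so a long recording outage is not counted as one huge displacement. The removed line above it is the now-redundant per-call recomputation of timeInSeconds, which this commit hoists into doryab_features. A toy run of the filter (hypothetical values):

import pandas as pd

locationData = pd.DataFrame({
    "double_latitude":  [40.000, 40.001, 40.200],
    "double_longitude": [-79.900, -79.901, -80.100],
    "timeInSeconds":    [60.0, 7200.0, None],   # two-hour outage after the 2nd fix
})
maximum_gap_allowed = 300
lat_lon_temp = locationData[locationData["timeInSeconds"] <= maximum_gap_allowed][["double_latitude", "double_longitude", "timeInSeconds"]]
print(len(lat_lon_temp))  # 1 - the 2nd fix (huge gap) and the last fix (NaN) are dropped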
@@ -314,7 +331,6 @@ def rank_count_map(clusters):
    labels, counts = tuple(np.unique(clusters, return_counts = True))
    sorted_by_count = [x for (y,x) in sorted(zip(counts, labels), reverse = True)]
    label_to_rank = {label : rank + 1 for (label, rank) in [(sorted_by_count[i],i) for i in range(len(sorted_by_count))]}
    return lambda x: label_to_rank.get(x, -1)
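rank_count_map, shown here for context, is why location_label values double as popularity ranks: the most frequent cluster maps to 1, the next to 2, and labels absent from the map (such as noise) fall back to -1. The same logic as a standalone snippet:

import numpy as np

clusters = np.array([7, 7, 7, 3, 3, 9])
labels, counts = np.unique(clusters, return_counts=True)
# Sort labels by descending count, then rank them 1, 2, 3, ...
sorted_by_count = [x for (y, x) in sorted(zip(counts, labels), reverse=True)]
label_to_rank = {label: rank + 1 for rank, label in enumerate(sorted_by_count)}
rank = lambda x: label_to_rank.get(x, -1)
print(rank(7), rank(3), rank(9), rank(42))  # 1 2 3 -1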
@@ -367,78 +383,15 @@ def radius_of_gyration(locationData,sampling_frequency):
    return np.sqrt(final_rog)

def time_at_topn_clusters_in_group(locationData,n,sampling_frequency): # relevant only for global location features since, top3_clusters = top3_clusters_in_group for local
    if locationData is None or len(locationData) == 0:
def outlier_time_percent_new(locationData):
    if locationData is None or len(locationData)==0:
        return None
    locationData = locationData[locationData["location_label"] >= 1] # remove outliers/ cluster noise
    valcounts = locationData["location_label"].value_counts().to_dict()
    sorted_valcounts = sorted(valcounts.items(), key=lambda kv: (-kv[1], kv[0]))
    if len(sorted_valcounts) >= n:
        topn = sorted_valcounts[n-1]
        topn_time = topn[1] * sampling_frequency
    else:
        topn_time = None
    return topn_time

def outliers_time_percent(locationData,sampling_frequency):
    if locationData is None or len(locationData) == 0:
        return None
    clusters = locationData["location_label"]
    numoutliers = clusters[(clusters == -1)].sum() * sampling_frequency
    numtotal = len(clusters) * sampling_frequency
    return (float(-1*numoutliers) / numtotal)

def moving_time_percent(locationData):
    if locationData is None or len(locationData) == 0:
        return None
    lbls = locationData["location_label"]
    nummoving = lbls.isnull().sum()
    numtotal = len(lbls)
    return (float(nummoving) / numtotal)

def len_stay_at_clusters_in_minutes(locationData,sampling_frequency):
    if locationData is None or len(locationData) == 0:
        return (None, None, None,None)
    lenstays = []
    count = 0
    prev_loc_label = None
    for row in locationData.iterrows():
        cur_loc_label = row[1]["location_label"]
        if np.isnan(cur_loc_label):
            continue
        elif prev_loc_label == None:
            prev_loc_label = int(cur_loc_label)
            count += 1
        else:
            if prev_loc_label == int(cur_loc_label):
                count += 1
            else:
                lenstays.append(count)
                prev_loc_label = int(cur_loc_label)
                count = 0 + 1
    if count > 0: # in case of no transition
        lenstays.append(count)
    lenstays = np.array(lenstays) * sampling_frequency
    if len(lenstays) > 0:
        smax = np.max(lenstays)
        smin = np.min(lenstays)
        sstd = np.std(lenstays)
        smean = np.mean(lenstays)
    else:
        smax = None
        smin = None
        sstd = None
        smean = None
    return (smax, smin, sstd, smean)

    clustersDf = locationData[["location_label","timeInSeconds"]]
    numoutliers = clustersDf[clustersDf["location_label"]== -1]["timeInSeconds"].sum()
    numtotal = clustersDf.timeInSeconds.sum()
    return numoutliers/numtotal

def location_entropy(locationData):
    if locationData is None or len(locationData) == 0:
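Finally, the new outlier_time_percent_new: it reports the share of time, not the share of rows, spent in cluster noise (location_label == -1), and it drops the old sign juggling that came from summing -1 labels and multiplying by sampling_frequency. A toy run (hypothetical values):

import pandas as pd

locationData = pd.DataFrame({
    "location_label": [1, -1, 2],
    "timeInSeconds":  [60.0, 540.0, 600.0],
})
clustersDf = locationData[["location_label", "timeInSeconds"]]
numoutliers = clustersDf[clustersDf["location_label"] == -1]["timeInSeconds"].sum()
numtotal = clustersDf.timeInSeconds.sum()
print(numoutliers / numtotal)  # 0.45 - 540 s of noise out of 1200 s total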