rapids/calculatingfeatures/eda_explorer/load_files.py

import pandas as pd
import scipy.signal as scisig
import os
import numpy as np


def get_user_input(prompt):
    '''Show `prompt` to the user and return the line they type.

    raw_input is used when that name exists; when looking it up raises
    NameError, the builtin input is used instead.
    '''
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    return read_line(prompt)


def getInputLoadFile():
    '''Asks user for type of file and file path. Loads corresponding data.

    OUTPUT:
        data:               whatever the selected loader returns; for the loaders in this
                            file that is a DataFrame whose index is a list of timestamps
                            at 8Hz and whose columns include AccelZ, AccelY, AccelX, Temp,
                            EDA, filtered_eda
        filepath_confirm:   string, path of the file that was loaded (for an E4 directory
                            this is the EDA.csv inside it)

    RAISES:
        ValueError: if the data type entered is not one of e4, q, shimmer, misc.
                    Previously this case printed the error and then failed with an
                    UnboundLocalError on the return statement.
    '''
    print("Please enter information about your EDA file... ")
    dataType = get_user_input("\tData Type (e4, q, shimmer, or misc): ")

    # E4 recordings are a directory of CSVs rather than a single file
    if dataType == 'e4':
        filepath = get_user_input("\tPath to E4 directory: ")
        filepath_confirm = os.path.join(filepath, "EDA.csv")
        return loadData_E4(filepath), filepath_confirm

    # The remaining formats all take the path of one file
    single_file_loaders = {
        'q': loadData_Qsensor,
        'shimmer': loadData_shimmer,
        'misc': loadData_misc,
    }
    if dataType not in single_file_loaders:
        print("Error: not a valid file choice")
        raise ValueError("not a valid file choice: {0!r}".format(dataType))

    filepath = get_user_input("\tFile path: ")
    data = single_file_loaders[dataType](filepath)
    return data, filepath

def getOutputPath():
    '''Asks user for an output file name and directory.

    OUTPUT:
        fullOutputPath: string, directory joined with file name; '.csv' is appended
                        when the joined path does not already end with it
    '''
    print("")
    print("Where would you like to save the computed output file?")
    file_name = get_user_input('\tFile name: ')
    directory = get_user_input('\tFile directory (./ for this directory): ')

    target = os.path.join(directory, file_name)
    if target.endswith('.csv'):
        return target
    return target + '.csv'

def loadData_Qsensor(filepath):
    '''
    Loads a Q sensor csv export, brings it to 8Hz and adds a low-pass filtered EDA column.
    Note (kept from the original documentation): currently assumes sampling rate of 8hz, 16hz, 32hz;
    if sampling rate is 16hz or 32hz the signal is downsampled

    INPUT:
        filepath:       string, path to input file

    OUTPUT:
        data:           DataFrame, index is a list of timestamps at 8Hz, columns include AccelZ, AccelY, AccelX, Temp, EDA, filtered_eda.
                        None is returned (after printing an error) when reading the header raises IOError.
    '''
    # The first rows of the file carry metadata, not samples
    try:
        header = pd.read_csv(filepath, nrows=5)
    except IOError:
        print("Error!! Couldn't load file, make sure the filepath is correct and you are using a csv from the q sensor software\n\n")
        return

    # Sample rate is the text after the ":" in row index 3 of the parsed header
    rate_text = header.iloc[3, 0].split(":")[1]
    sampleRate = int(rate_text.strip())

    # Samples are read after skipping 7 lines; reset_index moves the parsed index
    # back into a regular column so that six named columns can be assigned
    samples = pd.read_csv(filepath, skiprows=7).reset_index()
    samples.columns = ['AccelZ','AccelY','AccelX','Battery','Temp','EDA']

    # Start time is row index 4 of the header, minus its first 12 and last 10 characters
    start_text = header.iloc[4, 0][12:-10]
    startTime = pd.to_datetime(start_text)

    # Make sure data has a sample rate of 8Hz
    resampled = interpolateDataTo8Hz(samples, sampleRate, startTime)

    # Keep everything except the Battery column
    data = resampled[['AccelZ','AccelY','AccelX','Temp','EDA']]

    # Low-pass butterworth filter on the EDA signal (cutoff:1hz, fs:8hz, order:6)
    data['filtered_eda'] = butter_lowpass_filter(data['EDA'], 1.0, 8, 6)
    return data

def _loadSingleFile_E4(filepath,list_of_columns, expected_sample_rate,freq):
    '''Loads one csv file of an E4 recording and brings it to 8Hz.

    The file is read with its first line as the header: the first header cell is
    parsed as the start time (seconds since the epoch) and the first cell of the
    first data row as the sample rate. All remaining rows are samples.

    INPUT:
        filepath:               string, path to the csv file
        list_of_columns:        list of strings, column names to assign to the samples
        expected_sample_rate:   number, a message is printed when the file's sample rate differs
        freq:                   unused; kept so existing callers keep working

    OUTPUT:
        data:   DataFrame as returned by interpolateDataTo8Hz, columns named list_of_columns
    '''
    # Load data
    data = pd.read_csv(filepath)

    # Get the startTime and sample rate
    startTime = pd.to_datetime(float(data.columns.values[0]),unit="s")
    # Read the cell positionally in one step: data.iloc[0][0] relied on integer keys
    # falling back to positions on a label-indexed Series, which pandas deprecates
    sampleRate = float(data.iloc[0, 0])

    # Drop the sample-rate row and renumber the samples from 0
    data = data.iloc[1:].reset_index(drop=True)

    # Reset the data frame assuming expected_sample_rate
    data.columns = list_of_columns
    if sampleRate != expected_sample_rate:
        print('ERROR, NOT SAMPLED AT {0}HZ. PROBLEMS WILL OCCUR\n'.format(expected_sample_rate))

    # Make sure data has a sample rate of 8Hz
    data = interpolateDataTo8Hz(data,sampleRate,startTime)

    return data


def loadData_E4(filepath):
    '''Loads the EDA.csv, ACC.csv and TEMP.csv files of an E4 directory into one DataFrame.

    INPUT:
        filepath:   string, path to the directory containing the three csv files

    OUTPUT:
        data:   DataFrame built by outer-joining the three loaded files on their index,
                cut to the row count of the shortest of the three; columns are EDA,
                filtered_eda, AccelX, AccelY, AccelZ, Temp
    '''
    def _in_dir(file_name):
        return os.path.join(filepath, file_name)

    # EDA, plus its low-pass butterworth filtered version (cutoff:1hz, fs:8hz, order:6)
    eda = _loadSingleFile_E4(_in_dir('EDA.csv'), ["EDA"], 4, "250L")
    eda['filtered_eda'] = butter_lowpass_filter(eda['EDA'], 1.0, 8, 6)

    # Accelerometer: raw values are divided by 64 (original note: scales to +-2g)
    axes = ["AccelX", "AccelY", "AccelZ"]
    acc = _loadSingleFile_E4(_in_dir('ACC.csv'), ["AccelX","AccelY","AccelZ"], 32, "31250U")
    acc[axes] = acc[axes] / 64.0

    # Temperature
    temp = _loadSingleFile_E4(_in_dir('TEMP.csv'), ["Temp"], 4, "250L")

    merged = eda.join(acc, how='outer').join(temp, how='outer')

    # E4 sometimes records different length files - keep only as many rows as the shortest one
    shortest = min(len(acc), len(eda), len(temp))
    return merged[:shortest]

def loadData_shimmer(filepath):
    '''Loads a tab-separated Shimmer export, resamples it to 8Hz and filters the EDA signal.

    INPUT:
        filepath:   string, path to the tab-separated input file (its first two lines are skipped)

    OUTPUT:
        data:   DataFrame, index is a list of timestamps at 8Hz, columns are
                AccelZ, AccelY, AccelX, Temp, EDA, filtered_eda. Temp is always 0.
    '''
    data = pd.read_csv(filepath, sep='\t', skiprows=(0,1))

    # Map the first header containing each search term onto the column name used in this module
    rename_cols = {}
    for search, new_col in [['Timestamp','Timestamp'],
                            ['Accel_LN_X', 'AccelX'], ['Accel_LN_Y', 'AccelY'], ['Accel_LN_Z', 'AccelZ'],
                            ['Skin_Conductance', 'EDA']]:
        matches = [c for c in data.columns if search in c]
        if matches:
            rename_cols[matches[0]] = new_col

    data = data.rename(columns=rename_cols)

    # TODO: Assuming no temperature is recorded
    data['Temp'] = 0

    # Drop the units row. Take a copy so the assignments below act on an independent
    # frame instead of a slice of the one that was read.
    data = data[data['Timestamp'] != 'ms'].copy()

    # A units row makes pandas read the timestamps as strings; convert them to numbers
    # first, because to_datetime(unit=...) on strings is deprecated in recent pandas.
    data.index = pd.to_datetime(pd.to_numeric(data['Timestamp']), unit='ms')
    data = data[['AccelZ', 'AccelY', 'AccelX', 'Temp', 'EDA']].apply(pd.to_numeric)

    # Convert to 8Hz ("ms" replaces the deprecated "L" alias for milliseconds)
    data = data.resample("125ms").mean()
    data = data.interpolate()

    # Get the filtered data using a low-pass butterworth filter (cutoff:1hz, fs:8hz, order:6)
    data['filtered_eda'] = butter_lowpass_filter(data['EDA'], 1.0, 8, 6)

    return data


def _isPowerOfTwo(text):
    # True when text is a string of digits whose value is a positive power of two (1, 2, 4, ...)
    if not text.isdigit():
        return False
    try:
        value = int(text)
    except ValueError:
        # isdigit() accepts a few characters (e.g. superscript digits) that int() rejects
        return False
    return value > 0 and value & (value - 1) == 0


def _parseStartTime(text):
    # Returns the parsed timestamp, or None when text is not a usable date/time
    try:
        parsed = pd.to_datetime(text)
    except (ValueError, TypeError, OverflowError):
        return None
    if pd.isnull(parsed):
        return None
    return parsed


def loadData_getColNames(data_columns):
    '''Asks the user which columns hold each data stream, the sample rate and the start time.

    Every question is repeated until the answer is valid. The sample rate is checked
    with exact integer arithmetic (the previous floating point log test accepted 0),
    and an unparseable start time is asked for again instead of raising.

    INPUT:
        data_columns:   collection of the column names available in the file

    OUTPUT:
        sampleRate:     int, a positive power of two
        startTime:      timestamp parsed by pd.to_datetime
        new_colnames:   list of 5 column names, in the order
                        EDA, Temperature, Acceleration X, Acceleration Y, Acceleration Z
    '''
    print("Here are the data columns of your file: ")
    print(data_columns)

    # Find the column names for each of the 5 data streams
    colnames = ['EDA data','Temperature data','Acceleration X','Acceleration Y','Acceleration Z']
    new_colnames = []

    for stream in colnames:
        prompt = "Column name that contains "+stream+": "
        chosen = get_user_input(prompt)
        while chosen not in data_columns:
            print("Column not found. Please try again")
            print("Here are the data columns of your file: ")
            print(data_columns)

            chosen = get_user_input(prompt)
        new_colnames.append(chosen)

    # Get user input on sample rate
    sampleRate = get_user_input("Enter sample rate (must be an integer power of 2): ").strip()
    while not _isPowerOfTwo(sampleRate):
        print("Not an integer power of two")
        sampleRate = get_user_input("Enter sample rate (must be a integer power of 2): ").strip()
    sampleRate = int(sampleRate)

    # Get user input on start time
    startTime = _parseStartTime(get_user_input("Enter a start time (format: YYYY-MM-DD HH:MM:SS): "))
    while startTime is None:
        print("Not a valid date/time")
        startTime = _parseStartTime(get_user_input("Enter a start time (format: YYYY-MM-DD HH:MM:SS): "))

    return sampleRate, startTime, new_colnames


def loadData_misc(filepath):
    '''Loads a csv file whose columns, sample rate and start time are supplied by the user.

    INPUT:
        filepath:   string, path to input file

    OUTPUT:
        data:   DataFrame as returned by interpolateDataTo8Hz, with columns
                AccelZ, AccelY, AccelX, Temp, EDA, filtered_eda
    '''
    raw = pd.read_csv(filepath)

    # Ask which columns hold which stream, plus sample rate and start time
    sampleRate, startTime, new_colnames = loadData_getColNames(raw.columns.values)

    # The answers come back in this order, so pair them with the standard names
    standard_names = ['EDA','Temp','AccelX','AccelY','AccelZ']
    mapping = {chosen: standard for chosen, standard in zip(new_colnames, standard_names)}
    renamed = raw.rename(columns=mapping)
    selected = renamed[['AccelZ','AccelY','AccelX','Temp','EDA']]

    # Make sure data has a sample rate of 8Hz
    data = interpolateDataTo8Hz(selected, sampleRate, startTime)

    # Low-pass butterworth filter on the EDA signal (cutoff:1hz, fs:8hz, order:6)
    data['filtered_eda'] = butter_lowpass_filter(data['EDA'], 1.0, 8, 6)
    return data

def interpolateDataTo8Hz(data,sample_rate,startTime):
    '''Puts `data` on a timestamp index at 8Hz that starts at `startTime`.

    INPUT:
        data:           DataFrame with one row per sample
        sample_rate:    number, samples per second of `data`; must be positive
        startTime:      time of the first sample, anything pd.date_range accepts as start

    OUTPUT:
        data:   DataFrame indexed by timestamps 125ms apart, passed through
                interpolateEmptyValues

    RAISES:
        ValueError: if sample_rate is not positive

    Rates below 8Hz are upsampled: rows are stamped 1/sample_rate seconds apart and
    resampled to 125ms bins. Any positive rate below 8 is handled this way; previously
    only 2 and 4 received a timestamp index and other rates made the resample fail.
    Rates above 8Hz are downsampled by keeping every (int(sample_rate) // 8)-th row.
    For rates of 8Hz or below, the index of the DataFrame passed in is overwritten.
    '''
    if sample_rate <= 0:
        raise ValueError("sample_rate must be positive, got {0}".format(sample_rate))

    if sample_rate < 8:
        # Upsample: stamp rows at their own spacing, then resample to 8Hz.
        # The bins that receive no sample are filled by the interpolation below.
        spacing = pd.Timedelta(seconds=1.0 / sample_rate)
        data.index = pd.date_range(start=startTime, periods=len(data), freq=spacing)
        data = data.resample("125ms").mean()
    else:
        if sample_rate > 8:
            # Downsample by taking every step-th row; copy so the in-place
            # interpolation below cannot write into the caller's DataFrame
            step = int(sample_rate) // 8
            data = data.iloc[::step].copy()
        # Set the index to be 8Hz ("ms" replaces the deprecated "L" alias)
        data.index = pd.date_range(start=startTime, periods=len(data), freq="125ms")

    # Interpolate all empty values
    data = interpolateEmptyValues(data)
    return data

def interpolateEmptyValues(data):
    '''Fills missing values column by column using Series.interpolate with its defaults.

    INPUT:
        data:   DataFrame; its columns are overwritten in place

    OUTPUT:
        data:   the same DataFrame object that was passed in
    '''
    for column in data.columns.values:
        filled = data[column].interpolate()
        data.loc[:, column] = filled
    return data

def butter_lowpass(cutoff, fs, order=5):
    '''Designs a digital low-pass Butterworth filter.

    INPUT:
        cutoff: number, cutoff frequency, in the same units as fs
        fs:     number, sampling frequency of the signal to be filtered
        order:  int, order of the filter

    OUTPUT:
        sos:    the filter as second-order sections, as returned by scipy.signal.butter
    '''
    design = dict(btype='low', analog=False, output='sos', fs=fs)
    return scisig.butter(order, cutoff, **design)

def butter_lowpass_filter(data, cutoff, fs, order=5):
    '''Applies the low-pass filter designed by butter_lowpass to a signal.

    INPUT:
        data:   sequence of samples to filter
        cutoff: number, cutoff frequency passed to butter_lowpass
        fs:     number, sampling frequency passed to butter_lowpass
        order:  int, filter order passed to butter_lowpass

    OUTPUT:
        y:      the filtered signal, as returned by scipy.signal.sosfilt
    '''
    sections = butter_lowpass(cutoff, fs, order=order)
    return scisig.sosfilt(sections, data)
Build calc features lib and related packages. 2022-03-21 09:28:28 +01:00			`import pandas as pd`
			`import scipy.signal as scisig`
			`import os`
			`import numpy as np`


			`def get_user_input(prompt):`
			`try:`
			`return raw_input(prompt)`
			`except NameError:`
			`return input(prompt)`


			`def getInputLoadFile():`
			`'''Asks user for type of file and file path. Loads corresponding data.`

			`OUTPUT:`
			`data: DataFrame, index is a list of timestamps at 8Hz, columns include`
			`AccelZ, AccelY, AccelX, Temp, EDA, filtered_eda`
			`'''`
			`print("Please enter information about your EDA file... ")`
			`dataType = get_user_input("\tData Type (e4, q, shimmer, or misc): ")`
			`if dataType=='q':`
			`filepath = get_user_input("\tFile path: ")`
			`filepath_confirm = filepath`
			`data = loadData_Qsensor(filepath)`
			`elif dataType=='e4':`
			`filepath = get_user_input("\tPath to E4 directory: ")`
			`filepath_confirm = os.path.join(filepath,"EDA.csv")`
			`data = loadData_E4(filepath)`
			`elif dataType=='shimmer':`
			`filepath = get_user_input("\tFile path: ")`
			`filepath_confirm = filepath`
			`data = loadData_shimmer(filepath)`
			`elif dataType=="misc":`
			`filepath = get_user_input("\tFile path: ")`
			`filepath_confirm = filepath`
			`data = loadData_misc(filepath)`
			`else:`
			`print("Error: not a valid file choice")`

			`return data, filepath_confirm`

			`def getOutputPath():`
			`print("")`
			`print("Where would you like to save the computed output file?")`
			`outfile = get_user_input('\tFile name: ')`
			`outputPath = get_user_input('\tFile directory (./ for this directory): ')`
			`fullOutputPath = os.path.join(outputPath,outfile)`
			`if fullOutputPath[-4:] != '.csv':`
			`fullOutputPath = fullOutputPath+'.csv'`
			`return fullOutputPath`

			`def loadData_Qsensor(filepath):`
			`'''`
			`This function loads the Q sensor data, uses a lowpass butterworth filter on the EDA signal`
			`Note: currently assumes sampling rate of 8hz, 16hz, 32hz; if sampling rate is 16hz or 32hz the signal is downsampled`

			`INPUT:`
			`filepath: string, path to input file`

			`OUTPUT:`
			`data: DataFrame, index is a list of timestamps at 8Hz, columns include AccelZ, AccelY, AccelX, Temp, EDA, filtered_eda`
			`'''`
			`# Get header info`
			`try:`
			`header_info = pd.io.parsers.read_csv(filepath, nrows=5)`
			`except IOError:`
			`print("Error!! Couldn't load file, make sure the filepath is correct and you are using a csv from the q sensor software\n\n")`
			`return`

			`# Get sample rate`
			`sampleRate = int((header_info.iloc[3,0]).split(":")[1].strip())`

			`# Get the raw data`
			`data = pd.io.parsers.read_csv(filepath, skiprows=7)`
			`data = data.reset_index()`

			`# Reset the index to be a time and reset the column headers`
			`data.columns = ['AccelZ','AccelY','AccelX','Battery','Temp','EDA']`

			`# Get Start Time`
			`startTime = pd.to_datetime(header_info.iloc[4,0][12:-10])`

			`# Make sure data has a sample rate of 8Hz`
			`data = interpolateDataTo8Hz(data,sampleRate,startTime)`

			`# Remove Battery Column`
			`data = data[['AccelZ','AccelY','AccelX','Temp','EDA']]`

			`# Get the filtered data using a low-pass butterworth filter (cutoff:1hz, fs:8hz, order:6)`
			`data['filtered_eda'] = butter_lowpass_filter(data['EDA'], 1.0, 8, 6)`

			`return data`

			`def _loadSingleFile_E4(filepath,list_of_columns, expected_sample_rate,freq):`
			`# Load data`
			`data = pd.read_csv(filepath)`

			`# Get the startTime and sample rate`
			`startTime = pd.to_datetime(float(data.columns.values[0]),unit="s")`
			`sampleRate = float(data.iloc[0][0])`
			`data = data[data.index!=0]`
			`data.index = data.index-1`

			`# Reset the data frame assuming expected_sample_rate`
			`data.columns = list_of_columns`
			`if sampleRate != expected_sample_rate:`
			`print('ERROR, NOT SAMPLED AT {0}HZ. PROBLEMS WILL OCCUR\n'.format(expected_sample_rate))`

			`# Make sure data has a sample rate of 8Hz`
			`data = interpolateDataTo8Hz(data,sampleRate,startTime)`

			`return data`


			`def loadData_E4(filepath):`
			`# Load EDA data`
			`eda_data = _loadSingleFile_E4(os.path.join(filepath,'EDA.csv'),["EDA"],4,"250L")`
			`# Get the filtered data using a low-pass butterworth filter (cutoff:1hz, fs:8hz, order:6)`
			`eda_data['filtered_eda'] = butter_lowpass_filter(eda_data['EDA'], 1.0, 8, 6)`

			`# Load ACC data`
			`acc_data = _loadSingleFile_E4(os.path.join(filepath,'ACC.csv'),["AccelX","AccelY","AccelZ"],32,"31250U")`
			`# Scale the accelometer to +-2g`
			`acc_data[["AccelX","AccelY","AccelZ"]] = acc_data[["AccelX","AccelY","AccelZ"]]/64.0`

			`# Load Temperature data`
			`temperature_data = _loadSingleFile_E4(os.path.join(filepath,'TEMP.csv'),["Temp"],4,"250L")`

			`data = eda_data.join(acc_data, how='outer')`
			`data = data.join(temperature_data, how='outer')`

			`# E4 sometimes records different length files - adjust as necessary`
			`min_length = min(len(acc_data), len(eda_data), len(temperature_data))`

			`return data[:min_length]`

			`def loadData_shimmer(filepath):`
			`data = pd.read_csv(filepath, sep='\t', skiprows=(0,1))`

			`orig_cols = data.columns`
			`rename_cols = {}`

			`for search, new_col in [['Timestamp','Timestamp'],`
			`['Accel_LN_X', 'AccelX'], ['Accel_LN_Y', 'AccelY'], ['Accel_LN_Z', 'AccelZ'],`
			`['Skin_Conductance', 'EDA']]:`
			`orig = [c for c in orig_cols if search in c]`
			`if len(orig) == 0:`
			`continue`
			`rename_cols[orig[0]] = new_col`

			`data.rename(columns=rename_cols, inplace=True)`

			`# TODO: Assuming no temperature is recorded`
			`data['Temp'] = 0`

			`# Drop the units row and unnecessary columns`
			`data = data[data['Timestamp'] != 'ms']`
			`data.index = pd.to_datetime(data['Timestamp'], unit='ms')`
			`data = data[['AccelZ', 'AccelY', 'AccelX', 'Temp', 'EDA']]`

			`for c in ['AccelZ', 'AccelY', 'AccelX', 'Temp', 'EDA']:`
			`data[c] = pd.to_numeric(data[c])`

			`# Convert to 8Hz`
			`data = data.resample("125L").mean()`
			`data.interpolate(inplace=True)`

			`# Get the filtered data using a low-pass butterworth filter (cutoff:1hz, fs:8hz, order:6)`
			`data['filtered_eda'] = butter_lowpass_filter(data['EDA'], 1.0, 8, 6)`

			`return data`


			`def loadData_getColNames(data_columns):`
			`print("Here are the data columns of your file: ")`
			`print(data_columns)`

			`# Find the column names for each of the 5 data streams`
			`colnames = ['EDA data','Temperature data','Acceleration X','Acceleration Y','Acceleration Z']`
			`new_colnames = ['','','','','']`

			`for i in range(len(new_colnames)):`
			`new_colnames[i] = get_user_input("Column name that contains "+colnames[i]+": ")`
			`while (new_colnames[i] not in data_columns):`
			`print("Column not found. Please try again")`
			`print("Here are the data columns of your file: ")`
			`print(data_columns)`

			`new_colnames[i] = get_user_input("Column name that contains "+colnames[i]+": ")`

			`# Get user input on sample rate`
			`sampleRate = get_user_input("Enter sample rate (must be an integer power of 2): ")`
			`while (sampleRate.isdigit()==False) or (np.log(int(sampleRate))/np.log(2) != np.floor(np.log(int(sampleRate))/np.log(2))):`
			`print("Not an integer power of two")`
			`sampleRate = get_user_input("Enter sample rate (must be a integer power of 2): ")`
			`sampleRate = int(sampleRate)`

			`# Get user input on start time`
			`startTime = pd.to_datetime(get_user_input("Enter a start time (format: YYYY-MM-DD HH:MM:SS): "))`
			`while type(startTime)==str:`
			`print("Not a valid date/time")`
			`startTime = pd.to_datetime(get_user_input("Enter a start time (format: YYYY-MM-DD HH:MM:SS): "))`


			`return sampleRate, startTime, new_colnames`


			`def loadData_misc(filepath):`
			`# Load data`
			`data = pd.read_csv(filepath)`

			`# Get the correct colnames`
			`sampleRate, startTime, new_colnames = loadData_getColNames(data.columns.values)`

			`data.rename(columns=dict(zip(new_colnames,['EDA','Temp','AccelX','AccelY','AccelZ'])), inplace=True)`
			`data = data[['AccelZ','AccelY','AccelX','Temp','EDA']]`

			`# Make sure data has a sample rate of 8Hz`
			`data = interpolateDataTo8Hz(data,sampleRate,startTime)`

			`# Get the filtered data using a low-pass butterworth filter (cutoff:1hz, fs:8hz, order:6)`
			`data['filtered_eda'] = butter_lowpass_filter(data['EDA'], 1.0, 8, 6)`

			`return data`

			`def interpolateDataTo8Hz(data,sample_rate,startTime):`
			`if sample_rate<8:`
			`# Upsample by linear interpolation`
			`if sample_rate==2:`
			`data.index = pd.date_range(start=startTime, periods=len(data), freq='500L')`
			`elif sample_rate==4:`
			`data.index = pd.date_range(start=startTime, periods=len(data), freq='250L')`
			`data = data.resample("125L").mean()`
			`else:`
			`if sample_rate>8:`
			`# Downsample`
			`idx_range = list(range(0,len(data))) # TODO: double check this one`
			`data = data.iloc[idx_range[0::int(int(sample_rate)/8)]]`
			`# Set the index to be 8Hz`
			`data.index = pd.date_range(start=startTime, periods=len(data), freq='125L')`

			`# Interpolate all empty values`
			`data = interpolateEmptyValues(data)`
			`return data`

			`def interpolateEmptyValues(data):`
			`cols = data.columns.values`
			`for c in cols:`
			`data.loc[:, c] = data[c].interpolate()`

			`return data`

			`def butter_lowpass(cutoff, fs, order=5):`
			`# Filtering Helper functions`
			`sos = scisig.butter(order, cutoff, btype='low', analog=False, output='sos', fs=fs)`
			`return sos`

			`def butter_lowpass_filter(data, cutoff, fs, order=5):`
			`# Filtering Helper functions`
			`sos = butter_lowpass(cutoff, fs, order=order)`
			`y = scisig.sosfilt(sos, data)`
			`return y`