# musi/utils.py

#Utility functions for working with 3D gaze data.
#Authors: Brendan David-John, Candace Peacock
import itertools
import numpy as np
import pandas as pd
import os
import sys
import math
import pandas as pd
import numpy as np
import json
import glob  # This was missing
from sklearn.preprocessing import scale
from sklearn.utils import resample


# Subject identifiers that read_data() skips; empty means no subject is skipped.
subjs_skip = []
# Substrings that mark a column as a discrete (event-detection) feature; see is_discrete_feat().
discrete_feats = ['sac_det','fix_det']

# Upper bound on gaze speed (deg/s per the velocity functions' comments); samples above it
# are treated as outliers and interpolated over by the gaze-velocity functions.
#https://pubmed.ncbi.nlm.nih.gov/25713524/ & https://www.sciencedirect.com/science/article/abs/pii/0025556475900759
max_sacc_speed = 800

#https://www.geeksforgeeks.org/python-make-a-list-of-intervals-with-sequential-numbers/
def intervals_extract(iterable):
    """Yield a ``[first, last]`` pair for every run of consecutive integers.

    The input is de-duplicated and sorted before grouping, so neither the
    order nor repeated values in ``iterable`` affect the result.
    """
    ordered = sorted(set(iterable))
    # Inside a run of consecutive values, (value - position) stays constant,
    # so grouping on that offset splits the sequence into its runs.
    runs = itertools.groupby(enumerate(ordered), key=lambda pair: pair[1] - pair[0])
    for _offset, members in runs:
        run = [value for _position, value in members]
        yield [run[0], run[-1]]

def intervals_from_bool(bools):
    """Find consecutive chunks of True values and return their start/end indices.

    Args:
        bools: 1D numpy array (anything np.asarray accepts); elements equal to
            True are treated as set.

    Returns:
        Integer array of shape (M, 2): one row per run of True values, with the
        first column holding the start index and the second the (inclusive)
        end index. When nothing is set the result is an empty *integer* array
        of shape (0, 2), so it can always be used directly as an index array.
    """
    # `== True` keeps the original selection rule for non-boolean input.
    true_idx = np.flatnonzero(np.asarray(bools) == True)

    if true_idx.size == 0:
        # Reshaping an empty Python list would produce a float array, which
        # numpy refuses to use for indexing; return an explicit integer array.
        return np.empty((0, 2), dtype=int)

    # A run ends wherever the next set index is not the immediate successor.
    breaks = np.flatnonzero(np.diff(true_idx) != 1)
    starts = np.concatenate(([true_idx[0]], true_idx[breaks + 1]))
    ends = np.concatenate((true_idx[breaks], [true_idx[-1]]))

    return np.column_stack((starts, ends))

def compute_gaze_velocity(gaze_vectors, dt):
    """Compute angular gaze velocity using the numerically stable atan2 method.

    Args:
        gaze_vectors: Nx3 array with N 3D gaze vectors (x, y, z), assumed to be
            unit length.
        dt: timestep(s) dividing the N-1 sample-to-sample angles; must
            broadcast against an array of N-1 elements.
            NOTE(review): the previous header described dt as having N
            elements, which does not broadcast against the N-1 angles computed
            here -- confirm what callers pass.

    Returns:
        1D array with N elements containing angular velocity in degrees per
        unit of dt. The first element is always 0 so the size matches the input.
    """
    prev_vectors = gaze_vectors[:-1, :]
    next_vectors = gaze_vectors[1:, :]

    # angle(u, v) = 2 * atan2(|u - v|, |u + v|) for unit vectors u and v
    diff_norm = np.sqrt(np.square(prev_vectors - next_vectors).sum(axis=1))
    sum_norm = np.sqrt(np.square(prev_vectors + next_vectors).sum(axis=1))
    gaze_dist = 2 * np.arctan2(diff_norm, sum_norm)
    gaze_velocity = np.rad2deg(gaze_dist) / dt

    # make nans and infs zero
    np.nan_to_num(gaze_velocity, copy=False, posinf=0.0, neginf=0.0)

    # correct outliers over max_sacc_speed by interpolating across good samples
    good_idx = (gaze_velocity <= max_sacc_speed).nonzero()[0]
    if good_idx.size == 0:
        # np.interp raises on an empty set of sample points (single-sample
        # input, or every sample is an outlier); fall back to zero velocity.
        gaze_velocity = np.zeros(gaze_velocity.shape[0], dtype=float)
    else:
        gaze_velocity = np.interp(
            np.arange(gaze_velocity.shape[0]), good_idx, gaze_velocity[good_idx]
        )

    # prepend zero for the first sample, so size matches input
    return np.insert(gaze_velocity, 0, 0.0)

def compute_gaze_velocity_naive(gaze_vectors, dt):
    """Compute angular gaze velocity using the arccos-of-dot-product method.

    Args:
        gaze_vectors: Nx3 array with N 3D gaze vectors (x, y, z), assumed to be
            unit length.
        dt: timestep(s) dividing the N-1 sample-to-sample angles; must
            broadcast against an array of N-1 elements.
            NOTE(review): the previous header described dt as having N
            elements, which does not broadcast against the N-1 angles computed
            here -- confirm what callers pass.

    Returns:
        1D array with N elements containing angular velocity in degrees per
        unit of dt. The first element is always 0 so the size matches the input.
    """
    prev_vectors = gaze_vectors[:-1, :]
    next_vectors = gaze_vectors[1:, :]

    # arccos of the dot product; a dot product outside [-1, 1] (rounding)
    # yields NaN, which is zeroed just below.
    gaze_dist_naive = np.arccos(np.sum(prev_vectors * next_vectors, axis=1))
    gaze_velocity_naive = np.rad2deg(gaze_dist_naive) / dt

    # make nans and infs zero
    np.nan_to_num(gaze_velocity_naive, copy=False, posinf=0.0, neginf=0.0)

    # correct outliers over max_sacc_speed by interpolating across good samples
    good_idx = (gaze_velocity_naive <= max_sacc_speed).nonzero()[0]
    if good_idx.size == 0:
        # np.interp raises on an empty set of sample points (single-sample
        # input, or every sample is an outlier); fall back to zero velocity.
        gaze_velocity_naive = np.zeros(gaze_velocity_naive.shape[0], dtype=float)
    else:
        gaze_velocity_naive = np.interp(
            np.arange(gaze_velocity_naive.shape[0]),
            good_idx,
            gaze_velocity_naive[good_idx],
        )

    # prepend zero for the first sample, so size matches input
    return np.insert(gaze_velocity_naive, 0, 0.0)

def saccade_classification_ivt(gaze_velocity, time_stamps, ivt_threshold=100.0, min_dur=.012, max_dur=2.0):
    """Classify gaze samples as saccade events using velocity thresholding (I-VT).

    Args:
        gaze_velocity: 1D array with N elements representing gaze velocity in degrees.
        time_stamps: 1D array with N elements representing timestamps in seconds.
        ivt_threshold: scalar in degrees; samples strictly above it are saccade candidates.
        min_dur: scalar in seconds; saccades shorter than this are discarded.
        max_dur: scalar in seconds; saccades longer than this are discarded.

    Returns:
        2-element tuple:
        - 1D boolean array with N elements, True where a sample belongs to a
          retained saccade.
        - Mx2 integer array, one row per retained saccade: first column is the
          start index, second the (inclusive) end index.
    """
    # find samples over threshold
    sac_bool = gaze_velocity > ivt_threshold

    # Force an integer (M, 2) index array: when no sample exceeds the threshold
    # the helper may hand back an empty float array, and numpy rejects float
    # arrays as indices even when they are empty.
    sac_intervals = np.asarray(intervals_from_bool(sac_bool)).astype(int).reshape(-1, 2)

    # duration of each candidate, from its first to its last sample
    sacc_durs = time_stamps[sac_intervals[:, 1]] - time_stamps[sac_intervals[:, 0]]
    bad = np.asarray((sacc_durs < min_dur) | (sacc_durs > max_dur))

    # un-mark the samples of every rejected interval (end index is inclusive)
    for start, end in sac_intervals[bad]:
        sac_bool[start:end + 1] = False

    return (sac_bool, sac_intervals[~bad])

def fixation_classification_ivt(gaze_velocity, time_stamps, ivt_threshold=20.0, min_dur=0.100, max_dur=2.0):
    """Classify gaze samples as fixation events using velocity thresholding (I-VT).

    Args:
        gaze_velocity: 1D array with N elements representing gaze velocity in degrees.
        time_stamps: 1D array with N elements representing timestamps in seconds.
        ivt_threshold: scalar in degrees; samples strictly below it are fixation candidates.
        min_dur: scalar in seconds; fixations shorter than this are discarded.
        max_dur: scalar in seconds; fixations longer than this are discarded.

    Returns:
        2-element tuple:
        - 1D boolean array with N elements, True where a sample belongs to a
          retained fixation.
        - Mx2 integer array, one row per retained fixation: first column is the
          start index, second the (inclusive) end index.
    """
    # find samples under threshold
    fix_bool = gaze_velocity < ivt_threshold

    # Force an integer (M, 2) index array: when no sample is below the
    # threshold the helper may hand back an empty float array, and numpy
    # rejects float arrays as indices even when they are empty.
    fix_intervals = np.asarray(intervals_from_bool(fix_bool)).astype(int).reshape(-1, 2)

    # duration of each candidate, from its first to its last sample
    fix_durs = time_stamps[fix_intervals[:, 1]] - time_stamps[fix_intervals[:, 0]]
    bad = np.asarray((fix_durs < min_dur) | (fix_durs > max_dur))

    # un-mark the samples of every rejected interval (end index is inclusive)
    for start, end in fix_intervals[bad]:
        fix_bool[start:end + 1] = False

    return (fix_bool, fix_intervals[~bad])
def compute_dispersion(gaze_vectors):
    """Compute the angular dispersion of a set of 3D gaze vectors.

    Dispersion is the maximum angular displacement of any sample in the window
    from the window's centroid direction.

    Args:
        gaze_vectors: Nx3 array with N 3D gaze vectors (x, y, z), assumed to be
            unit length.

    Returns:
        Maximum dispersion in degrees.
    """
    centroid = np.mean(gaze_vectors, 0)

    # The atan2 formula below only yields the angle when both vectors have the
    # same length, and the mean of unit vectors is shorter than 1, so rescale
    # the centroid to unit length (left untouched if it is degenerate).
    centroid_norm = np.sqrt(np.square(centroid).sum())
    if centroid_norm > 0:
        centroid = centroid / centroid_norm

    # numerically stable method: 2*atan2(norm(u-v), norm(u+v)).
    # The sum over components must happen *inside* the square root so these are
    # Euclidean norms; sqrt(square(x)).sum() would be the L1 norm instead.
    diff_norm = np.sqrt(np.square(gaze_vectors - centroid).sum(axis=1))
    sum_norm = np.sqrt(np.square(gaze_vectors + centroid).sum(axis=1))
    gaze_dist = np.rad2deg(2 * np.arctan2(diff_norm, sum_norm))

    # make nans and infs zero
    np.nan_to_num(gaze_dist, copy=False, posinf=0.0, neginf=0.0)

    return np.max(gaze_dist)

def fixation_classification_idt(gaze_vectors, time_stamps, sac_bool, min_dur=0.100, max_dur=2.0, max_disp=1.0):
    """Classify gaze samples as fixation events using dispersion thresholding (I-DT).

    A window covering min_dur seconds is opened at the current sample; if its
    angular dispersion is within max_disp the window is grown one sample at a
    time until the dispersion is exceeded or the data ends.
    Implementation based on: https://github.com/ecekt/eyegaze/blob/master/gaze.py
    and https://github.com/ASAPLableni/VR-centred_I-DT_algorithm

    Args:
        gaze_vectors: Nx3 array with N 3D gaze vectors (x, y, z).
        time_stamps: 1D array with N elements representing timestamps in seconds.
        sac_bool: 1D boolean array marking samples already labelled as saccades
            by I-VT; those samples are never labelled as fixation.
        min_dur: scalar in seconds; also the initial window length. Fixations
            shorter than this are discarded.
        max_dur: scalar in seconds; fixations longer than this are discarded.
        max_disp: scalar in degrees; maximum dispersion allowed within a window.

    Returns:
        2-element tuple:
        - 1D boolean array with N elements, True where a sample belongs to a
          retained fixation.
        - Mx2 integer array, one row per retained fixation: first column is the
          start index, second the (inclusive) end index.
    """
    num_samples = len(gaze_vectors)
    fix_bool = np.zeros(sac_bool.shape, dtype=bool)

    current = 0  # first sample of the window under test
    last = 0     # last sample (inclusive) of the window under test

    while current < num_samples:
        t0 = time_stamps[current]  # beginning time
        t1 = t0 + min_dur          # time after a min. fixation duration has been observed

        # extend the initial window to the last sample inside [t0, t1]
        for r in range(current, num_samples):
            if time_stamps[r] >= t0 and time_stamps[r] <= t1:
                last = r
            elif time_stamps[r] > t1:
                break

        dispersion = compute_dispersion(gaze_vectors[current:last + 1, :])

        if dispersion <= max_disp:
            # grow the window until the threshold is exceeded or data runs out
            while dispersion <= max_disp and last + 1 < num_samples:
                last += 1
                dispersion = compute_dispersion(gaze_vectors[current:last + 1, :])

            # NOTE(review): as in the referenced implementation, the sample
            # that pushed the dispersion over the threshold is included in the
            # fixation -- confirm this is intended.
            fix_bool[current:last + 1] = True

            current = last + 1  # move the pointer to a fresh window
        else:
            current += 1
            last = current

    # a sample cannot be both saccade and fixation: saccade labels win
    fix_bool[fix_bool & (sac_bool == True)] = False

    # Force an integer (M, 2) index array: with no fixation samples the helper
    # may hand back an empty float array, which numpy rejects as an index.
    fix_intervals = np.asarray(intervals_from_bool(fix_bool)).astype(int).reshape(-1, 2)

    # reject fixations shorter than the minimum or longer than the maximum duration
    fix_durs = time_stamps[fix_intervals[:, 1]] - time_stamps[fix_intervals[:, 0]]
    bad = np.asarray((fix_durs < min_dur) | (fix_durs > max_dur))

    # un-mark the samples of every rejected interval (end index is inclusive)
    for start, end in fix_intervals[bad]:
        fix_bool[start:end + 1] = False

    return (fix_bool, fix_intervals[~bad])

def compute_feature_signal_time_avg(features,fixation_times,time_stamps,N):
    """Build a continuous feature signal by averaging events from the last N seconds.

    Inputs: features: 2D array with rows corresponding to events, and columns
                corresponding to feature values
            fixation_times: 1D array containing the timestamp of each row of features
            time_stamps: 1D array representing every sample in the time series,
                containing times in seconds
            N: scalar, number of seconds to look back from each sample
    Output: 2D array with one row per sample in time_stamps and one column per
            feature; a row is all zeros when no event falls in its window.

    NOTE(review): this function is defined again, identically, further down in
    this file; the later definition is the one bound at import time.
    """
    n_samples = len(time_stamps)
    signal = np.zeros((n_samples, features.shape[1]), dtype=float)

    for row in range(n_samples):
        now = time_stamps[row]

        # events whose time lies in the closed window [now - N, now]
        in_window = (fixation_times <= now) & (fixation_times >= now - N)
        recent = features[in_window, :]

        # rows without any event keep their initial zeros
        if len(recent) > 0:
            signal[row, :] = np.mean(recent, 0)

    return signal

def compute_feature_signal_event_avg(features,fixation_times,time_stamps,N):
    """Build a continuous feature signal by averaging over the past N events.

    Inputs: features: 2D array with rows corresponding to events, and columns
                corresponding to feature values
            fixation_times: 1D array containing the timestamp of each row of features
            time_stamps: 1D array representing every sample in the time series,
                containing times in seconds
            N: scalar, number of most recent events to average (used as the
                slice ``[-N:]``, so N == 0 keeps every prior event)
    Output: 2D array with one row per sample in time_stamps and one column per
            feature; a row is all zeros when no event precedes the sample.

    NOTE(review): this function is defined again, identically, further down in
    this file; the later definition is the one bound at import time.
    """
    n_samples = len(time_stamps)
    signal = np.zeros((n_samples, features.shape[1]), dtype=float)

    for row in range(n_samples):
        # every event that happened at or before this sample...
        elapsed = features[fixation_times <= time_stamps[row], :]

        # ...of which only the trailing N are averaged
        latest = elapsed[-N:, :]

        # rows without any prior event keep their initial zeros
        if len(latest) > 0:
            signal[row, :] = np.mean(latest, 0)

    return signal

#wrapper function to call feature signal methods
def compute_feature_signal(method,features,event_times,time_stamps,N):
    """Dispatch to the feature-signal routine selected by ``method``.

    ``method`` is one of 'time_avg', 'event_avg' or 'event_discrete'; any other
    value returns None. The remaining arguments are forwarded unchanged.
    """
    args = (features, event_times, time_stamps, N)

    if method == 'time_avg':
        return compute_feature_signal_time_avg(*args)
    if method == 'event_avg':
        return compute_feature_signal_event_avg(*args)
    if method == 'event_discrete':
        # NOTE(review): compute_feature_signal_event_discrete is not defined in
        # the visible part of this file -- confirm it exists elsewhere.
        return compute_feature_signal_event_discrete(*args)
    return None

def head_correction(df, param):
    """Return head-corrected gaze vectors as a three-column (x, y, z) array.

    Args:
        df: DataFrame holding the gaze direction in the columns
            'combined.gazeDirection.{x,y,z}', plus either
            'head.right.*', 'head.up.*', 'head.dir.*' (for 'coordinate_frame')
            or 'RotationX/Y/Z/W' (for 'quaternions').
        param: object exposing ``transform_type``, one of 'coordinate_frame'
            or 'quaternions'.

    Returns:
        Nx3 array of transformed gaze vectors with every row re-normalised to
        unit length. For any other ``transform_type`` an all-zero Nx3 array is
        returned.
    """
    gaze_vectors = df[['combined.gazeDirection.x','combined.gazeDirection.y','combined.gazeDirection.z']].values
    transformed_gaze_vectors = np.zeros(gaze_vectors.shape)

    if param.transform_type == 'coordinate_frame':
        # NOTE(review): the previous code read head.right.* into `head_up` and
        # head.up.* into `head_right`, which swapped the x and y components of
        # the result even for an identity head frame. Each axis now reads its
        # own columns -- confirm against the recording format.
        head_right = df[['head.right.x','head.right.y','head.right.z']].values
        head_up    = df[['head.up.x','head.up.y','head.up.z']].values
        head_dir   = df[['head.dir.x','head.dir.y','head.dir.z']].values

        # Multiplying by the matrix with rows (R, U, D) is the same as
        # projecting the gaze vector onto each head axis; the homogeneous
        # coordinate of the former 4x4 matrix never affected x, y, z.
        transformed_gaze_vectors = np.column_stack((
            np.sum(head_right * gaze_vectors, axis=1),
            np.sum(head_up * gaze_vectors, axis=1),
            np.sum(head_dir * gaze_vectors, axis=1),
        ))

    elif param.transform_type == 'quaternions':
        q_x = df['RotationX'].values
        q_y = df['RotationY'].values
        q_z = df['RotationZ'].values
        q_w = df['RotationW'].values

        ww, xx, yy, zz = np.square(q_w), np.square(q_x), np.square(q_y), np.square(q_z)

        # Entries of the applied matrix, i.e. the transpose of the standard
        # rotation matrix of the unit quaternion (w, x, y, z). The previous
        # code used ww + xx - yy - zz for all three diagonal terms and had the
        # wrong sign on the y*z term of r12 and the x*z term of r20, so the
        # matrix was not a rotation (e.g. a 180 degree turn gave the identity).
        r00 = ww + xx - yy - zz
        r01 = 2 * (q_w * q_z + q_x * q_y)
        r02 = 2 * (q_x * q_z - q_w * q_y)

        r10 = 2 * (q_x * q_y - q_w * q_z)
        r11 = ww - xx + yy - zz
        r12 = 2 * (q_w * q_x + q_y * q_z)

        r20 = 2 * (q_w * q_y + q_x * q_z)
        r21 = 2 * (q_y * q_z - q_w * q_x)
        r22 = ww - xx - yy + zz

        g_x, g_y, g_z = gaze_vectors[:, 0], gaze_vectors[:, 1], gaze_vectors[:, 2]

        # per-row matrix-vector product, written out component by component
        transformed_gaze_vectors = np.column_stack((
            r00 * g_x + r01 * g_y + r02 * g_z,
            r10 * g_x + r11 * g_y + r12 * g_z,
            r20 * g_x + r21 * g_y + r22 * g_z,
        ))

    else:
        # unknown transform type: keep returning the zero array
        return transformed_gaze_vectors

    # normalize all rows again to account for any rounding errors
    row_norm_vals = np.sqrt(np.square(transformed_gaze_vectors).sum(axis=1))
    transformed_gaze_vectors = transformed_gaze_vectors / row_norm_vals[:, None]

    return transformed_gaze_vectors

def window_signal_overlap(X, y, window_size=10):
    """Cut overlapping, flattened windows of X around every labelled sample.

    Args:
        X: 2D array of shape (M, N), one row per sample.
        y: label array indexed by sample; samples labelled 1 or -1 are used
            (all 1-labelled samples first, then all -1-labelled ones).
        window_size: number of samples per window.

    Returns:
        Tuple (windows, lbls). windows has one flattened row of
        window_size * N values per window, zero-padded where the window was
        clipped; it is an empty 1D array when no window is produced. lbls is a
        (K, 1) integer array holding the label of the sample each window was
        built around.
    """
    M, N = X.shape
    rows = []
    lbls = []

    labelled = np.hstack((np.where(y == 1)[0], np.where(y == -1)[0]))

    for i in labelled:
        starts = range(max(0, i - window_size + 1), i + 1)
        # NOTE(review): this range stops one short of i + window_size, so the
        # window that would start at sample i itself is never produced --
        # confirm whether that is intended.
        ends = range(i + 1, min(M, i + window_size))

        for s, e in zip(starts, ends):
            window = X[s:e]
            curr_size = window.shape[0]
            window = window.reshape(-1, curr_size * N)

            # window is clipped from front or back, add zero padding
            if curr_size < window_size:
                padding = np.zeros(shape=(1, (window_size - curr_size) * N))
                if i - window_size < 0:
                    window = np.hstack((padding, window))
                else:
                    window = np.hstack((window, padding))

            rows.append(window)
            lbls.append(y[i])

    # Stack once at the end: re-stacking after every window copied the whole
    # result each time, and a lone unpadded window could alias X's memory.
    windows = np.vstack(rows) if rows else np.array([])
    lbls = np.array(lbls, dtype=int).reshape(-1, 1)
    return (windows, lbls)


def window_signal_center(X, y, window_size=10):
    """Cut one flattened window of X centred on every labelled sample.

    Args:
        X: 2D array of shape (M, N), one row per sample.
        y: label array indexed by sample; samples labelled 1 or -1 are used
            (all 1-labelled samples first, then all -1-labelled ones).
        window_size: nominal number of samples per window. The window spans
            int(window_size / 2) samples before the labelled sample up to (not
            including) int(window_size / 2) samples after it, so an odd
            window_size always leaves one slot to be zero-padded.

    Returns:
        Tuple (windows, lbls). windows has one flattened row of
        window_size * N values per labelled sample, zero-padded in front when
        the window was clipped at the start of X and at the back otherwise; it
        is an empty 1D array when there are no labelled samples. lbls is a
        (K, 1) integer array of the corresponding labels.
    """
    M, N = X.shape
    half = int(window_size / 2)
    rows = []
    lbls = []

    labelled = np.hstack((np.where(y == 1)[0], np.where(y == -1)[0]))

    for i in labelled:
        start = max(0, i - half)
        end = min(M, i + half)

        window = X[start:end, :]
        curr_size = window.shape[0]
        window = window.reshape(-1, curr_size * N)

        # window is clipped from front or back, add zero padding
        if curr_size < window_size:
            padding = np.zeros(shape=(1, (window_size - curr_size) * N))
            if i - half < 0:
                window = np.hstack((padding, window))
            else:
                window = np.hstack((window, padding))

        rows.append(window)
        lbls.append(y[i])

    # Stack once at the end: re-stacking after every window copied the whole
    # result each time, and a lone unpadded window could alias X's memory.
    windows = np.vstack(rows) if rows else np.array([])
    lbls = np.array(lbls, dtype=int).reshape(-1, 1)
    return (windows, lbls)

def window_signal_start(X, y, window_size=10):
    """Cut one flattened window of X starting at every labelled sample.

    Args:
        X: 2D array of shape (M, N), one row per sample.
        y: label array indexed by sample; samples labelled 1 or -1 are used
            (all 1-labelled samples first, then all -1-labelled ones).
        window_size: number of samples per window.

    Returns:
        Tuple (windows, lbls). windows has one flattened row of
        window_size * N values per labelled sample, zero-padded at the back
        when the window runs past the end of X; it is an empty 1D array when
        there are no labelled samples. lbls is a (K, 1) integer array of the
        corresponding labels.
    """
    M, N = X.shape
    rows = []
    lbls = []

    labelled = np.hstack((np.where(y == 1)[0], np.where(y == -1)[0]))

    for i in labelled:
        end = min(M, i + window_size)

        window = X[i:end, :]
        curr_size = window.shape[0]
        window = window.reshape(-1, curr_size * N)

        # window is clipped at the back, add zero padding
        if curr_size < window_size:
            padding = np.zeros(shape=(1, (window_size - curr_size) * N))
            window = np.hstack((window, padding))

        rows.append(window)
        lbls.append(y[i])

    # Stack once at the end: re-stacking after every window copied the whole
    # result each time, and a lone unpadded window could alias X's memory.
    windows = np.vstack(rows) if rows else np.array([])
    lbls = np.array(lbls, dtype=int).reshape(-1, 1)
    return (windows, lbls)

def window_signal(X, y, mode='center', window_size=10):
    """Window X around the labelled samples using the strategy named by ``mode``.

    ``mode`` is 'center', 'start' or 'overlap'; any other value returns None.
    """
    if mode == 'center':
        return window_signal_center(X, y, window_size=window_size)
    if mode == 'start':
        return window_signal_start(X, y, window_size=window_size)
    if mode == 'overlap':
        return window_signal_overlap(X, y, window_size=window_size)
    return None

def get_header(features, window_size):
    """Build the comma-separated CSV header for windowed feature data.

    Every feature name is suffixed with the window position
    (``name_0``, ..., ``name_<window_size-1>``), grouped by position, and the
    final column is ``label``.

    NOTE(review): this function is defined again, identically, at the end of
    this file; the later definition is the one bound at import time.
    """
    header = ""
    for step in range(window_size):
        chunk = ",".join('%s_%d' % (name, step) for name in features)
        # the first non-empty chunk starts the header; later ones are appended
        header = '%s,%s' % (header, chunk) if header else chunk
    return '%s,%s' % (header, 'label')

def is_discrete_feat(feat):
    """Return True when ``feat`` contains any of the ``discrete_feats`` markers."""
    return any(marker in feat for marker in discrete_feats)

def read_data(path):
    """Load every CSV file found directly under ``path`` and stack the contents.

    The subject id is the file name up to its first '.' and first '_';
    subjects listed in ``subjs_skip`` are ignored.

    Returns a tuple (X, y, subjs, discrete_cols, cols):
        X: all columns except 'label', stacked row-wise over the files
        y: the 'label' column as a column vector
        subjs: column vector holding the subject id of every row
        discrete_cols: column names flagged by is_discrete_feat()
        cols: the column index of the file used to derive discrete_cols
              (this still includes 'label')
    X, y and subjs are empty lists when no file is read.
    """
    X, y, subjs = [], [], []
    discrete_cols, cols = [], []

    for file in glob.glob(os.path.join(path, '*')):
        subj = file.split(os.sep)[-1].split('.')[0].split('_')[0]
        if subj in subjs_skip:
            continue

        data = pd.read_csv(file)

        # derive the discrete columns until a file yields at least one
        if not len(discrete_cols):
            cols = data.columns
            discrete_cols = [col for col in cols if is_discrete_feat(col)]

        labels = data['label'].to_numpy().reshape(-1, 1)
        y = labels if not len(y) else np.vstack((y, labels))

        curr_X = data[data.columns[data.columns != 'label']].to_numpy()
        X = curr_X if not len(X) else np.vstack((X, curr_X))

        # one subject id per row of this file
        curr_subjs = np.repeat(subj, curr_X.shape[0]).reshape(-1, 1)
        subjs = curr_subjs if not len(subjs) else np.vstack((subjs, curr_subjs))

    return (X, y, subjs, discrete_cols, cols)


def read_feat_data(path):
    """Read the feature files in directory ``path`` into one pandas DataFrame.

    Every file is read as CSV with its first column as index. File names are
    expected to look like ``<subject>_<block>[_...].<ext>``; the subject and
    block are added as the 'sbj' and 'block' columns.

    Returns:
        The row-wise concatenation of all files, or an empty DataFrame when
        the directory contains no files.

    Raises:
        IndexError: if a file name contains no '_' separator.
    """
    frames = []
    # sorted() makes the row order deterministic; glob's own order is arbitrary
    for file in sorted(glob.glob(os.path.join(path, '*'))):
        data = pd.read_csv(file, dtype={'selection_type': 'str'}, index_col=0)

        filename = file.split(os.sep)[-1].split('.')[0]
        name_parts = filename.split('_')
        data['sbj'] = name_parts[0]
        data['block'] = name_parts[1]

        frames.append(data)

    if not frames:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    return pd.concat(frames)

def read_window_data(main_dir, MODE, WINDOW_SIZE, data_type='train'):
    """Read the windowed data set stored under ``main_dir``.

    The files are looked up in ``<main_dir>/w_<MODE>_<WINDOW_SIZE>/<data_type>``.

    Returns:
        The (X, y, subjs, discrete_cols, cols) tuple produced by read_data().
    """
    # The directory used to be computed and then ignored (main_dir was read
    # instead), which made MODE, WINDOW_SIZE and data_type have no effect.
    data_dir = os.path.join(main_dir, 'w_%s_%d' % (MODE, WINDOW_SIZE), data_type)
    return read_data(data_dir)


def normalize_subj_data(X, y, subjs, exclude_idxs):
    """Standardise the feature columns separately for every subject.

    Args:
        X: 2D feature array, one row per sample.
        y: labels, indexed by the same rows as X.
        subjs: subject id of every row.
        exclude_idxs: column indices of X that are left unscaled; when empty,
            every column is scaled.

    Returns:
        Tuple (X_out, y_out) with the per-subject blocks stacked in the order
        of np.unique(subjs); both are empty lists when there are no subjects.
    """
    X_out, y_out = [], []
    for sbj in np.unique(subjs):
        subj_idxs = np.where(subjs == sbj)[0]
        # fancy indexing copies, so the caller's X is not modified below
        X_subj, y_subj = X[subj_idxs], y[subj_idxs]

        if X_subj.shape[0] < 1:
            continue

        if len(exclude_idxs):
            mask = ~np.isin(np.arange(X.shape[1]), exclude_idxs)
            X_subj[:, mask] = scale(X_subj[:, mask])
        else:
            # was assigned to the misspelled name `X_Subj`, so the scaled data
            # was silently discarded whenever exclude_idxs was empty
            X_subj = scale(X_subj)

        if not len(X_out):
            X_out = X_subj
            y_out = y_subj
        else:
            X_out = np.vstack((X_out, X_subj))
            y_out = np.vstack((y_out, y_subj))
    return X_out, y_out

def get_feat_indxs(columns, query_cols):
    """Return the positions of the columns whose base name is in ``query_cols``.

    The base name is the column name with its last '_'-separated token
    removed; a name without any '_' therefore has the empty string as base name.
    """
    return [
        position
        for position, column in enumerate(columns)
        if '_'.join(column.split('_')[:-1]) in query_cols
    ]

def filter_features(featfilepath, X, headers, count_thresh=1):
    """Keep only the columns of X that belong to sufficiently frequent features.

    Args:
        featfilepath: CSV file with a 'feat' column (feature base names) and a
            'num' column (a count per feature).
        X: 2D array whose columns are described by ``headers``.
        headers: column names of X, matched through get_feat_indxs().
        count_thresh: a feature is kept when its 'num' is strictly greater
            than this value.

    Returns:
        X restricted to the selected columns.
    """
    # `squeeze=True` was dropped: pandas 2.0 removed the parameter, and it had
    # no effect on a file with both a 'num' and a 'feat' column.
    feat_counter = pd.read_csv(featfilepath)

    imp_indxs = np.where(feat_counter['num'] > count_thresh)[0]
    important_feats = feat_counter['feat'].values[imp_indxs]
    indxs = get_feat_indxs(headers, important_feats)
    return X[:, indxs]


def compute_feature_signal_time_avg(features,fixation_times,time_stamps,N):
    """Build a continuous feature signal by averaging events from the last N seconds.

    Inputs: features: 2D array with rows corresponding to events, and columns
                corresponding to feature values
            fixation_times: 1D array containing the timestamp of each row of features
            time_stamps: 1D array representing every sample in the time series,
                containing times in seconds
            N: scalar, number of seconds to look back from each sample
    Output: 2D array with one row per sample in time_stamps and one column per
            feature; a row is all zeros when no event falls in its window.

    NOTE(review): this repeats an identical definition that appears earlier in
    this file; being later, this one is the binding that wins at import time.
    """
    n_samples = len(time_stamps)
    signal = np.zeros((n_samples, features.shape[1]), dtype=float)

    for row in range(n_samples):
        now = time_stamps[row]

        # events whose time lies in the closed window [now - N, now]
        in_window = (fixation_times <= now) & (fixation_times >= now - N)
        recent = features[in_window, :]

        # rows without any event keep their initial zeros
        if len(recent) > 0:
            signal[row, :] = np.mean(recent, 0)

    return signal

def compute_feature_signal_event_avg(features,fixation_times,time_stamps,N):
    """Build a continuous feature signal by averaging over the past N events.

    Inputs: features: 2D array with rows corresponding to events, and columns
                corresponding to feature values
            fixation_times: 1D array containing the timestamp of each row of features
            time_stamps: 1D array representing every sample in the time series,
                containing times in seconds
            N: scalar, number of most recent events to average (used as the
                slice ``[-N:]``, so N == 0 keeps every prior event)
    Output: 2D array with one row per sample in time_stamps and one column per
            feature; a row is all zeros when no event precedes the sample.

    NOTE(review): this repeats an identical definition that appears earlier in
    this file; being later, this one is the binding that wins at import time.
    """
    n_samples = len(time_stamps)
    signal = np.zeros((n_samples, features.shape[1]), dtype=float)

    for row in range(n_samples):
        # every event that happened at or before this sample...
        elapsed = features[fixation_times <= time_stamps[row], :]

        # ...of which only the trailing N are averaged
        latest = elapsed[-N:, :]

        # rows without any prior event keep their initial zeros
        if len(latest) > 0:
            signal[row, :] = np.mean(latest, 0)

    return signal

#wrapper function to call feature signal methods
def compute_feature_signal(method,features,event_times,time_stamps,N):
    """Dispatch to the feature-signal routine selected by ``method``.

    ``method`` is one of 'time_avg', 'event_avg' or 'event_discrete'; any other
    value returns None. The remaining arguments are forwarded unchanged.

    NOTE(review): this repeats an identical definition that appears earlier in
    this file; being later, this one is the binding that wins at import time.
    """
    args = (features, event_times, time_stamps, N)

    if method == 'time_avg':
        return compute_feature_signal_time_avg(*args)
    if method == 'event_avg':
        return compute_feature_signal_event_avg(*args)
    if method == 'event_discrete':
        # NOTE(review): compute_feature_signal_event_discrete is not defined in
        # the visible part of this file -- confirm it exists elsewhere.
        return compute_feature_signal_event_discrete(*args)
    return None

def get_header(features, window_size):
    """Build the comma-separated CSV header for windowed feature data.

    Every feature name is suffixed with the window position
    (``name_0``, ..., ``name_<window_size-1>``), grouped by position, and the
    final column is ``label``.

    NOTE(review): this repeats an identical definition that appears earlier in
    this file; being later, this one is the binding that wins at import time.
    """
    header = ""
    for step in range(window_size):
        chunk = ",".join('%s_%d' % (name, step) for name in features)
        # the first non-empty chunk starts the header; later ones are appended
        header = '%s,%s' % (header, chunk) if header else chunk
    return '%s,%s' % (header, 'label')