'''
Created on 25.07.2012

@author: Jakob Steidl

Module to detect outliers (and, for now, also to build a cell matrix).

Outliers are marked in a numpy matrix, each 3D vector representing the
outlier state of each velocity component.

It currently uses the raw profile data (because that carries the 3D
components), but this can be changed through the arguments of
get_valuematrix_from_cellmatrix(). The matching cell matrix has to be
supplied as well in that case.
'''

import operator
import pickle
import re
import time
import warnings
from copy import deepcopy

import numpy as np

#import pylab as pl
#import matplotlib.pyplot as plt
#warnings.simplefilter('error')

# A plain dotted attribute path such as '.velocity.v'.
_ATTRIBUTE_PATH = re.compile(r'(?:\.[A-Za-z_]\w*)+\Z')


def get_matrix_size(p):
    '''
    Determine the size of a matrix that can hold all the cells of @p.

    @p must expose an ``ensembles`` sequence. Ensembles without a ``cells``
    attribute are ignored when looking for the tallest column.

    Returns a tuple (rows, cols): rows is the largest number of cells found
    in any ensemble (0 if no ensemble has cells), cols is the number of
    ensembles.
    '''
    ensembles = p.ensembles
    cell_counts = []
    # Every ensemble is inspected, including the last one; skipping it would
    # under-size the matrix whenever the last ensemble is the tallest.
    for ensemble in ensembles:
        try:
            cell_counts.append(len(ensemble.cells))
        except AttributeError:
            # ensemble carries no cells at all
            continue
    rows = max(cell_counts) if cell_counts else 0
    return (rows, len(ensembles))


def get_cell_matrix(p):
    '''
    Extract the cells of @p and store them in a 2D list (aka matrix).

    The result is indexed as cmat[row][col], where col is the ensemble index
    and row is the cell index inside that ensemble. Positions without a cell
    (void ensembles, short ensembles) hold False.
    '''
    (rows, cols) = get_matrix_size(p)

    # create an empty standard python 'matrix'; every row is its own list
    cmat = [[False] * cols for _ in range(rows)]

    # iterate through every cell
    for col, ensemble in enumerate(p.ensembles):
        if ensemble.void:
            continue
        # ensembles without cells are tolerated, as in get_matrix_size()
        for row, cell in enumerate(getattr(ensemble, 'cells', ())):
            cmat[row][col] = cell
    return cmat


def get_valuematrix_from_cellmatrix(m, value, outputs=0, nvalues=3):
    '''
    Extract a designated @value from the matrix @m containing cell objects.

    @m        2D list as produced by get_cell_matrix(); False marks an
              empty position
    @value    accessor appended to each cell, e.g. '.velocity.v'
    @outputs  0: return (values, mask), 1: return values only,
              2: return mask only; any other number returns None
    @nvalues  number of components stored per cell

    The value matrix has shape (rows, cols, nvalues). The mask is a boolean
    matrix of shape (rows, cols): False if no value could be extracted for
    that position, True otherwise.
    '''
    rows = len(m)
    cols = len(m[0]) if rows else 0
    npmat = np.zeros(shape=(rows, cols, nvalues))
    goodmat = np.zeros(shape=(rows, cols), dtype=bool)

    # Plain dotted paths are resolved once with attrgetter instead of
    # evaluating a freshly formatted expression for every single cell.
    getter = None
    if _ATTRIBUTE_PATH.match(value):
        getter = operator.attrgetter(value[1:])

    for r in range(rows):
        for c in range(cols):
            cell = m[r][c]
            # equality (not identity) is kept from the original placeholder
            # test, so anything comparing equal to False counts as empty
            if False == cell:
                continue
            if getter is not None:
                npmat[r, c] = getter(cell)
            else:
                # SECURITY: fallback for accessors that are not plain
                # attribute paths. eval() executes arbitrary code, so @value
                # must never come from an untrusted source.
                npmat[r, c] = eval('m[{}][{}]{}'.format(r, c, value))
            goodmat[r, c] = True

    if outputs == 0:
        return npmat, goodmat
    elif outputs == 1:
        return npmat
    elif outputs == 2:
        return goodmat
    return None


def get_relative_deviation_simple(vmat, vgoodmat, cfg):
    '''
    Return, for every good cell, how many local standard deviations its value
    lies away from the local mean (per component).

    @vmat      value matrix, shape (rows, cols, 3); (rows, 3) is accepted
               together with a 1D mask
    @vgoodmat  mask, shape (rows, cols) or (rows,); truthy marks a good cell
    @cfg       dictionary with the entries
        radius_h    Int   horizontal radius of where we do the averaging
        radius_v    Int   vertical radius of where we do the averaging
                          (optional, defaults to 0)

    The neighbourhood of a cell spans rows r-radius_v .. r+radius_v and
    columns c-radius_h .. c+radius_h (both ends included, clipped at the
    matrix border) and only good cells contribute.

    Returns a float matrix of shape (rows, cols, 3). Entries stay 0 for bad
    cells, for cells with fewer than two good neighbours (a RuntimeWarning is
    issued for those) and for components whose local standard deviation is 0.
    '''
    good = np.asarray(vgoodmat).astype(bool)
    values = np.asarray(vmat, dtype=float)
    if good.ndim == 1:
        # treat a 1D profile as a matrix with a single column
        good = good.reshape(-1, 1)
        values = values.reshape(good.shape[0], 1, -1)
    (rows, columns) = good.shape
    values = values[..., :3]

    rdev = np.zeros(shape=(rows, columns, 3), dtype=float)

    radius_v = int(cfg['radius_v']) if 'radius_v' in cfg else 0
    radius_h = int(cfg['radius_h'])

    # iterate through every cell ...
    for r in range(rows):
        # the '+ 1' makes the window include the far edge as well
        row_lo = max(r - radius_v, 0)
        row_hi = min(r + radius_v + 1, rows)
        for c in range(columns):
            if not good[r, c]:
                continue
            col_lo = max(c - radius_h, 0)
            col_hi = min(c + radius_h + 1, columns)

            # all good values inside the window, shape (count, 3)
            window_mask = good[row_lo:row_hi, col_lo:col_hi]
            neighbours = values[row_lo:row_hi, col_lo:col_hi][window_mask]

            if len(neighbours) < 2:
                warnings.warn('cell {}/{}: not enough neighbors to gather statistics. increase search radius.'.format(r, c), RuntimeWarning)
                continue

            mean = neighbours.mean(axis=0)
            stdev = neighbours.std(axis=0)
            # compute the relative deviation (deviation x-times the standard
            # deviation); with stdev == 0 all neighbours are identical, so
            # the deviation is left at 0 instead of dividing by zero
            np.divide(np.abs(mean - values[r, c]), stdev,
                      out=rdev[r, c], where=stdev > 0)
    return rdev


def get_outliers(rd, gm, cfg):
    '''
    Flag every entry of the relative deviation matrix @rd that exceeds
    cfg['limit'].

    @gm is accepted for interface symmetry but is not used.

    meanings for the returned matrix:
        1 ... outlier
        0 ... no outlier
    '''
    return rd > cfg['limit']


def interpolate_outliers(p, cfg):
    '''
    Remove outliers and return a deep copy of @p whose velocities have the
    outliers removed and interpolated.

    cfg vars: (its a dictionary)
        radius_h    int     horizontal radius of where we do the averaging
        radius_v    int     vertical radius of where we do the averaging
        limit       float   limit above which outliers are recognised as such
    '''
    # outliers
    cm = get_cell_matrix(p)  # cell matrix
    # value matrix and corresponding mask (1=good, 0=bad)
    vm, vgm = get_valuematrix_from_cellmatrix(cm, '.velocity.v')
    rdm = get_relative_deviation_simple(vm, vgm, cfg)  # relative deviation matrix
    olm = get_outliers(rdm, vgm, cfg)  # outlier matrix (boolean, 1=outlier)

    # interpolation (project-local module, imported lazily)
    from interpolation import interpolate
    ivm = interpolate(vm, ~olm)

    # update processed profile
    p1 = deepcopy(p)
    p1.update_velocities(ivm, vgm)
    return p1


def plot_matrix(m):
    ''' Show the matrix @m as an image, without interpolation between cells. '''
    import matplotlib.pyplot as plt
    plt.imshow(m, interpolation='none')
    plt.show()


#if __name__ == '__main__':
#    profile = pickle.load(open('../testfiles/demodata.pickle','rb'))
#
#    ## cfg = dict(radius=5, limit=2)
#    cfg = dict(radius_h=15, radius_v=0, limit=2)
#
#    t1 = time.time()
#    cm = get_cell_matrix(profile)
#    vm, vgm = get_valuematrix_from_cellmatrix(cm, '.velocity_comp.v')
#
#    rdm = get_relative_deviation_simple(vm, vgm, cfg)
#
#    dt1 = time.time() - t1
#    t2 = time.time()
#    olm = get_outliers(rdm, vgm, cfg)
#    dt2 = time.time() - t2
#
#    print(type(olm))
#    print dt1, dt2
#    plot_matrix(olm[:,:,0]-1*vgm)
#
#    pickle.dump((cm, vm, vgm, rdm, olm), open('../testfiles/outliers.pickle','wb'))