AJMR-Python-Baird/Mustique/adcptool/outliers.py

241 lines
7.7 KiB
Python

'''
Created on 25.07.2012
@author: Jakob Steidl
Module to detect outliers (and, for now, also to create the cell matrix).
Outliers are marked in a numpy matrix; each entry is a 3D vector representing the outlier state of each velocity component.
It currently uses the RawProfileData (because it has the 3D components?), but that can be changed via the arguments
of get_valuematrix_from_cellmatrix(). The cell matrix passed in has to be changed accordingly as well, however.
'''
import pickle, time
import numpy as np
import warnings
#import pylab as pl
#import matplotlib.pyplot as plt
#warnings.simplefilter('error')
def get_matrix_size(p):
    ''' determine the necessary size of a matrix that can hold all the cells

    @p must provide a sequence `ensembles`; every ensemble that has a
    `cells` attribute contributes its number of cells.

    returns a tuple (rows, cols):
    rows ... largest number of cells found in any ensemble
             (0 if no ensemble has cells)
    cols ... number of ensembles
    '''
    size_h = len(p.ensembles)  # columns
    size_v = 0  # rows
    # every ensemble has to be inspected, including the last one
    for ensemble in p.ensembles:
        try:
            cells = ensemble.cells
        except AttributeError:
            # ensembles without cells do not contribute to the height
            continue
        size_v = max(size_v, len(cells))
    return (size_v, size_h)
def get_cell_matrix(p):
    ''' collect the cells of all ensembles in a 2D list (aka matrix)
    addressed as grid[row][col]: col is the index of the ensemble,
    row the index of the cell within that ensemble.
    positions without a cell (void ensembles, short ensembles) hold False.
    '''
    (n_rows, n_cols) = get_matrix_size(p)
    # plain python 'matrix', pre-filled with the False placeholder
    grid = [[False] * n_cols for _ in range(n_rows)]
    # walk through the ensembles column by column
    for col in range(n_cols):
        ensemble = p.ensembles[col]
        if ensemble.void:
            continue
        for row, cell in enumerate(ensemble.cells):
            grid[row][col] = cell
    return grid
def get_valuematrix_from_cellmatrix(m, value, outputs=0, nvalues=3):
    ''' extract a designated @value from the matrix @m containing Cell()-objects

    m       2D list (m[row][col]) of cell objects; the placeholder False
            marks positions without a cell
    value   accessor appended to each cell, e.g. '.velocity.v'
    outputs 0 ... return (values, mask)
            1 ... return the values only
            2 ... return the mask only
            any other value returns None
    nvalues number of components stored per cell

    values is a float array of shape (rows, cols, nvalues), zero where no
    cell exists. mask is a bool array of shape (rows, cols): False if no
    value could be extracted, True otherwise.
    '''
    import re
    from operator import attrgetter

    rows = len(m)
    cols = len(m[0]) if rows else 0  # an empty matrix has no first row
    npmat = np.zeros(shape=(rows, cols, nvalues))
    # the np.bool alias was removed from NumPy; the builtin is equivalent
    goodmat = np.zeros(shape=(rows, cols), dtype=bool)

    # A plain dotted attribute path needs no eval() at all.
    getter = None
    if isinstance(value, str) and re.fullmatch(r'(\.[^\W\d]\w*)+', value):
        getter = attrgetter(value[1:])

    for r in range(rows):
        for c in range(cols):
            cell = m[r][c]
            if cell is False:
                continue
            if getter is not None:
                npmat[r, c] = getter(cell)
            else:
                # NOTE(review): eval() executes arbitrary code; @value must
                # never come from untrusted input. Kept only for accessors
                # that are not plain attribute paths.
                npmat[r, c] = eval('m[{}][{}]{}'.format(r, c, value))
            goodmat[r, c] = True

    if outputs == 0:
        return npmat, goodmat
    elif outputs == 1:
        return npmat
    elif outputs == 2:
        return goodmat
    return None
def get_relative_deviation_simple(vmat, vgoodmat, cfg):
    ''' return a matrix with the relative deviation of every good cell,
    i.e. |mean - value| / stdev, where mean and stdev are taken over the
    good cells in the neighbourhood of that cell (the cell itself included)

    vmat     values, shape (rows, columns, >=3); only the first three
             components are used. A 1-D @vgoodmat with a (rows, >=3) @vmat
             is treated as a single column.
    vgoodmat mask, True where @vmat holds a valid value
    cfg vars: (its a dictionary)
    radius_h Int horizontal radius of where we do the averaging
    radius_v Int vertical radius of where we do the averaging (default 0)

    returns a float array of shape (rows, columns, 3). Cells that are not
    good, or that have no other good cell within the radius, stay 0; the
    latter also raise a RuntimeWarning. A zero standard deviation yields
    nan/inf, as produced by np.divide.
    '''
    values = np.asarray(vmat, dtype=float)
    good = np.asarray(vgoodmat, dtype=bool)
    if len(good.shape) == 2:
        (rows, columns) = good.shape
    else:
        rows, = good.shape
        columns = 1
        # promote the single column to 2D so [r, c] indexing works below
        good = good.reshape(rows, 1)
        values = values.reshape(rows, 1, -1)

    # np.float / np.int were removed from NumPy; the builtins are equivalent
    rdev = np.zeros(shape=(rows, columns, 3), dtype=float)
    radius_v = int(cfg['radius_v']) if 'radius_v' in cfg else 0
    radius_h = int(cfg['radius_h'])

    # iterate through every cell ...
    for r in range(rows):
        # the window is symmetric: [r - radius, r + radius], clipped to the
        # matrix, hence the +1 on the exclusive upper bound
        row_lo = max(r - radius_v, 0)
        row_hi = min(r + radius_v + 1, rows)
        for c in range(columns):
            if not good[r, c]:
                continue
            col_lo = max(c - radius_h, 0)
            col_hi = min(c + radius_h + 1, columns)
            window_good = good[row_lo:row_hi, col_lo:col_hi]
            # (n, 3) array holding only the good neighbours of the window
            neighbours = values[row_lo:row_hi, col_lo:col_hi, :3][window_good]
            if len(neighbours) > 1:
                mean = neighbours.mean(axis=0)
                stdev = neighbours.std(axis=0)
                value = values[r, c, :3]
                # compute the relative deviation (deviation x-times the standard deviation)
                rdev[r, c] = np.divide(np.abs(mean - value), stdev)
            else:
                warnings.warn('cell {}/{}: not enough neighbors to gather statistics. increase search radius.'.format(r, c), RuntimeWarning)
    return rdev
def get_outliers(rd, gm, cfg):
    '''
    compare the relative deviations @rd against the threshold cfg['limit']
    (@gm is accepted but not used)
    meanings for the returned matrix:
    True  ... outlier (deviation above the limit)
    False ... no outlier
    '''
    threshold = cfg['limit']
    is_outlier = rd > threshold
    return is_outlier
def interpolate_outliers(p, cfg):
    '''
    detect outliers in the velocities of @p and return a deep copy of @p
    whose velocities were updated with the interpolated values
    (@p itself is not passed to update_velocities)
    cfg vars: (its a dictionary)
    radius_h int horizontal radius of where we do the averaging
    radius_v int vertical radius of where we do the averaging
    limit float limit above which outliers are recognised as such
    '''
    # outlier detection
    cells = get_cell_matrix(p)
    # velocities plus the corresponding mask (True=value present)
    velocities, has_value = get_valuematrix_from_cellmatrix(cells, '.velocity.v')
    deviation = get_relative_deviation_simple(velocities, has_value, cfg)
    is_outlier = get_outliers(deviation, has_value, cfg)
    # interpolation, fed with the inverted outlier matrix
    from interpolation import interpolate
    keep = ~is_outlier
    interpolated = interpolate(velocities, keep)
    # write the result into a copy of the profile
    from copy import deepcopy
    updated = deepcopy(p)
    updated.update_velocities(interpolated, has_value)
    return updated
def plot_matrix(m):
    ''' display matrix @m as an image (interpolation 'none') using matplotlib '''
    from matplotlib import pyplot
    pyplot.imshow(m, interpolation='none')
    pyplot.show()
#if __name__ == '__main__':
# profile = pickle.load(open('../testfiles/demodata.pickle','rb'))
#
#
## cfg = dict(radius=5, limit=2)
# cfg = dict(radius_h=15, radius_v=0, limit=2)
#
# t1 = time.time()
# cm = get_cell_matrix(profile)
# vm, vgm = get_valuematrix_from_cellmatrix(cm, '.velocity_comp.v')
#
# rdm = get_relative_deviation_simple(vm, vgm, cfg)
#
# dt1 = time.time() - t1
# t2 = time.time()
# olm = get_outliers(rdm, vgm, cfg)
# dt2 = time.time() - t2
#
# print(type(olm))
# print dt1, dt2
# plot_matrix(olm[:,:,0]-1*vgm)
#
# pickle.dump((cm, vm, vgm, rdm, olm), open('../testfiles/outliers.pickle','wb'))