-"""
|
|
|
- This file is intended to perfom certain machine learning tasks based on numpy
|
|
|
- We are trying to keep it lean that's why no sklearn involved yet
|
|
|
-
|
|
|
- @TODO:
|
|
|
- Create factory method for the learners implemented here
|
|
|
- Improve preconditions (size of the dataset, labels)
|
|
|
-"""
|
|
|
-from __future__ import division
|
|
|
-import numpy as np
|
|
|
-
|
|
|
class ML:
    @staticmethod
    def Filter(attr, value, data):
        #
        # @TODO: Make sure this approach works across all transport classes.
        # We may have a potential issue with how the data is stored ... it may not scale.
        #
        value = ML.CleanupName(value)
        # return [item[0] for item in data if item and attr in item[0] and item[0][attr] == value]
        # return [[item for item in row if item[attr] == value][0] for row in data]
        #
        # We make the filtering more resilient: if an item doesn't have the
        # attribute we skip it instead of throwing an exception. This is why
        # the loops are expanded ... fully expressive but resilient.
        #
        r = []
        for row in data:
            if isinstance(row, list):
                for item in row:
                    if attr in item and item[attr] == value:
                        r.append(item)
            else:
                #
                # We are dealing with a vector of objects
                #
                if attr in row and row[attr] == value:
                    r.append(row)
        return r
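    # A minimal usage sketch (the data below is invented for illustration):
    # rows can be flat dicts or nested lists of dicts, and the filter value is
    # passed through CleanupName first, so '$' and '.+' are stripped from it.
    #
    #   data = [{"sym": "IBM", "price": 10}, [{"sym": "MSFT", "price": 20}]]
    #   ML.Filter("sym", "$IBM", data)  # -> [{"sym": "IBM", "price": 10}]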
    @staticmethod
    def Extract(lattr, data):
        if isinstance(lattr, str):
            lattr = [lattr]
        r = [[row[key] for key in lattr] for row in data]
        if len(lattr) == 1:
            return [x[0] for x in r]
        else:
            return r
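    # Quick sketch: Extract flattens the result when a single attribute is
    # requested (illustrative values):
    #
    #   rows = [{"a": 1, "b": 2}, {"a": 3, "b": 4}]
    #   ML.Extract(["a", "b"], rows)  # -> [[1, 2], [3, 4]]
    #   ML.Extract("a", rows)         # -> [1, 3]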
    @staticmethod
    def CleanupName(value):
        return value.replace('$', '').replace('.+', '')
    @staticmethod
    def distribution(xo, lock, scale=False):
        d = []
        m = {}
        if scale:
            xu = np.mean(xo)
            sd = np.sqrt(np.var(xo))
        for xi in xo:
            value = round(xi, 2)
            if scale:
                value = round((value - xu) / sd, 2)
            key = str(value)
            lock.acquire()
            if key in m:
                index = m[key]
                d[index][1] += 1
            else:
                m[key] = len(d)
                d.append([value, 1])
            lock.release()
        del m
        return d
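    # Usage sketch (assumes a threading.Lock, since the method only relies on
    # acquire()/release(); values are illustrative):
    #
    #   import threading
    #   lock = threading.Lock()
    #   ML.distribution([1.0, 1.0, 2.5], lock)  # -> [[1.0, 2], [2.5, 1]]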

"""
    Implements a multivariate anomaly detector
    @TODO: computationally determine epsilon
"""
class AnomalyDetection:
    def __init__(self):
        pass
    def split(self, data, index=-1, threshold=0.65):
        N = len(data)
        # if N < LIMIT:
        #     return None
        end = int(N * threshold)
        train = data[:end]
        test = data[end:]
        return {"train": train, "test": test}

- """
|
|
|
-
|
|
|
- @param key field name by which the data will be filtered
|
|
|
- @param value field value for the filter
|
|
|
- @param features features to be used in the analysis
|
|
|
- @param labels used to assess performance
|
|
|
- @TODO: Map/Reduce does a good job at filtering
|
|
|
- """
|
|
|
    def learn(self, data, key, value, features, label):
        if len(data) < 10:
            return None
        xo = ML.Filter(key, value, data)
        if len(xo) < 10:
            return None
        yo = ML.Extract([label['name']], xo)
        xo = ML.Extract(features, xo)
        yo = self.getLabel(yo, label)
        #
        # @TODO: Ensure this can be fine-tuned; training size matters for
        # learning, and it's not obvious to define it upfront.
        #
        xo = self.split(xo)
        yo = self.split(yo)
        p = self.gParameters(xo['train'])
        has_cov = np.linalg.det(p['cov']) if p else False  # -- making sure the matrix is invertible
        if xo['train'] and has_cov:
            E = 0.001
            ACCEPTABLE_FSCORE = 0.6
            #
            # We need to find an appropriate epsilon for the predictions,
            # i.e. one that yields an f-score in [0.5, 1).
            #
            __operf__ = None
            perf = None
            for i in range(0, 10):
                Epsilon = E + (2 * E * i)
                #
                # At this point we've got enough data for the parameters;
                # we try to fine-tune epsilon for better results.
                #
                px = self.gPx(p['mean'], p['cov'], xo['test'], Epsilon)
                __operf__ = self.gPerformance(px, yo['test'])
                print(value, __operf__)
                if __operf__['fscore'] == 1:
                    continue
                if perf is None:
                    perf = __operf__
                    perf['epsilon'] = Epsilon
                elif perf['fscore'] < __operf__['fscore'] and __operf__['fscore'] > ACCEPTABLE_FSCORE:
                    perf = __operf__
                    perf['epsilon'] = Epsilon
            #
            # At this point we assume the sweep produced an acceptable performance.
            # The understanding is that error drives performance, which is why we reject fscore == 1.
            #
            if perf and perf['fscore'] > ACCEPTABLE_FSCORE:
                return {"label": value, "parameters": p, "performance": perf}
            else:
                return None
        return None
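    # A hedged usage sketch; the field names and label configuration below are
    # invented for illustration. The label field is expected to hold an
    # iterable of tags (see getLabel), and "1" lists the tags that mark the
    # positive class:
    #
    #   ad = AnomalyDetection()
    #   model = ad.learn(rows, key="sym", value="IBM",
    #                    features=["price", "volume"],
    #                    label={"name": "events", "1": ["crash"]})
    #   # -> {"label": "IBM", "parameters": {...}, "performance": {...}} or None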
- """
|
|
|
- This function determines if the preconditions for learning are met
|
|
|
- For that parameters are passed to the function
|
|
|
- p
|
|
|
- """
|
|
|
- def canLearn(self,p) :
|
|
|
- pass
|
|
|
- def getLabel(self,yo,label_conf):
|
|
|
- return [ int(len(set(item) & set(label_conf["1"]))>0) for item in yo ]
|
|
|
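    # getLabel marks a row as 1 when its label field shares at least one tag
    # with label_conf["1"] (illustrative values):
    #
    #   yo = [["crash", "restart"], ["ok"]]
    #   AnomalyDetection().getLabel(yo, {"1": ["crash"]})  # -> [1, 0]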

    """
        This function computes the multivariate Gaussian probability density
        for each observation; the return value is a list of [px, is_anomaly] pairs.
        @pre len(xu) == sigma.shape[0] == sigma.shape[1]
    """
    def gPx(self, xu, sigma, data, EPSILON=0.01):
        n = len(data[0])
        r = []
        #
        # Density of a multivariate Gaussian:
        #   p(x) = exp(-0.5 * (x - xu)' inv(sigma) (x - xu)) / ((2*pi)**(n/2) * det(sigma)**0.5)
        #
        sigma = np.array(sigma)
        sigma_inv = np.linalg.inv(sigma)
        a = (2 * np.pi) ** (n / 2) * np.linalg.det(sigma) ** 0.5
        test = np.array(data)
        for row in test:
            d = (np.array(row) - xu).reshape(n, 1)
            exponent = np.dot(np.dot(np.transpose(d), sigma_inv), d).item()
            px = float(np.exp(-0.5 * exponent) / a)
            r.append([px, int(px < EPSILON)])
        return r
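    # Sanity sketch: with mean [0, 0], identity covariance and observation
    # [0, 0], the density is 1/(2*pi) ~ 0.159, above the default EPSILON,
    # so the point is flagged as normal:
    #
    #   AnomalyDetection().gPx([0, 0], [[1, 0], [0, 1]], [[0, 0]])
    #   # -> [[0.1591..., 0]]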
- """
|
|
|
- This function uses stored learnt information to predict on raw data
|
|
|
- In this case it will determin if we have an anomaly or not
|
|
|
- @param xo raw observations (matrix)
|
|
|
- @param info stored information about this
|
|
|
- """
|
|
|
    def predict(self, xo, info):
        xo = ML.Extract(info['features'], xo)
        if not xo:
            return None
        sigma = info['parameters']['cov']
        xu = info['parameters']['mean']
        epsilon = info['performance']['epsilon']
        return self.gPx(xu, sigma, xo, epsilon)
- """
|
|
|
- This function computes performance metrics i.e precision, recall and f-score
|
|
|
- for details visit https://en.wikipedia.org/wiki/Precision_and_recall
|
|
|
-
|
|
|
- """
|
|
|
    def gPerformance(self, test, labels):
        N = len(test)
        tp = 0  # true positives
        fp = 0  # false positives
        fn = 0  # false negatives
        tn = 0  # true negatives
        for i in range(0, N):
            tp += 1 if (test[i][1] == labels[i] and test[i][1] == 1) else 0
            fp += 1 if (test[i][1] != labels[i] and test[i][1] == 1) else 0
            fn += 1 if (test[i][1] != labels[i] and test[i][1] == 0) else 0
            tn += 1 if (test[i][1] == labels[i] and test[i][1] == 0) else 0
        precision = tp / ((tp + fp) if tp + fp > 0 else 1)
        recall = tp / ((tp + fn) if tp + fn > 0 else 1)
        fscore = (2 * precision * recall) / ((precision + recall) if (precision + recall) > 0 else 1)
        return {"precision": precision, "recall": recall, "fscore": fscore}
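    # Worked example (illustrative): predictions [1, 1, 0, 0] against labels
    # [1, 0, 0, 0] give tp=1, fp=1, fn=0, so precision=0.5, recall=1.0 and
    # fscore = 2*0.5*1.0 / (0.5+1.0) ~ 0.667:
    #
    #   AnomalyDetection().gPerformance(
    #       [[0.1, 1], [0.2, 1], [0.9, 0], [0.8, 0]], [1, 0, 0, 0])
    #   # -> {"precision": 0.5, "recall": 1.0, "fscore": 0.666...}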

    """
        This function returns the Gaussian parameters, i.e. means and covariance;
        the information will be used to compute probabilities.
    """
    def gParameters(self, train):
        n = len(train[0])
        m = np.transpose(np.array(train))
        u = np.array([np.mean(m[i, :]) for i in range(0, n)])
        if np.sum(u) == 0:
            return None
        r = np.array([np.sqrt(np.var(m[i, :])) for i in range(0, n)])
        #
        # Before we normalize the data we must ensure there is some level of
        # movement; a feature with zero variance suggests we may not have
        # enough information to do anything.
        #
        if 0 in r:
            return None
        #
        # -- Normalize the matrix, then compute the covariance matrix
        #
        m = np.array([(m[i, :] - u[i]) / r[i] for i in range(0, n)])
        sigma = np.cov(m)
        sigma = [list(row) for row in sigma]
        return {"cov": sigma, "mean": list(u)}
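    # Quick sketch (illustrative values): three observations of two features.
    #
    #   AnomalyDetection().gParameters([[1.0, 10.0], [2.0, 12.0], [3.0, 11.0]])
    #   # -> {"cov": [[...], [...]], "mean": [2.0, 11.0]}
    #   # Returns None when all means are 0 or any feature has zero variance.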

class AnalyzeAnomaly(AnomalyDetection):
    def __init__(self):
        AnomalyDetection.__init__(self)
    """
        This analysis function includes a predicted status, because an anomaly can either be:
        - downtime, i.e. end of day
        - a spike, and thus a potential imminent crash
        @param xo   matrix of variables
        @param info information about what was learnt
    """
    def predict(self, xo, info):
        x = xo[len(xo) - 1]
        r = AnomalyDetection.predict(self, [x], info)
        #
        # In order to determine what kind of anomaly we have, we compute the
        # slope (idle or crash); the slope is computed as the covariance over
        # the variance of the features.
        #
        if r is not None:
            N = len(info['features'])
            xy = ML.Extract(info['features'], xo)
            xy = np.array(xy)
            vxy = np.array([np.var(xy[:, i]) for i in range(0, N)])
            cxy = np.array(info['parameters']['cov'])
            # cxy = np.cov(np.transpose(xy))
            if np.sum(vxy) == 0:
                vxy = cxy
            alpha = cxy / vxy
            r = {"anomaly": r[0][1], "slope": list(alpha[:, 0])}
        return r
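# A hedged end-to-end sketch, reusing the hypothetical names from the learn()
# example above; 'model' is the dict returned by learn(), augmented with the
# feature list that predict() expects under info['features']:
#
#   analyzer = AnalyzeAnomaly()
#   model['features'] = ["price", "volume"]
#   analyzer.predict(recent_rows, model)
#   # -> {"anomaly": 0 or 1, "slope": [...]} for the most recent observation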
class Regression:
    parameters = {}

    @staticmethod
    def predict(xo):
        pass

    def __init__(self, config):
        pass