|
@@ -2,6 +2,7 @@
|
|
|
This file is intended to perform certain machine learning tasks based on numpy
|
|
|
We are trying to keep it lean, which is why sklearn is not involved yet
|
|
|
"""
|
|
|
+from __future__ import division
|
|
|
import numpy as np
|
|
|
|
|
|
class ML:
|
|
@@ -15,18 +16,91 @@ class ML:
|
|
|
@staticmethod
def Extract(lattr, data):
    """
        Project every row of ``data`` onto the attributes listed in ``lattr``.

        @param lattr    keys/indexes to pull out of each row, in output order
        @param data     iterable of rows supporting ``row[key]`` (lists, dicts, ...)
        @return         a list with one list per row holding the selected values;
                        a single attribute still yields one-element rows
    """
    # `key` rather than `id`, so the builtin id() is not shadowed.
    return [[row[key] for key in lattr] for row in data]
|
|
|
+"""
|
|
|
+ Implements a multivariate anomaly detection
|
|
|
+ @TODO: determine epsilon computationally
|
|
|
+"""
|
|
|
class AnomalyDetection:
    """
        Multivariate gaussian anomaly detection.

        A gaussian (mean vector and covariance matrix) is estimated from the
        training part of the data. A test observation is flagged as an anomaly
        when its probability density is below a threshold (EPSILON).
    """

    def split(self, data, index=-1, threshold=0.7):
        """
            Split a dataset into a leading training part and a trailing test part.

            @param data         sequence of observations (must support len() and slicing)
            @param index        kept for backward compatibility; it no longer changes the result
            @param threshold    fraction of the observations that goes to the training set
            @return             {"train":..., "test":..., "labels":[]} or None when the
                                dataset holds fewer than LIMIT observations
        """
        N = len(data)
        # LIMIT is a module-level constant defined outside of this class.
        if N < LIMIT:
            return None

        end = int(N * threshold)
        # The split used to be returned only when index > 0, so the default
        # call (index=-1) silently produced None and broke learn().
        return {"train": data[:end], "test": data[end:], "labels": []}

    def learn(self, data, conf):
        """
            Fit the gaussian on the training part of the data and score it on the rest.

            @param data     rows that ML.Filter / ML.Extract can process
            @param conf     dictionary with 'features' (attributes used as features),
                            'label' (attribute holding the expected 0/1 flag) and an
                            optional 'filter' with 'key' and 'value' entries
            @return         the metrics computed by gPerformance, or None when there
                            are not enough observations to split
        """
        if 'filter' in conf:
            criteria = conf['filter']
            data = ML.Filter(criteria['key'], criteria['value'], data)
        attr = conf['features']
        label = conf['label']
        # ML.Extract returns one single-element list per row; unwrap it so a
        # label can be compared with the 0/1 prediction in gPerformance.
        labels = [row[0] for row in ML.Extract([label], data)]
        data = ML.Extract(attr, data)

        r = self.split(data)
        labels = self.split(labels)
        if r is None or labels is None:
            return None

        p = self.gParameters(r['train'])
        test = self.gPx(p['mean'], p['cov'], r['test'])
        return self.gPerformance(test, labels['test'])

    def gPx(self, xu, sigma, data, EPSILON=0.05):
        """
            Compute the multivariate gaussian probability density of each observation.

            @pre len(xu) == sigma.shape[0] == sigma.shape[1]
            @param xu       mean vector
            @param sigma    covariance matrix (must be invertible)
            @param data     observations, one row per event
            @param EPSILON  density below which an observation is flagged as an anomaly
            @return         a list of [px, flag] pairs, flag being 1 when px < EPSILON
        """
        if len(data) == 0:
            return []

        n = len(data[0])
        xu = np.asarray(xu, dtype=float)
        # np.cov yields a 0-d array for a single feature; det/inv need 2-d input.
        sigma = np.atleast_2d(np.asarray(sigma, dtype=float))

        # Normalisation constant (2*pi)^(n/2) * |sigma|^(1/2). The base of the
        # power is 2*pi, not pi alone.
        a = ((2 * np.pi) ** (n / 2.0)) * np.linalg.det(sigma) ** 0.5
        # The inverse does not depend on the row, so compute it once.
        inverse = np.linalg.inv(sigma)

        r = []
        for row in np.asarray(data, dtype=float):
            d = row - xu
            b = np.exp(-0.5 * np.dot(d, np.dot(inverse, d)))
            px = float(b / a)
            r.append([px, int(px < EPSILON)])
        return r

    def gPerformance(self, test, labels):
        """
            Compute performance metrics i.e precision, recall and f-score.
            For details visit https://en.wikipedia.org/wiki/Precision_and_recall

            @param test     [px, flag] pairs as returned by gPx
            @param labels   expected 0/1 flags, one per entry of test
            @return         {"precision":..., "recall":..., "fscore":...}; a metric
                            whose denominator is zero is reported as 0.0
        """
        tp = 0  # true positive
        fp = 0  # false positive
        fn = 0  # false negative
        for i, row in enumerate(test):
            predicted = row[1]
            expected = labels[i]
            if predicted == 1:
                if predicted == expected:
                    tp += 1
                else:
                    fp += 1
            elif predicted == 0 and predicted != expected:
                fn += 1

        # float() keeps true division even without the __future__ import.
        precision = tp / float(tp + fp) if (tp + fp) else 0.0
        recall = tp / float(tp + fn) if (tp + fn) else 0.0
        if precision + recall:
            fscore = (2 * precision * recall) / (precision + recall)
        else:
            fscore = 0.0
        return {"precision": precision, "recall": recall, "fscore": fscore}

    def gParameters(self, train):
        """
            Return the gaussian parameters i.e means and covariance.
            The information will be used to compute probabilities.

            @param train    training observations, one row per event
            @return         {"cov": covariance matrix, "mean": vector of feature means}

            NOTE(review): the covariance is computed on the standardised features
            while the returned mean is the mean of the raw features, and gPx
            subtracts that raw mean from raw observations -- confirm this is intended.
            A feature with zero variance yields a division by zero here.
        """
        # One row per feature, one column per observation.
        m = np.transpose(np.asarray(train, dtype=float))

        u = m.mean(axis=1)
        r = m.std(axis=1)
        #
        #-- Normalizing the matrix then we will compute covariance matrix
        #
        m = (m - u[:, np.newaxis]) / r[:, np.newaxis]
        sigma = np.cov(m)
        return {"cov": sigma, "mean": u}
|