finished up anomaly detection, with precision/recall @TODO: testing
git stage ../src/utils/ml.py demo.py

Steve L. Nyemba 8 years ago
parent
commit
886a9e1d76
2 changed files with 107 additions and 18 deletions
  1. + 86 - 12
      src/utils/ml.py
  2. + 21 - 6
      test/demo.py

+ 86 - 12
src/utils/ml.py

@@ -2,6 +2,7 @@
 	This file is intended to perform certain machine learning tasks based on numpy
 	We are trying to keep it lean, which is why sklearn is not involved yet
 """
+from __future__ import division
 import numpy as np
 
 class ML:
@@ -15,18 +16,91 @@ class ML:
 	@staticmethod
 	def Extract(lattr,data):
 		return [[row[id] for id in lattr] for row in data]
+"""
+	Implements a multivariate anomaly detection
+	@TODO: determine computationally determine epsilon
+"""
+class AnomalyDetection:
+	def split(self,data,index=-1,threshold=0.7) :
+		N	= len(data)
+		if N < LIMIT:
+			return None
+		
+		end 	= int(N*threshold)
+		train	= data[:end]
+		test	= data[end:]
+		# @TODO: when index >= 0, split the column at that index out into "labels"
+		return {"train":train,"test":test,"labels":[]}
+	def learn(self,data,conf):
+		if 'filter' in conf:
+			_filter = conf['filter']
+			data = ML.Filter(_filter['key'],_filter['value'],data)
+		attr = conf['features']
+		label= conf['label']
+		labels= ML.Extract([label],data)
+		data = ML.Extract(attr,data)
+		
+		r = self.split(data)
+		labels = self.split(labels)
+		if r is None:
+			return None	# not enough rows to split; see LIMIT
+
+		p = self.gParameters(r['train'])
+		test =  self.gPx(p['mean'],p['cov'],r['test'])
+		return self.gPerformance(test,labels['test'])
+
+
+
+	"""
+		This function will compute the probability density function given a particular event/set of events
+		@pre xu.shape[0] == sigma[0] == sigma[1]
+	"""
+	def gPx(self,xu,sigma,data,EPSILON=0.05):
+		n = len(data[0])
+		
+		r = []
+		a  = ((2*np.pi)**(n/2))*np.linalg.det(sigma)**0.5
+		# EPSILON = np.float64(EPSILON)
+		test = np.array(data)
+		for row in test:
+			row = np.array(row)
+			d = np.matrix(row - xu)
+			d.shape = (n,1)
+			b = np.exp((-0.5*np.transpose(d)) * (np.linalg.inv(sigma)*d))
+			px = float(b/a)
+			r.append([px,int(px < EPSILON)])
+		return r
+	"""
+		This function computes performance metrics i.e. precision, recall and f-score
+		for details visit https://en.wikipedia.org/wiki/Precision_and_recall
+
+	"""
+	def gPerformance(self,test,labels) :
+		N = len(test)
+		tp = 0 # true positive
+		fp = 0 # false positive
+		fn = 0 # false negative
+		for i in range(0,N):
+			label = labels[i][0]	# label rows come from ML.Extract as single-element lists
+			tp += 1 if test[i][1] == label and test[i][1] == 1 else 0
+			fp += 1 if test[i][1] != label and test[i][1] == 1 else 0
+			fn += 1 if test[i][1] != label and test[i][1] == 0 else 0
+		precision = tp / (tp + fp)
+		recall	= tp / (tp + fn)
+		fscore 	= (2 * precision * recall)/ (precision + recall)
+		return {"precision":precision,"recall":recall,"fscore":fscore}
+
+	"""
+		This function returns gaussian parameters i.e. means and covariance
+		The information will be used to compute probabilities
+	"""
+	def gParameters(self,train) :
 
-	def init(self,lattr,data):
-		self.lattr = attr
-		self.data = data
-		self.X = []
-		self.Xmeans = []
-		for id in lattr:
-			xvalues = [item for item in self.data[id]]
-			self.Xmeans.append(np.mean(xvalues))
-			self.X.append(xvalues)
-		slef.Xcov = np.cov(self.X)
+		n = len(train[0])
+		m = np.transpose(np.array(train))
+		
+		u = np.array([ np.mean(m[i][:]) for i in range(0,n)])		
+		r = np.array([ np.sqrt(np.var(m[i,:])) for i in range(0,n)])
 		#
-		# Let's get the covariance matrix here ...
+		#-- Normalize the matrix, then compute the covariance matrix
 		#
-	
+		m = np.array([ (m[i,:] - u[i])/r[i] for i in range(0,n)])
+		sigma = np.cov(m)
+		return {"cov":sigma,"mean":u}

+ 21 - 6
test/demo.py

@@ -1,8 +1,10 @@
+from __future__ import division
 import numpy as np
-m = [[0.0, 4.5], [0.0, 4.5], [11.6, 4.4], [12.2, 4.3], [1.4, 3.9], [1.4, 3.9], [2.5, 3.8], [0.1, 3.8], [0.5, 5.1], [0.7, 5.2], [0.7, 5.1], [0.0, 4.6], [0.0, 4.6]]
-m = np.transpose(np.array(m))
-xu_ = np.mean(m[1,:])
-yu_ = np.mean(m[0,:])
+from utils.ml import AnomalyDetection
+mo = [[0.0, 4.5], [0.0, 4.5], [11.6, 4.4], [12.2, 4.3], [1.4, 3.9], [1.4, 3.9], [2.5, 3.8], [0.1, 3.8], [0.5, 5.1], [0.7, 5.2], [0.7, 5.1], [0.0, 4.6], [0.0, 4.6]]
+m = np.transpose(np.array(mo))
+xu_ = np.mean(m[0,:])
+yu_ = np.mean(m[1,:])
 
 xr_ = np.sqrt(np.var(m[0,:]))
 yr_ = np.sqrt(np.var(m[1,:]))
@@ -10,21 +12,34 @@ yr_ = np.sqrt(np.var(m[1,:]))
 # -- normalizing the matrix before computing covariance
 #
 mn = np.array([list( (m[0,:]-xu_)/xr_),list( (m[1,:]-yu_)/yr_)])
+
 cx = np.cov(mn)
 n = m.shape[0]
-x = np.array([2.4,3.1])
+test=[2.4,3.1]
+x = np.array(test)
 u = np.array([xu_,yu_])
+
 d = np.matrix(x - u)
 d.shape = (n,1)
 a  = (2*(np.pi)**(n/2))*np.linalg.det(cx)**0.5
 b = np.exp((-0.5*np.transpose(d)) * (np.linalg.inv(cx)*d))
+print u.shape
+print cx.shape
 
 from scipy.stats import multivariate_normal
 xo= multivariate_normal.pdf(x,u,cx)
 yo= (b/a)[0,0]
-e= 0.001
+e= np.float64(0.05)
 print [yo,yo < e]
 print [xo,xo < e]
+ml = AnomalyDetection()
+end = int(len(mo)*.7)
+p = ml.gParameters(mo)	# gParameters returns {"mean":...,"cov":...}
+r =  ml.gPx(p['mean'],p['cov'],mo[end:],0.05)	# score the last 30% of the sample
+for i in range(0,len(r)) :
+	print ' *** ', mo[(i+end)],r[i]
+
+
 #for row in np.transpose(m):
 #	print ",".join([str(value) for value in row])
 #-- We are ready to perform anomaly detection ...
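
As a quick sanity check on the precision/recall side of this commit, a small hand-worked example of gPerformance; the prediction/label values are made up, and the label rows follow the single-element-list format produced by ML.Extract.

	from utils.ml import AnomalyDetection

	ml = AnomalyDetection()
	# gPx-style output: [probability, flagged] per row
	test   = [[0.01,1],[0.20,0],[0.03,1],[0.40,0],[0.02,1]]
	labels = [[1],[0],[0],[0],[1]]
	# tp=2, fp=1, fn=0 -> precision=2/3, recall=1, fscore=0.8
	print ml.gPerformance(test,labels)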