Browse code

finished up anomaly detection, with precision/recall @TODO: testing
git stage ../src/utils/ml.py demo.py

Steve L. Nyemba 8 years ago
parent
commit
886a9e1d76
2 changed files with 107 additions and 18 deletions
  1. src/utils/ml.py  +86 -12
  2. test/demo.py  +21 -6

+ 86 - 12
src/utils/ml.py

@@ -2,6 +2,7 @@
 	This file is intended to perform certain machine learning tasks based on numpy
 	We are trying to keep it lean; that's why no sklearn is involved yet
 """
+from __future__ import division
 import numpy as np
 
 class ML:
@@ -15,18 +16,91 @@ class ML:
 	@staticmethod
 	def Extract(lattr,data):
 		return [[row[id] for id in lattr] for row in data]
+"""
+	Implements a multivariate anomaly detection
+	@TODO: determine computationally determine epsilon
+"""
+class AnomalyDetection:
+	LIMIT = 10	# minimum number of records required before a split makes sense
+	def split(self,data,index=-1,threshold=0.7) :
+		N	= len(data)
+		if N < self.LIMIT:
+			return None
+		
+		end 	= int(N*threshold)
+		train	= data[:end]
+		test	= data[end:]
+		# @TODO: use index to carve the label column out of each row
+		return {"train":train,"test":test,"labels":[]}
+	def learn(self,data,conf):
+		if 'filter' in conf:
+			_filter = conf['filter']
+			data = ML.Filter(_filter['key'],_filter['value'],data)
+		attr = conf['features']
+		label= conf['label']
+		# flatten the label column so gPerformance can compare scalar labels
+		labels= [row[0] for row in ML.Extract([label],data)]
+		data = ML.Extract(attr,data)
+		
+		r = self.split(data)
+		labels = self.split(labels)
+
+		p = self.gParameters(r['train'])
+		test =  self.gPx(p['mean'],p['cov'],r['test'])
+		return self.gPerformance(test,labels['test'])
+
+	"""
+		This function computes the probability density for a set of observations/events
+		@pre xu.shape[0] == sigma.shape[0] == sigma.shape[1]
+	"""
+	def gPx(self,xu,sigma,data,EPSILON=0.05):
+		n = len(data[0])
+		
+		r = []
+		# normalizing constant (2*pi)^(n/2) * |sigma|^(1/2)
+		a  = ((2*np.pi)**(n/2))*np.linalg.det(sigma)**0.5
+		test = np.array(data)
+		for row in test:
+			row = np.array(row)
+			d = np.matrix(row - xu)
+			d.shape = (n,1)
+			b = np.exp((-0.5*np.transpose(d)) * (np.linalg.inv(sigma)*d))
+			px = float(b/a)
+			r.append([px,int(px < EPSILON)])
+		return r
+	"""
+		This function computes performance metrics i.e. precision, recall and f-score
+		For details visit https://en.wikipedia.org/wiki/Precision_and_recall
+	"""
+	def gPerformance(self,test,labels) :
+		N = len(test)
+		tp = 0 # true positive
+		fp = 0 # false positive
+		fn = 0 # false negative
+		for i in range(0,N):
+			tp += 1 if test[i][1] == labels[i] and test[i][1] == 1 else 0
+			fp += 1 if test[i][1] != labels[i] and test[i][1] == 1 else 0
+			fn += 1 if test[i][1] != labels[i] and test[i][1] == 0 else 0
+		precision = tp / (tp + fp) if tp + fp > 0 else 0
+		recall	= tp / (tp + fn) if tp + fn > 0 else 0
+		fscore 	= (2 * precision * recall)/ (precision + recall) if precision + recall > 0 else 0
+		return {"precision":precision,"recall":recall,"fscore":fscore}
+
+	"""
+		This function returns the gaussian parameters i.e. mean and covariance
+		The information will be used to compute probabilities
+	"""
+	def gParameters(self,train) :
 
 
-	def init(self,lattr,data):
-		self.lattr = attr
-		self.data = data
-		self.X = []
-		self.Xmeans = []
-		for id in lattr:
-			xvalues = [item for item in self.data[id]]
-			self.Xmeans.append(np.mean(xvalues))
-			self.X.append(xvalues)
-		slef.Xcov = np.cov(self.X)
+		n = len(train[0])
+		m = np.transpose(np.array(train))
+		
+		u = np.array([ np.mean(m[i,:]) for i in range(0,n)])
+		r = np.array([ np.sqrt(np.var(m[i,:])) for i in range(0,n)])
 	#
-	# Let's get the covariance matrix here ...
+	#-- Normalize the matrix, then compute the covariance matrix
 	#
-	
+		m = np.array([ (m[i,:] - u[i])/r[i] for i in range(0,n)])
+		# covariance of the standardized features (i.e. the correlation matrix)
+		sigma = np.cov(m)
+		return {"cov":sigma,"mean":u}
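
For reference, the quantity gPx evaluates is the standard multivariate gaussian density; in the code, a is the normalizing denominator, b is the exponential factor, and a row is flagged anomalous when the density falls below EPSILON:

p(x) = \frac{1}{(2\pi)^{n/2}\,|\Sigma|^{1/2}} \exp\left(-\frac{1}{2}(x-\mu)^{T}\Sigma^{-1}(x-\mu)\right)

with \mu = xu (the feature means) and \Sigma = sigma (the covariance matrix returned by gParameters).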
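Likewise, gPerformance computes the textbook metrics from the true positive (tp), false positive (fp) and false negative (fn) counts:

\text{precision} = \frac{tp}{tp+fp}, \qquad \text{recall} = \frac{tp}{tp+fn}, \qquad \text{fscore} = \frac{2\,\text{precision}\cdot\text{recall}}{\text{precision}+\text{recall}}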

+ 21 - 6
test/demo.py

@@ -1,8 +1,10 @@
+from __future__ import division
 import numpy as np
-m = [[0.0, 4.5], [0.0, 4.5], [11.6, 4.4], [12.2, 4.3], [1.4, 3.9], [1.4, 3.9], [2.5, 3.8], [0.1, 3.8], [0.5, 5.1], [0.7, 5.2], [0.7, 5.1], [0.0, 4.6], [0.0, 4.6]]
-m = np.transpose(np.array(m))
-xu_ = np.mean(m[1,:])
-yu_ = np.mean(m[0,:])
+from utils.ml import AnomalyDetection
+mo = [[0.0, 4.5], [0.0, 4.5], [11.6, 4.4], [12.2, 4.3], [1.4, 3.9], [1.4, 3.9], [2.5, 3.8], [0.1, 3.8], [0.5, 5.1], [0.7, 5.2], [0.7, 5.1], [0.0, 4.6], [0.0, 4.6]]
+m = np.transpose(np.array(mo))
+xu_ = np.mean(m[0,:])
+yu_ = np.mean(m[1,:])
 
 
 xr_ = np.sqrt(np.var(m[0,:]))
 yr_ = np.sqrt(np.var(m[1,:]))
@@ -10,21 +12,34 @@ yr_ = np.sqrt(np.var(m[1,:]))
 # -- normalizing the matrix before computing covariance
 #
 mn = np.array([list( (m[0,:]-xu_)/xr_),list( (m[1,:]-yu_)/yr_)])
+
 cx = np.cov(mn)
 n = m.shape[0]
-x = np.array([2.4,3.1])
+test=[2.4,3.1]
+x = np.array(test)
 u = np.array([xu_,yu_])
+
 d = np.matrix(x - u)
 d.shape = (n,1)
 a  = ((2*np.pi)**(n/2))*np.linalg.det(cx)**0.5
 b = np.exp((-0.5*np.transpose(d)) * (np.linalg.inv(cx)*d))
+print u.shape
+print cx.shape
 
 from scipy.stats import multivariate_normal
 xo= multivariate_normal.pdf(x,u,cx)
 yo= (b/a)[0,0]
-e= 0.001
+e= np.float64(0.05)
 print [yo,yo < e]
 print [xo,xo < e]
+ml = AnomalyDetection()
+p = ml.gParameters(mo)	# gParameters returns a dict of the form {"cov":...,"mean":...}
+r =  ml.gPx(p['mean'],p['cov'],[test],0.05)
+for i in range(0,len(r)) :
+	print ' *** ', test, r[i]
+
+
 #for row in np.transpose(m):
 #	print ",".join([str(value) for value in row])
 #-- We are ready to perform anomaly detection ...
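
Taken together, the end-to-end flow of the new AnomalyDetection class looks roughly like the sketch below. It reuses the demo's thirteen sample points; the record fields ('cpu', 'mem') and the 0/1 'anomaly' label are made up for illustration, and the conf keys mirror what learn() reads ('features', 'label'):

from __future__ import division
from utils.ml import AnomalyDetection

# hypothetical records built from the demo's sample matrix;
# points with cpu > 10 are (arbitrarily) labeled anomalous here
points = [(0.0,4.5),(0.0,4.5),(11.6,4.4),(12.2,4.3),(1.4,3.9),(1.4,3.9),
          (2.5,3.8),(0.1,3.8),(0.5,5.1),(0.7,5.2),(0.7,5.1),(0.0,4.6),(0.0,4.6)]
data = [{"cpu":x,"mem":y,"anomaly":int(x > 10)} for x,y in points]
conf = {"features":["cpu","mem"],"label":"anomaly"}

ml = AnomalyDetection()
print(ml.learn(data,conf))	# {"precision":..., "recall":..., "fscore":...}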