8 år sedan · 886a9e1d76
--- a/src/utils/ml.py
+++ b/src/utils/ml.py
@@ -2,6 +2,7 @@
 
				 	This file is intended to perform certain machine learning tasks based on numpy
			
 
				 	We are trying to keep it lean that's why no sklearn involved yet
			
 
				 """
			
 
				+from __future__ import division
			
 
				 import numpy as np
			
 
				 
			
 
				 class ML:
			
@@ -15,18 +16,91 @@ class ML:
 
				 	@staticmethod
			
 
				 	def Extract(lattr,data):
			
 
				 		return [[row[id] for id in lattr] for row in data]
			
 
				+"""
			
 
				+	Implements a multivariate anomaly detection
			
 
				+	@TODO: computationally determine epsilon
			
 
				+"""
			
 
				+class AnomalyDetection:
			
 
				+	def split(self,data,index=-1,threshold=0.7) :
			
 
				+		N	= len(data)
			
 
				+		if N < LIMIT:
			
 
				+			return None
			
 
				+		
			
 
				+		end 	= int(N*threshold)
			
 
				+		train	= data[:end]
			
 
				+		test	= data[end:]
			
 
				+		if index > 0:
			
 
				+		return {"train":train,"test":test,"labels":[]}
			
 
				+	def learn(self,data,conf):
			
 
				+		if 'filter' in conf:
			
 
				+			filter = conf['filter']
			
 
				+			data = ML.Filter(filter['key'],filter['value'],data)
			
 
				+		attr = conf['features']
			
 
				+		label= conf['label']
			
 
				+		labels= ML.Extract([label],data)
			
 
				+		data = ML.Extract(attr,data)
			
 
				+		
			
 
				+		r = self.split(data)
			
 
				+		labels = self.split(labels)
			
 
				+
			
 
				+		p = self.gParameters(r['train'])
			
 
				+		test =  self.gPx(p['mean'],p['cov'],r['test'])
			
 
				+		return self.gPerformance(test,labels['test'])
			
 
				+
			
 
				+
			
 
				+
			
 
				+	"""
			
 
				+		This function will compute the probability density function given a particular event/set of events
			
 
				+		@pre xu.shape[0] == sigma[0] == sigma[1]
			
 
				+	"""
			
 
				+	def gPx(self,xu,sigma,data,EPSILON=0.05):
			
 
				+		n = len(data[0])
			
 
				+		
			
 
				+		r = []
			
 
				+		a  = (2*(np.pi)**(n/2))*np.linalg.det(sigma)**0.5
			
 
				+		# EPSILON = np.float64(EPSILON)
			
 
				+		test = np.array(data)
			
 
				+		for row in test:
			
 
				+			row = np.array(row)
			
 
				+			d = np.matrix(row - xu)
			
 
				+			d.shape = (n,1)
			
 
				+			b = np.exp((-0.5*np.transpose(d)) * (np.linalg.inv(sigma)*d))
			
 
				+			px = float(b/a)
			
 
				+			r.append([px,int(px < EPSILON)])
			
 
				+		return r
			
 
				+	"""
			
 
				+		This function computes performance metrics i.e precision, recall and f-score
			
 
				+		for details visit https://en.wikipedia.org/wiki/Precision_and_recall
			
 
				+
			
 
				+	"""
			
 
				+	def gPerformance(self,test,labels) :
			
 
				+		N = len(test)
			
 
				+		tp = 0 # true positive
			
 
				+		fp = 0 # false positive
			
 
				+		fn = 0 # false negative
			
 
				+		for i in range(0,N):
			
 
				+			tp += 1 if test[i][1]==labels[i] and test[i][1] == 1
			
 
				+			fp += 1 if test[i][1] != labels[i] and test[i][1] == 1
			
 
				+			fn += 1 if test[i][1] != labels[i] and test[i][1] == 0
			
 
				+		precision = tp / (tp + fp)
			
 
				+		recall	= tp / (tp + fn)
			
 
				+		fscore 	= (2 * precision * recall)/ (precision + recall)
			
 
				+		return {"precision":precision,"recall":recall,"fscore":fscore}
			
 
				+
			
 
				+	"""
			
 
				+		This function returns gaussian parameters i.e means and covariance
			
 
				+		The information will be used to compute probabilities
			
 
				+	"""
			
 
				+	def gParameters(self,train) :
			
 
				 
			
 
				-	def init(self,lattr,data):
			
 
				-		self.lattr = attr
			
 
				-		self.data = data
			
 
				-		self.X = []
			
 
				-		self.Xmeans = []
			
 
				-		for id in lattr:
			
 
				-			xvalues = [item for item in self.data[id]]
			
 
				-			self.Xmeans.append(np.mean(xvalues))
			
 
				-			self.X.append(xvalues)
			
 
				-		slef.Xcov = np.cov(self.X)
			
 
				+		n = len(train[0])
			
 
				+		m = np.transpose(np.array(train))
			
 
				+		
			
 
				+		u = np.array([ np.mean(m[i][:]) for i in range(0,n)])		
			
 
				+		r = np.array([ np.sqrt(np.var(m[i,:])) for i in range(0,n)])
			
 
				 		#
			
 
				-		# Let's get the covariance matrix here ...
			
 
				+		#-- Normalizing the matrix then we will compute covariance matrix
			
 
				 		#
			
 
				-	
			
 
				+		m = np.array([ (m[i,:] - u[i])/r[i] for i in range(0,n)])
			
 
				+		sigma = np.cov(m)
			
 
				+		return {"cov":sigma,"mean":u}
			
--- a/test/demo.py
+++ b/test/demo.py
@@ -1,8 +1,10 @@
 
				+from __future__ import division
			
 
				 import numpy as np
			
 
				-m = [[0.0, 4.5], [0.0, 4.5], [11.6, 4.4], [12.2, 4.3], [1.4, 3.9], [1.4, 3.9], [2.5, 3.8], [0.1, 3.8], [0.5, 5.1], [0.7, 5.2], [0.7, 5.1], [0.0, 4.6], [0.0, 4.6]]
			
 
				-m = np.transpose(np.array(m))
			
 
				-xu_ = np.mean(m[1,:])
			
 
				-yu_ = np.mean(m[0,:])
			
 
				+from utils.ml import AnomalyDetection
			
 
				+mo = [[0.0, 4.5], [0.0, 4.5], [11.6, 4.4], [12.2, 4.3], [1.4, 3.9], [1.4, 3.9], [2.5, 3.8], [0.1, 3.8], [0.5, 5.1], [0.7, 5.2], [0.7, 5.1], [0.0, 4.6], [0.0, 4.6]]
			
 
				+m = np.transpose(np.array(mo))
			
 
				+xu_ = np.mean(m[0,:])
			
 
				+yu_ = np.mean(m[1,:])
			
 
				 
			
 
				 xr_ = np.sqrt(np.var(m[0,:]))
			
 
				 yr_ = np.sqrt(np.var(m[1,:]))
			
@@ -10,21 +12,34 @@ yr_ = np.sqrt(np.var(m[1,:]))
 
				 # -- normalizing the matrix before computing covariance
			
 
				 #
			
 
				 mn = np.array([list( (m[0,:]-xu_)/xr_),list( (m[1,:]-yu_)/yr_)])
			
 
				+
			
 
				 cx = np.cov(mn)
			
 
				 n = m.shape[0]
			
 
				-x = np.array([2.4,3.1])
			
 
				+test=[2.4,3.1]
			
 
				+x = np.array(test)
			
 
				 u = np.array([xu_,yu_])
			
 
				+
			
 
				 d = np.matrix(x - u)
			
 
				 d.shape = (n,1)
			
 
				 a  = (2*(np.pi)**(n/2))*np.linalg.det(cx)**0.5
			
 
				 b = np.exp((-0.5*np.transpose(d)) * (np.linalg.inv(cx)*d))
			
 
				+print u.shape
			
 
				+print cx.shape
			
 
				 
			
 
				 from scipy.stats import multivariate_normal
			
 
				 xo= multivariate_normal.pdf(x,u,cx)
			
 
				 yo= (b/a)[0,0]
			
 
				-e= 0.001
			
 
				+e= np.float64(0.05)
			
 
				 print [yo,yo < e]
			
 
				 print [xo,xo < e]
			
 
				+ml = AnomalyDetection()
			
 
				+end = int(len(mo)*.7)
			
 
				+mu,sigma = ml.gParameters(mo)
			
 
				+r =  ml.gPx(mu,sigma,[test],0.05)
			
 
				+for i in range(0,len(r)) :
			
 
				+	print ' *** ', mo[(i+end)],r[i]
			
 
				+
			
 
				+
			
 
				 #for row in np.transpose(m):
			
 
				 #	print ",".join([str(value) for value in row])
			
 
				 #-- We are ready to perform anomaly detection ...