123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177 |
- """
- This file is intended to perform certain machine learning tasks based on numpy
- We are trying to keep it lean, which is why sklearn is not involved yet
- @TODO:
- Create factory method for the learners implemented here
- Improve preconditions (size of the dataset, labels)
- """
- from __future__ import division
- import numpy as np
class ML:
    """Small data-access helpers shared by the learners in this file."""

    # ``basestring`` only exists on Python 2; fall back to ``str`` so that
    # Extract does not raise NameError on Python 3.
    try:
        _string_types = basestring  # noqa: F821 (Python 2 only)
    except NameError:
        _string_types = str

    @staticmethod
    def Filter(attr, value, data):
        """Return the first element of every row whose ``attr`` equals ``value``.

        Each non-empty row is expected to hold a mapping at position 0; rows
        that are empty, or whose mapping lacks ``attr``, are skipped.

        @param attr     key looked up in the mapping found at ``row[0]``
        @param value    value the key must be equal to
        @param data     iterable of rows
        @return list of the matching ``row[0]`` mappings
        """
        #
        # @TODO: Make sure this approach works across all transport classes
        # We may have a potential issue of how the data is stored ... it may not scale
        #
        return [
            item[0]
            for item in data
            if item and attr in item[0] and item[0][attr] == value
        ]

    @staticmethod
    def Extract(lattr, data):
        """Project every row of ``data`` onto the requested attributes.

        @param lattr    a single attribute name or a list of attribute names
        @param data     iterable of rows indexable by those attribute names
        @return list of lists, one inner list per row, ordered like ``lattr``
        """
        if isinstance(lattr, ML._string_types):
            # A bare name is treated as a one-element attribute list.
            lattr = [lattr]
        return [[row[name] for name in lattr] for row in data]
-
- """
- Implements a multivariate anomaly detection
- @TODO: determine epsilon computationally
- """
class AnomalyDetection:
    """Multivariate Gaussian anomaly detection.

    A Gaussian is fitted on the training part of the data (gParameters), the
    density of each test observation is evaluated (gPx) and an observation is
    flagged as an anomaly when its density falls below a threshold epsilon.
    """

    def split(self, data, index=-1, threshold=0.8):
        """Split ``data`` into a leading train part and a trailing test part.

        @param data         sliceable sequence of observations
        @param index        unused; kept so existing callers keep working
        @param threshold    fraction of ``data`` that goes to the train part
        @return {"train": ..., "test": ...}
        """
        end = int(len(data) * threshold)
        return {"train": data[:end], "test": data[end:]}

    def learn(self, data, key, value, features, label):
        """Fit the detector on the rows of ``data`` where ``key == value``.

        Up to ten thresholds (0.01, 0.03, ..., 0.19) are tried in order and
        the search stops at the first one whose f-score is positive.

        @param data         raw rows, filtered with ML.Filter
        @param key          field name by which the data will be filtered
        @param value        field value for the filter
        @param features     features to be used in the analysis
        @param label        mapping used to assess performance; ``label['name']``
                            is the label field and ``label['1']`` holds the
                            values counted as positive (anomalous)
        @return None when nothing matches the filter or the train part is
                empty, otherwise a dict with "label", "parameters",
                "performance" and "features"
        @TODO: Map/Reduce does a good job at filtering
        """
        xo = ML.Filter(key, value, data)
        if not xo:
            return None

        yo = ML.Extract([label['name']], xo)
        xo = ML.Extract(features, xo)
        yo = self.getLabel(yo, label)

        xo = self.split(xo)
        yo = self.split(yo)
        if not xo['train']:
            return None

        # The Gaussian parameters do not depend on epsilon: fit them once.
        p = self.gParameters(xo['train'])

        E = 0.01
        perf = None
        for i in range(0, 10):
            epsilon = E + (2 * E * i)
            px = self.gPx(p['mean'], p['cov'], xo['test'], epsilon)
            perf = self.gPerformance(px, yo['test'])
            # Always record the threshold that produced ``perf`` so that
            # predict() can read it even when no f-score was positive.
            perf['epsilon'] = epsilon
            if perf['fscore'] > 0:
                break

        # "features" is returned because predict() reads info['features'].
        return {
            "label": value,
            "parameters": p,
            "performance": perf,
            "features": features,
        }

    def getLabel(self, yo, label_conf):
        """Turn extracted label rows into 0/1 flags.

        @param yo           list of rows (each an iterable of label values)
        @param label_conf   mapping whose "1" entry lists the positive values
        @return list with 1 where a row shares a value with
                ``label_conf["1"]`` and 0 otherwise
        """
        positives = set(label_conf["1"])
        return [int(len(positives.intersection(item)) > 0) for item in yo]

    def gPx(self, xu, sigma, data, EPSILON=0.01):
        """Evaluate the multivariate Gaussian density of every row of ``data``.

        @param xu       mean vector (one entry per feature)
        @param sigma    covariance matrix (n x n)
        @param data     observations, one row of n features each
        @param EPSILON  density threshold below which a row is flagged
        @return list of ``[px, flag]`` pairs where ``flag`` is 1 when
                ``px < EPSILON`` and 0 otherwise; an empty list for empty data
        @pre len(xu) == len(sigma) == len(sigma[0])
        """
        if len(data) == 0:
            return []

        n = len(data[0])
        mean = np.asarray(xu, dtype=float)
        cov = np.asarray(sigma, dtype=float)

        # Normalising constant of the density: (2*pi)^(n/2) * |sigma|^(1/2).
        norm = ((2 * np.pi) ** (n / 2.0)) * np.linalg.det(cov) ** 0.5
        # The inverse is the same for every row, so compute it only once.
        inv = np.linalg.inv(cov)

        r = []
        for row in np.asarray(data, dtype=float):
            d = row - mean
            px = float(np.exp(-0.5 * d.dot(inv).dot(d)) / norm)
            r.append([px, int(px < EPSILON)])
        return r

    def predict(self, xo, info):
        """Use stored learnt information to flag anomalies in raw data.

        @param xo       raw observations (rows indexable by feature name)
        @param info     stored information; reads ``info['features']``,
                        ``info['parameters']['mean'|'cov']`` and
                        ``info['performance']['epsilon']``
        @return None when there are no observations, otherwise the
                ``[px, flag]`` pairs produced by gPx
        """
        xo = ML.Extract(info['features'], xo)
        if not xo:
            return None

        parameters = info['parameters']
        epsilon = info['performance']['epsilon']
        return self.gPx(parameters['mean'], parameters['cov'], xo, epsilon)

    def gPerformance(self, test, labels):
        """Compute precision, recall and f-score of the predicted flags.

        For details visit https://en.wikipedia.org/wiki/Precision_and_recall

        @param test     list of ``[px, flag]`` pairs as returned by gPx
        @param labels   expected 0/1 flags, aligned with ``test``
        @return {"precision": ..., "recall": ..., "fscore": ...}; precision
                and recall are 1 when their denominator is zero, and the
                f-score is 0 when precision + recall is zero
        """
        tp = 0  # true positive
        fp = 0  # false positive
        fn = 0  # false negative
        for i, row in enumerate(test):
            predicted = row[1]
            matches = predicted == labels[i]
            if predicted == 1:
                if matches:
                    tp += 1
                else:
                    fp += 1
            elif predicted == 0 and not matches:
                fn += 1

        precision = tp / (tp + fp) if tp + fp > 0 else 1
        # Recall is undefined only when there are no actual positives.
        recall = tp / (tp + fn) if tp + fn > 0 else 1
        total = precision + recall
        fscore = (2 * precision * recall) / total if total > 0 else 0.0
        return {"precision": precision, "recall": recall, "fscore": fscore}

    def gParameters(self, train):
        """Return the gaussian parameters, i.e. means and covariance.

        The information will be used to compute probabilities.

        @param train    training observations, one row of n features each
        @return {"cov": n x n list of lists, "mean": list of n means}
        """
        m = np.transpose(np.asarray(train, dtype=float))  # features x samples
        u = m.mean(axis=1)
        r = m.std(axis=1)
        #
        # -- Normalizing the matrix then we will compute covariance matrix
        #
        # NOTE(review): the covariance is computed on standardised data while
        # the returned mean is the raw one -- confirm this is intended.
        # A constant feature has r == 0 and yields nan entries here.
        m = (m - u[:, None]) / r[:, None]
        # np.cov gives a 0-d result for a single feature; force n x n.
        sigma = np.atleast_2d(np.cov(m))
        return {"cov": [list(row) for row in sigma], "mean": list(u)}
class Regression:
    """Placeholder for a regression learner; nothing is implemented yet."""

    # Class-level mapping, shared by every instance; no code in this class
    # reads or writes it.
    parameters = {}

    @staticmethod
    def predict(xo):
        """Stub: accepts observations ``xo`` and returns None."""
        pass

    def __init__(self, config):
        """Stub constructor: ``config`` is accepted but not used."""
        pass
|