ml.py

  1. """
  2. This file is intended to perfom certain machine learning tasks based on numpy
  3. We are trying to keep it lean that's why no sklearn involved yet
  4. @TODO:
  5. Create factory method for the learners implemented here
  6. Improve preconditions (size of the dataset, labels)
  7. """
  8. from __future__ import division
  9. import numpy as np
class ML:
    @staticmethod
    def Filter(attr, value, data):
        #
        # @TODO: Make sure this approach works across all transport classes
        # We may have a potential issue with how the data is stored ... it may not scale
        #
        # Each item is expected to be a non-empty list whose first element is a dict
        return [item[0] for item in data if item and attr in item[0] and item[0][attr] == value]

    @staticmethod
    def Extract(lattr, data):
        # Allow a single attribute name to be passed in place of a list
        if isinstance(lattr, str):
            lattr = [lattr]
        return [[row[key] for key in lattr] for row in data]
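
# A minimal usage sketch for ML.Filter/ML.Extract, assuming the record layout
# Filter expects (each record is a one-element list wrapping a dict); the
# field names and values below are made up for illustration:
#
#   data = [[{"state": "PA", "age": 30, "income": 120}],
#           [{"state": "OH", "age": 40, "income": 90}]]
#   rows = ML.Filter("state", "PA", data)   # -> [{"state": "PA", "age": 30, ...}]
#   ML.Extract(["age", "income"], rows)     # -> [[30, 120]]
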
  23. """
  24. Implements a multivariate anomaly detection
  25. @TODO: determine computationally determine epsilon
  26. """
  27. class AnomalyDetection:
    def split(self, data, index=-1, threshold=0.8):
        N = len(data)
        # if N < LIMIT:
        #     return None
        end = int(N * threshold)
        train = data[:end]
        test = data[end:]
        return {"train": train, "test": test}

  36. """
  37. @param key field name by which the data will be filtered
  38. @param value field value for the filter
  39. @param features features to be used in the analysis
  40. @param labels used to assess performance
  41. @TODO: Map/Reduce does a good job at filtering
  42. """
  43. def learn(self,data,key,value,features,label):
  44. xo = ML.Filter(key,value,data)
  45. if not xo :
  46. return None
  47. #if len(xo) < 100 :
  48. #return None
  49. # attr = conf['features']
  50. # label= conf['label']
  51. yo= ML.Extract([label['name']],xo)
  52. xo = ML.Extract(features,xo)
  53. yo = self.getLabel(yo,label)
  54. xo = self.split(xo)
  55. yo = self.split(yo)
  56. if xo['train'] :
  57. E = 0.01
  58. for i in range(0,10):
  59. Epsilon = E + (2*E*i)
  60. p = self.gParameters(xo['train'])
  61. px = self.gPx(p['mean'],p['cov'],xo['test'],Epsilon)
  62. perf = self.gPerformance(px,yo['test'])
  63. if perf['fscore'] > 0 :
  64. perf['epsilon'] = Epsilon
  65. break
  66. return {"label":value,"parameters":p,"performance":perf}
  67. return None
    def getLabel(self, yo, label_conf):
        # A row is labeled 1 (anomaly) if it shares at least one value with label_conf["1"]
        return [int(len(set(item) & set(label_conf["1"])) > 0) for item in yo]
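
    # For illustration, assuming a label configuration of this shape (the
    # field name and values here are hypothetical):
    #   label_conf = {"name": "status", "1": ["aborted", "failed"]}
    # a row like ["failed"] maps to 1 and a row like ["ok"] maps to 0.
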
  70. """
  71. This function will compute the probability density function given a particular event/set of events
  72. The return value is [px,yo]
  73. @pre xu.shape[0] == sigma[0] == sigma[1]
  74. """
  75. def gPx(self,xu,sigma,data,EPSILON=0.01):
  76. n = len(data[0])
  77. r = []
  78. a = (2*(np.pi)**(n/2))*np.linalg.det(sigma)**0.5
  79. # EPSILON = np.float64(EPSILON)
  80. test = np.array(data)
  81. for row in test:
  82. row = np.array(row)
  83. d = np.matrix(row - xu)
  84. d.shape = (n,1)
  85. b = np.exp((-0.5*np.transpose(d)) * (np.linalg.inv(sigma)*d))
  86. px = float(b/a)
  87. r.append([px,int(px < EPSILON)])
  88. return r
  89. """
  90. This function uses stored learnt information to predict on raw data
  91. In this case it will determin if we have an anomaly or not
  92. @param xo raw observations (matrix)
  93. @param info stored information about this
  94. """
  95. def predict(self,xo,info):
  96. xo = ML.Extract(info['features'],xo)
  97. if not xo :
  98. return None
  99. sigma = info['parameters']['cov']
  100. xu = info['parameters']['mean']
  101. epsilon = info['performance']['epsilon']
  102. return self.gPx(xu,sigma,xo,epsilon)
  103. """
  104. This function computes performance metrics i.e precision, recall and f-score
  105. for details visit https://en.wikipedia.org/wiki/Precision_and_recall
  106. """
  107. def gPerformance(self,test,labels) :
  108. N = len(test)
  109. tp = 0 # true positive
  110. fp = 0 # false positive
  111. fn = 0 # false negative
  112. tn = 0 # true negative
  113. for i in range(0,N):
  114. tp += 1 if (test[i][1]==labels[i] and test[i][1] == 1) else 0
  115. fp += 1 if (test[i][1] != labels[i] and test[i][1] == 1) else 0
  116. fn += 1 if (test[i][1] != labels[i] and test[i][1] == 0) else 0
  117. tn += 1 if (test[i][1] == labels[i] and test[i][1] == 0) else 0
  118. precision = tp / (tp + fp) if tp + fp > 0 else 1
  119. recall = tp / (tp + fn) if tp + fp > 0 else 1
  120. fscore = (2 * precision * recall)/ (precision + recall)
  121. return {"precision":precision,"recall":recall,"fscore":fscore}
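
    # Worked example of the metrics above (illustrative counts): with tp=8,
    # fp=2, fn=4 we get precision = 8/10 = 0.8, recall = 8/12 ~ 0.667 and
    # fscore = 2*0.8*0.667 / (0.8 + 0.667) ~ 0.727.
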
  122. """
  123. This function returns gaussian parameters i.e means and covariance
  124. The information will be used to compute probabilities
  125. """
  126. def gParameters(self,train) :
  127. n = len(train[0])
  128. m = np.transpose(np.array(train))
  129. u = np.array([ np.mean(m[i][:]) for i in range(0,n)])
  130. r = np.array([ np.sqrt(np.var(m[i,:])) for i in range(0,n)])
  131. #
  132. #-- Normalizing the matrix then we will compute covariance matrix
  133. #
  134. m = np.array([ (m[i,:] - u[i])/r[i] for i in range(0,n)])
  135. sigma = np.cov(m)
  136. sigma = [ list(row) for row in sigma]
  137. return {"cov":sigma,"mean":list(u)}
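
# gParameters returns plain lists (rather than numpy arrays) so its output can
# be serialized as-is, e.g. (illustrative values):
#   {"cov": [[1.0, 0.3], [0.3, 1.0]], "mean": [0.5, 0.2]}
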
class Regression:
    parameters = {}

    @staticmethod
    def predict(xo):
        pass

    def __init__(self, config):
        pass
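
# A minimal end-to-end smoke test for AnomalyDetection, assuming the record
# layout ML.Filter expects (each record is a one-element list wrapping a dict).
# The field names ("group", "x", "y", "status") and label values are made up
# here, and the epsilon search may not converge on every random sample.
if __name__ == "__main__":
    import random
    random.seed(0)
    data = []
    for i in range(200):
        ok = random.random() > 0.1  # ~10% of rows are anomalous
        data.append([{
            "group": "A",
            "x": random.gauss(0, 1) if ok else random.gauss(6, 1),
            "y": random.gauss(0, 1) if ok else random.gauss(6, 1),
            "status": "ok" if ok else "failed"
        }])
    label_conf = {"name": "status", "1": ["failed"]}
    ad = AnomalyDetection()
    info = ad.learn(data, "group", "A", ["x", "y"], label_conf)
    if info and "epsilon" in info["performance"]:
        info["features"] = ["x", "y"]  # predict() reads features from info
        print(info["performance"])
        print(ad.predict([data[0][0]], info))
    else:
        print("no model learnt (f-score stayed at zero)")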