  1. """
  2. This file is intended to perfom certain machine learning tasks based on numpy
  3. We are trying to keep it lean that's why no sklearn involved yet
  4. @TODO:
  5. Create factory method for the learners implemented here
  6. Improve preconditions (size of the dataset, labels)
  7. """
  8. from __future__ import division
  9. import numpy as np
class ML:
    @staticmethod
    def Filter(attr,value,data) :
        #
        # @TODO: Make sure this approach works across all transport classes
        # We may have a potential issue with how the data is stored ... it may not scale
        #
        value = ML.CleanupName(value)
        #
        # We are making the filtering more resilient, i.e. if an item doesn't exist we don't have to throw an exception
        # This is why we expanded the loops ... fully expressive but resilient
        #
        r = []
        for row in data :
            for item in row :
                if attr in item and item[attr] == value:
                    r.append(item)
        return r
    @staticmethod
    def Extract(lattr,data):
        if isinstance(lattr,basestring):
            lattr = [lattr]
        return [[row[key] for key in lattr] for row in data]
    @staticmethod
    def CleanupName(value) :
        return value.replace('$','').replace('.+','')
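    #
    # A minimal usage sketch (the data layout below is assumed, not taken
    # from the original file): each row of `data` is a list of dicts
    #
    #   data = [[{"name":"app1","cpu":10}],[{"name":"app2","cpu":90}]]
    #   ML.Filter("name","app1",data)                       # -> [{"name":"app1","cpu":10}]
    #   ML.Extract(["cpu"],ML.Filter("name","app1",data))   # -> [[10]]
    #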
  38. """
  39. Implements a multivariate anomaly detection
  40. @TODO: determine computationally determine epsilon
  41. """
class AnomalyDetection:
    def split(self,data,index=-1,threshold=0.65) :
        N = len(data)
        # if N < LIMIT:
        #     return None
        end = int(N*threshold)
        train = data[:end]
        test = data[end:]
        return {"train":train,"test":test}
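    #
    # e.g. 100 rows with the default threshold yield 65 training rows and
    # 35 test rows; no shuffling is performed, so callers are assumed to
    # pass the data in a neutral order
    #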
  51. """
  52. @param key field name by which the data will be filtered
  53. @param value field value for the filter
  54. @param features features to be used in the analysis
  55. @param labels used to assess performance
  56. @TODO: Map/Reduce does a good job at filtering
  57. """
    def learn(self,data,key,value,features,label):
        if len(data) < 10:
            return None
        xo = ML.Filter(key,value,data)
        if len(xo) < 10 :
            return None
        yo = ML.Extract([label['name']],xo)
        xo = ML.Extract(features,xo)
        yo = self.getLabel(yo,label)
        #
        # @TODO: Ensure this can be fine-tuned; training size matters for learning and is not obvious to define upfront
        #
        xo = self.split(xo)
        yo = self.split(yo)
        p = self.gParameters(xo['train'])
        has_cov = np.linalg.det(p['cov']) if p else False #-- making sure the matrix is invertible
        if xo['train'] and has_cov :
            E = 0.001
            ACCEPTABLE_FSCORE = 0.6
            #
            # We need to find an appropriate epsilon for the predictions
            # The appropriate epsilon is one that yields an f-score in [0.5,1[
            # The understanding is that error drives learning, so a perfect f-score of 1 is rejected
            #
            perf = None
            for i in range(0,10):
                Epsilon = E + (2*E*i)
                px = self.gPx(p['mean'],p['cov'],xo['test'],Epsilon)
                operf = self.gPerformance(px,yo['test'])
                operf['epsilon'] = Epsilon #-- recorded here so perf keeps the epsilon that produced it
                if operf['fscore'] == 1 :
                    continue
                if perf is None or (operf['fscore'] > perf['fscore'] and operf['fscore'] > ACCEPTABLE_FSCORE) :
                    perf = operf
            #
            # At this point we are assuming we came out of the search with an acceptable performance
            #
            if perf and perf['fscore'] > ACCEPTABLE_FSCORE :
                #
                # The features are included so predict() can extract them from raw observations
                #
                return {"label":value,"features":features,"parameters":p,"performance":perf}
        return None
  112. """
  113. This function determines if the preconditions for learning are met
  114. For that parameters are passed to the function
  115. p
  116. """
  117. def canLearn(self,p) :
  118. pass
    def getLabel(self,yo,label_conf):
        return [ int(len(set(item) & set(label_conf["1"])) > 0) for item in yo ]
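    #
    # A hypothetical label configuration (the schema is inferred from how
    # learn() and getLabel() consume it):
    #
    #   label = {"name":"status","1":["crash"]}
    #
    # i.e. rows whose extracted label value appears in label["1"] are
    # marked 1 (anomaly), everything else 0
    #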
  121. """
  122. This function will compute the probability density function given a particular event/set of events
  123. The return value is [px,yo]
  124. @pre xu.shape[0] == sigma[0] == sigma[1]
  125. """
    def gPx(self,xu,sigma,data,EPSILON=0.01):
        n = len(data[0])
        r = []
        a = ((2*np.pi)**(n/2))*np.linalg.det(sigma)**0.5
        test = np.array(data)
        for row in test:
            d = np.matrix(row - xu)
            d.shape = (n,1)
            b = np.exp((-0.5*np.transpose(d)) * (np.linalg.inv(sigma)*d))
            px = float(b/a)
            r.append([px,int(px < EPSILON)])
        return r
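    #
    # For reference, gPx evaluates the multivariate normal density
    #
    #   p(x) = exp(-0.5*(x-u)'*inv(sigma)*(x-u)) / ((2*pi)**(n/2) * det(sigma)**0.5)
    #
    # and flags x as anomalous when p(x) < EPSILON
    #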
  140. """
  141. This function uses stored learnt information to predict on raw data
  142. In this case it will determin if we have an anomaly or not
  143. @param xo raw observations (matrix)
  144. @param info stored information about this
  145. """
    def predict(self,xo,info):
        xo = ML.Extract(info['features'],xo)
        if not xo :
            return None
        sigma = info['parameters']['cov']
        xu = info['parameters']['mean']
        epsilon = info['performance']['epsilon']
        return self.gPx(xu,sigma,xo,epsilon)
  154. """
  155. This function computes performance metrics i.e precision, recall and f-score
  156. for details visit https://en.wikipedia.org/wiki/Precision_and_recall
  157. """
    def gPerformance(self,test,labels) :
        N = len(test)
        tp = 0 # true positives
        fp = 0 # false positives
        fn = 0 # false negatives
        tn = 0 # true negatives
        for i in range(0,N):
            tp += 1 if (test[i][1] == labels[i] and test[i][1] == 1) else 0
            fp += 1 if (test[i][1] != labels[i] and test[i][1] == 1) else 0
            fn += 1 if (test[i][1] != labels[i] and test[i][1] == 0) else 0
            tn += 1 if (test[i][1] == labels[i] and test[i][1] == 0) else 0
        precision = tp / (tp + fp) if tp + fp > 0 else 1
        recall = tp / (tp + fn) if tp + fn > 0 else 1
        fscore = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0
        return {"precision":precision,"recall":recall,"fscore":fscore}
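    #
    # Worked example: predictions yielding tp=2, fp=1, fn=1 give
    #   precision = 2/3, recall = 2/3, fscore = 2*(2/3)*(2/3)/(4/3) = 2/3
    #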
  173. """
  174. This function returns gaussian parameters i.e means and covariance
  175. The information will be used to compute probabilities
  176. """
    def gParameters(self,train) :
        n = len(train[0])
        m = np.transpose(np.array(train))
        u = np.array([ np.mean(m[i,:]) for i in range(0,n)])
        if np.sum(u) == 0:
            return None
        r = np.array([ np.sqrt(np.var(m[i,:])) for i in range(0,n)])
        #
        # Before we use the data we must ensure there is some level of movement in this application
        # A lack of movement suggests we may not have enough information to do anything
        #
        if 0 in r :
            return None
        #
        # The covariance is computed on the raw data so that it stays consistent
        # with the raw deviations (row - mean) used in gPx; normalizing first
        # would produce a correlation matrix that doesn't match those deviations
        #
        sigma = np.cov(m)
        sigma = [ list(row) for row in sigma]
        return {"cov":sigma,"mean":list(u)}
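    #
    # e.g. train = [[1,2],[2,4],[3,6]] passes the variance check but yields a
    # singular covariance matrix (the features are perfectly correlated), so
    # the caller's determinant test will reject it
    #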
class AnalyzeAnomalies(AnomalyDetection):
    """
    This analysis function will include a predicted status because an anomaly can either be
    - A downtime i.e. end of day
    - A spike and thus a potential imminent crash
    @param xo   matrix of observations
    @param info information about what was learnt
    """
    def predict(self,xo,info):
        x = xo[len(xo)-1]
        r = AnomalyDetection.predict(self,[x],info)
        #
        # In order to determine what the anomaly is we compute the slope (idle or crash)
        # The slope is computed using the covariance / variance of features
        #
        N = len(info['features'])
        xy = ML.Extract(info['features'],xo)
        xy = np.matrix(xy)
        vxy = [ float(np.var(xy[:,i])) for i in range(0,N)]
        #
        # @TODO: finish the slope computation (covariance / variance) and attach
        # the predicted status (downtime vs spike) to the returned value
        #
        return r
class Regression:
    parameters = {}
    @staticmethod
    def predict(xo):
        pass
    def __init__(self,config):
        pass
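#
# A minimal end-to-end sketch on synthetic data (the field names and the
# label configuration below are made up for illustration). Depending on the
# draw, learn() may legitimately return None when no epsilon in the scanned
# range reaches an acceptable f-score.
#
if __name__ == '__main__' :
    import random
    random.seed(0)
    data = []
    for i in range(100) :
        crashed = i % 10 == 0
        data.append([{
            "name":"app",
            "cpu":random.gauss(80 if crashed else 20,5),
            "mem":random.gauss(90 if crashed else 30,5),
            "status":"crash" if crashed else "ok"
        }])
    learner = AnomalyDetection()
    info = learner.learn(data,"name","app",["cpu","mem"],{"name":"status","1":["crash"]})
    print(info)
    if info :
        #-- reusing the learnt parameters on a raw observation
        print(learner.predict([data[0][0]],info))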