  1. """
  2. Health Information Privacy Lab
  3. Brad. Malin, Weiyi Xia, Steve L. Nyemba
  4. This framework computes re-identification risk of a dataset assuming the data being shared can be loaded into a dataframe (pandas)
  5. The framework will compute the following risk measures:
  6. - marketer
  7. - prosecutor
  8. - pitman
  9. References :
  10. https://www.scb.se/contentassets/ff271eeeca694f47ae99b942de61df83/applying-pitmans-sampling-formula-to-microdata-disclosure-risk-assessment.pdf
  11. This framework integrates pandas (for now) as an extension and can be used in two modes :
  12. Experimental mode
  13. Here the assumption is that we are not sure of the attributes to be disclosed, the framework will explore a variety of combinations and associate risk measures every random combinations
  14. Evaluation mode
  15. The evaluation mode assumes the set of attributes given are known and thus will evaluate risk for a subset of attributes.
  16. features :
  17. - determine viable fields (quantifiable in terms of uniqueness). This is a way to identify fields that can act as identifiers.
  18. - explore and evaluate risk of a sample dataset against a known population dataset
  19. - explore and evaluate risk on a sample dataset
  20. Usage:
  21. from pandas_risk import *
  22. mydataframe = pd.DataFrame('/myfile.csv')
  23. resp = mydataframe.risk.evaluate(id=<name of patient field>,num_runs=<number of runs>,cols=[])
  24. resp = mydataframe.risk.explore(id=<name of patient field>,num_runs=<number of runs>,cols=[])
  25. @TODO:
  26. - Provide a selected number of fields and risk will be computed for those fields.
  27. - include journalist risk
  28. """
import pandas as pd
import numpy as np
import logging
import json
from datetime import datetime
import sys

@pd.api.extensions.register_dataframe_accessor("risk")
class deid:
    """
    This class is a de-identification accessor that computes risk measures (marketer, prosecutor) for a given pandas dataframe.
    """
    def __init__(self,df):
        self._df = df.fillna(' ')
    def explore(self,**args):
        """
        This function performs experimentation by generating random policies (combinations of attributes)
        and evaluating the risk associated with each policy.
        @param pop  data-frame with population reference
        @param id   key field that uniquely identifies a patient/customer ...
        """
        pop = args['pop'] if 'pop' in args else None
        if 'pop_size' in args :
            pop_size = np.float64(args['pop_size'])
        else:
            pop_size = -1
        #
        # Policies will be generated over a number of runs
        #
        RUNS = args['num_runs'] if 'num_runs' in args else 5
        sample = args['sample'] if 'sample' in args else pd.DataFrame(self._df)
        k = sample.columns.size if 'field_count' not in args else int(args['field_count']) + 1
        if 'id' in args :
            id = args['id']
            columns = list(set(sample.columns.tolist()) - set([id]))
        else:
            columns = sample.columns.tolist()
        o = pd.DataFrame()
        for i in np.arange(RUNS):
            n = np.random.randint(2,k)
            cols = np.random.choice(columns,n,replace=False).tolist()
            params = {'sample':sample,'cols':cols}
            if pop is not None :
                params['pop'] = pop
            if pop_size > 0 :
                params['pop_size'] = pop_size
            r = self.evaluate(**params)
            #
            # let's record the policy: a 1 marks a column included in the policy
            #
            p = pd.DataFrame(1*sample.columns.isin(cols)).T
            p.columns = sample.columns
            o = pd.concat([o,r.join(p)]) # DataFrame.append was removed in pandas 2.x
        o.index = np.arange(o.shape[0]).astype(np.int64)
        return o
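    #
    # A hedged usage sketch (the file name and fields below are made up):
    #
    #   df = pd.read_csv('patients.csv')
    #   report = df.risk.explore(id='patient_id',num_runs=10)
    #   report.sort_values('marketer')  # rank the random policies by risk
    #
    # Each row of the report holds the risk measures of one random policy plus
    # a 0/1 indicator per column marking whether it was part of that policy.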
    def evaluate(self, **args):
        """
        This function evaluates the risk associated with either a population or a sample dataset
        :sample sample dataset
        :pop    population dataset
        :cols   list of columns of interest or policies
        :flag   user-provided flag for the context of the evaluation
        """
        if 'sample' in args :
            sample = pd.DataFrame(args['sample'])
        else:
            sample = pd.DataFrame(self._df)
        if 'cols' in args :
            cols = args['cols']
        else:
            cols = sample.columns.tolist()
        flag = 'UNFLAGGED' if 'flag' not in args else args['flag']
        #
        # @TODO: auto-select the columns i.e. remove the columns that will have the effect of an identifier
        #
        r = {"flag":flag}
        handle_sample = Sample()
        xi = sample.groupby(cols).size().values # array of group sizes
        handle_sample.set('groups',xi)
        if 'pop_size' in args :
            pop_size = np.float64(args['pop_size'])
        else:
            pop_size = -1
        #
        # -- The following conditional only selects the labels that will be returned
        # @TODO: Find a more elegant way of doing this.
        #
        if 'pop' in args :
            r['sample marketer'] = handle_sample.marketer()
            r['sample prosecutor'] = handle_sample.prosecutor()
            r['sample unique ratio'] = handle_sample.unique_ratio()
            r['sample group count'] = xi.size
        else:
            r['marketer'] = handle_sample.marketer()
            r['prosecutor'] = handle_sample.prosecutor()
            r['unique ratio'] = handle_sample.unique_ratio()
            r['group count'] = xi.size
        if pop_size > 0 :
            handle_sample.set('pop_size',pop_size)
            r['pitman risk'] = handle_sample.pitman()
        if 'pop' in args :
            xi = sample.groupby(cols).size().to_frame("sample_group_size").reset_index()
            yi = args['pop'].groupby(cols).size().to_frame("population_group_size").reset_index()
            merged_groups = pd.merge(xi,yi,on=cols,how='inner')
            handle_population = Population()
            handle_population.set('merged_groups',merged_groups)
            r['pop. marketer'] = handle_population.marketer()
            r['pitman risk'] = handle_population.pitman()
            r['pop. group size'] = np.unique(yi.population_group_size).size
        #
        # At this point we have the columns for the sample, the population, or both
        #
        r['field count'] = len(cols)
        return pd.DataFrame([r])
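    #
    # A hedged usage sketch (the frames and fields below are made up):
    #
    #   r = sample_df.risk.evaluate(cols=['zip','gender','age'],
    #                               pop=population_df,
    #                               pop_size=population_df.shape[0])
    #
    # returns a one-row dataframe with 'sample marketer', 'sample prosecutor',
    # 'pop. marketer', 'pitman risk' and 'field count' among its columns.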
class Risk:
    """
    This class is an abstraction of how we chose to structure the risk computation, i.e. in 2 sub-classes:
        - Sample     computes risk associated with a sample dataset only
        - Population computes risk associated with a population
    """
    def __init__(self):
        self.cache = {}
    def set(self,key,value):
        self.cache[key] = value
class Sample(Risk):
    """
    This class computes risk for a sample dataset: marketer and prosecutor risk are computed by default,
    and pitman risk can optionally be added when the population size is known.
    """
    def __init__(self):
        Risk.__init__(self)
    def marketer(self):
        """
        computes the marketer risk for the sample dataset, i.e. the number of groups over the number of rows
        """
        groups = self.cache['groups']
        group_count = groups.size
        row_count = groups.sum()
        return group_count / np.float64(row_count)
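    # Worked example with assumed group sizes [1,2,3]: 3 groups over
    # 1+2+3 = 6 rows gives a marketer risk of 3/6 = 0.5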
    def prosecutor(self):
        """
        The prosecutor risk is 1 over the smallest group size.
        It indicates whether there is at least one record that is unique.
        """
        groups = self.cache['groups']
        return 1 / np.float64(groups.min())
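    # Worked example with assumed group sizes [1,2,3]: the smallest group has
    # 1 record, so the prosecutor risk is 1/1 = 1.0 (a unique record exists)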
    def unique_ratio(self):
        """
        computes the proportion of rows that fall in a group of size 1
        """
        groups = self.cache['groups']
        row_count = groups.sum()
        return groups[groups == 1].sum() / np.float64(row_count)
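    # Worked example with assumed group sizes [1,1,4]: 2 singleton rows out of
    # 1+1+4 = 6 rows gives a unique ratio of 2/6 ~ 0.33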
    def pitman(self):
        """
        This function approximates the pitman de-identification risk based on Pitman's sampling formula:
        with alpha the proportion of groups that are singletons and f the sampling fraction,
        the risk is f ** (1 - alpha)
        """
        groups = self.cache['groups']
        si = groups[groups == 1].size
        u = groups.size
        alpha = np.divide(si, np.float64(u))
        f = np.divide(groups.sum(), np.float64(self.cache['pop_size']))
        return np.power(f,1-alpha)
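    # Worked example with assumed values: 2 singletons among 4 groups gives
    # alpha = 2/4 = 0.5; a sample of 100 rows from a population of 10,000
    # gives f = 0.01, so the pitman risk is 0.01 ** (1 - 0.5) = 0.1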
class Population(Sample):
    """
    This class computes risk for datasets that have population information associated with them.
    It measures risk while accounting for the existence of a population; the computation includes
    pitman risk (it requires minimal information about the population).
    :merged_groups {sample_group_size, population_group_size} is a merged dataset with the group sizes of both population and sample
    """
    def __init__(self,**args):
        Sample.__init__(self)
    def set(self,key,value):
        Sample.set(self,key,value)
        if key == 'merged_groups' :
            Sample.set(self,'pop_size',np.float64(value.population_group_size.sum()))
            Sample.set(self,'groups',value.sample_group_size)
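    # Illustrative shape of 'merged_groups' (values are made up):
    #
    #     zip    gender  sample_group_size  population_group_size
    #     37203  F       2                  40
    #     37212  M       1                  25
    #
    # setting it caches pop_size (the sum of population group sizes) and the
    # sample group sizes used by the inherited Sample measures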
  204. """
  205. This class will measure risk and account for the existance of a population
  206. :merged_groups {sample_group_size, population_group_size} is a merged dataset with group sizes of both population and sample
  207. """
    def marketer(self):
        """
        This function requires the 'merged_groups' dataset: for each group it takes the ratio of the
        sample group size to the population group size, averaged over the sample rows.
        """
        r = self.cache['merged_groups']
        sample_row_count = r.sample_group_size.sum()
        #
        # @TODO : make sure the above line is size (not sum)
        # sample_row_count = r.sample_group_size.size
        return r.apply(lambda row: (row.sample_group_size / np.float64(row.population_group_size)) / np.float64(sample_row_count),axis=1).sum()
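
if __name__ == '__main__':
    #
    # Minimal self-contained demo (not part of the original module): it builds
    # a made-up population/sample pair and runs both modes. Field names and
    # values are purely illustrative.
    #
    pop = pd.DataFrame({
        'patient_id': np.arange(1000),
        'zip'       : np.random.choice(['37203','37212','37215'],1000),
        'gender'    : np.random.choice(['M','F'],1000),
        'age'       : np.random.randint(20,80,1000)
    })
    sample = pop.sample(100)
    print(sample.risk.evaluate(cols=['zip','gender'],pop=pop,pop_size=pop.shape[0]))
    print(sample.risk.explore(id='patient_id',num_runs=3))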