  1. """
  2. Health Information Privacy Lab
  3. Brad. Malin, Weiyi Xia, Steve L. Nyemba
  4. This framework computes re-identification risk of a dataset assuming the data being shared can be loaded into a dataframe (pandas)
  5. The framework will compute the following risk measures:
  6. - marketer
  7. - prosecutor
  8. - pitman
  9. References :
  10. https://www.scb.se/contentassets/ff271eeeca694f47ae99b942de61df83/applying-pitmans-sampling-formula-to-microdata-disclosure-risk-assessment.pdf
  11. This framework integrates pandas (for now) as an extension and can be used in two modes :
  12. Experimental mode
  13. Here the assumption is that we are not sure of the attributes to be disclosed, the framework will explore a variety of combinations and associate risk measures every random combinations
  14. Evaluation mode
  15. The evaluation mode assumes the set of attributes given are known and thus will evaluate risk for a subset of attributes.
  16. features :
  17. - determine viable fields (quantifiable in terms of uniqueness). This is a way to identify fields that can act as identifiers.
  18. - explore and evaluate risk of a sample dataset against a known population dataset
  19. - explore and evaluate risk on a sample dataset
  20. Usage:
  21. from pandas_risk import *
  22. mydataframe = pd.DataFrame('/myfile.csv')
  23. resp = mydataframe.risk.evaluate(id=<name of patient field>,num_runs=<number of runs>,cols=[])
  24. resp = mydataframe.risk.explore(id=<name of patient field>,num_runs=<number of runs>,cols=[])
  25. @TODO:
  26. - Provide a selected number of fields and risk will be computed for those fields.
  27. - include journalist risk
  28. """
import pandas as pd
import numpy as np
import logging
import json
from datetime import datetime
import sys

@pd.api.extensions.register_dataframe_accessor("risk")
class deid:
    """
    This class is a de-identification accessor that computes risk (marketer, prosecutor) for a pandas dataframe.
    """
    def __init__(self, df):
        self._df = df.fillna(' ')
    def explore(self, **args):
        """
        This function experiments with random policies (combinations of attributes) and evaluates the risk associated with each one.
        @param pop       dataframe with a population reference (optional)
        @param pop_size  size of the population (optional)
        @param id        key field that uniquely identifies a patient/customer ...
        @param num_runs  number of random policies to generate (default 5)
        """
        id = args['id']
        pop = args['pop'] if 'pop' in args else None
        if 'pop_size' in args:
            pop_size = np.float64(args['pop_size'])
        else:
            pop_size = -1
        #
        # Policies will be generated with a number of runs
        #
        RUNS = args['num_runs'] if 'num_runs' in args else 5
        sample = args['sample'] if 'sample' in args else pd.DataFrame(self._df)
        k = sample.columns.size - 1 if 'field_count' not in args else int(args['field_count'])
        columns = list(set(sample.columns.tolist()) - set([id]))
        o = pd.DataFrame()
        for i in np.arange(RUNS):
            # pick a random number of attributes (at least 2) for this policy
            n = np.random.randint(2, k) if k > 2 else 2
            cols = np.random.choice(columns, n, replace=False).tolist()
            params = {'sample': sample, 'cols': cols}
            if pop is not None:
                params['pop'] = pop
            if pop_size > 0:
                params['pop_size'] = pop_size
            r = self.evaluate(**params)
            #
            # let's record the policy as a 0/1 indicator over the columns
            p = pd.DataFrame(1 * sample.columns.isin(cols)).T
            p.columns = sample.columns
            o = pd.concat([o, r.join(p)])
        o.index = np.arange(o.shape[0]).astype(np.int64)
        return o
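    # Hedged usage sketch for explore(). The file and field names below are
    # illustrative assumptions, not part of the framework:
    #
    #   df = pd.read_csv('patients.csv')
    #   policies = df.risk.explore(id='patient_id', num_runs=10)
    #
    # The result has one row per run: the risk measures for that policy plus
    # a 0/1 indicator for every column, marking which attributes were used.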
    def evaluate(self, **args):
        """
        This function evaluates the risk associated with either a population or a sample dataset.
        :sample   sample dataset (defaults to the dataframe the accessor is bound to)
        :pop      population dataset
        :cols     list of columns of interest, i.e. a policy
        :flag     user-provided flag giving the context of the evaluation
        """
        if 'sample' in args:
            sample = pd.DataFrame(args['sample'])
        else:
            sample = pd.DataFrame(self._df)
        cols = args['cols'] if 'cols' in args else sample.columns.tolist()
        flag = args['flag'] if 'flag' in args else 'UNFLAGGED'
        #
        # @TODO: auto-select the columns, i.e. remove the columns that will have the effect of an identifier
        #
        r = {"flag": flag}
        handle_sample = Sample()
        xi = sample.groupby(cols).size().values
        handle_sample.set('groups', xi)
        if 'pop_size' in args:
            pop_size = np.float64(args['pop_size'])
        else:
            pop_size = -1
        #
        # -- The following conditional addresses the labels that will be returned.
        # @TODO: Find a more elegant way of doing this.
        #
        if 'pop' in args:
            r['sample marketer'] = handle_sample.marketer()
            r['sample prosecutor'] = handle_sample.prosecutor()
            r['sample unique ratio'] = handle_sample.unique_ratio()
            r['sample group count'] = xi.size
        else:
            r['marketer'] = handle_sample.marketer()
            r['prosecutor'] = handle_sample.prosecutor()
            r['unique ratio'] = handle_sample.unique_ratio()
            r['group count'] = xi.size
        if pop_size > 0:
            handle_sample.set('pop_size', pop_size)
            r['pitman risk'] = handle_sample.pitman()
        if 'pop' in args:
            print(cols)
            print(args['pop'].columns)
            xi = sample.groupby(cols).size().reset_index(name='sample_group_size')
            yi = args['pop'].groupby(cols).size().reset_index(name='population_group_size')
            merged_groups = pd.merge(xi, yi, on=cols, how='inner')
            handle_population = Population()
            handle_population.set('merged_groups', merged_groups)
            r['pop. marketer'] = handle_population.marketer()
            r['pitman risk'] = handle_population.pitman()
            r['pop. group size'] = np.unique(yi.population_group_size).size
        #
        # At this point we have the columns for the sample, the population, or both.
        #
        r['field count'] = len(cols)
        return pd.DataFrame([r])
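    # Hedged usage sketch for evaluate(); the column names and sizes are
    # illustrative assumptions:
    #
    #   df.risk.evaluate(cols=['age', 'zip'])                      # sample-only measures
    #   df.risk.evaluate(cols=['age', 'zip'], pop_size=1e6)        # adds pitman risk
    #   df.risk.evaluate(cols=['age', 'zip'], pop=population_df)   # sample vs. population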
class Risk:
    """
    This class is an abstraction of how we chose to structure the risk computation, i.e. in two subclasses:
        - Sample computes risk associated with a sample dataset only
        - Population computes risk associated with a population
    """
    def __init__(self):
        self.cache = {}
    def set(self, key, value):
        self.cache[key] = value
class Sample(Risk):
    """
    This class computes risk for a sample dataset: marketer and prosecutor risk are computed by default,
    and pitman risk is added if the population size is known.
    """
    def __init__(self):
        Risk.__init__(self)
    def marketer(self):
        """
        Computes marketer risk for the sample dataset: the number of equivalence groups over the number of records.
        """
        groups = self.cache['groups']
        group_count = groups.size
        row_count = groups.sum()
        return group_count / np.float64(row_count)
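    # Worked example (illustrative numbers): group sizes [3, 1, 2] give
    # 3 groups over 6 records, so marketer risk = 3 / 6 = 0.5.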
    def prosecutor(self):
        """
        The prosecutor risk is 1 over the smallest group size.
        It identifies whether there is at least one record that is unique.
        """
        groups = self.cache['groups']
        return 1 / np.float64(groups.min())
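    # Worked example (illustrative numbers): group sizes [3, 1, 2] have a
    # smallest group of 1, so prosecutor risk = 1 / 1 = 1.0.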
    def unique_ratio(self):
        """
        Computes the fraction of records that are unique, i.e. that fall in groups of size 1.
        """
        groups = self.cache['groups']
        row_count = groups.sum()
        return groups[groups == 1].sum() / np.float64(row_count)
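    # Worked example (illustrative numbers): group sizes [3, 1, 2] contain one
    # group of size 1, so unique ratio = 1 / 6 ≈ 0.167.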
    def pitman(self):
        """
        Approximates re-identification risk using Pitman's sampling formula (see the reference in the module docstring).
        """
        groups = self.cache['groups']
        si = groups[groups == 1].size    # number of unique (size-1) groups
        u = groups.size                  # total number of groups
        alpha = np.divide(si, np.float64(u))
        f = np.divide(groups.sum(), np.float64(self.cache['pop_size']))    # sampling fraction
        return np.power(f, 1 - alpha)
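    # Worked example (illustrative numbers): with group sizes [3, 1, 2],
    # si = 1 and u = 3, so alpha = 1/3; with pop_size = 1000 and 6 sampled
    # records, f = 0.006 and the risk is 0.006 ** (2/3) ≈ 0.033.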
class Population(Sample):
    """
    This class computes risk for datasets that have population information, or datasets associated with them.
    The computation includes pitman risk (it requires minimal information about the population).
    :merged_groups {sample_group_size, population_group_size} merged dataset with the group sizes of both population and sample
    """
    def __init__(self, **args):
        Sample.__init__(self)
    def set(self, key, value):
        Sample.set(self, key, value)
        if key == 'merged_groups':
            Sample.set(self, 'pop_size', np.float64(value.population_group_size.sum()))
            Sample.set(self, 'groups', value.sample_group_size)
    def marketer(self):
        """
        Computes marketer risk against the population; requires 'merged_groups' to have been set.
        """
        r = self.cache['merged_groups']
        sample_row_count = r.sample_group_size.sum()
        #
        # @TODO: make sure the above line is size (not sum)
        # sample_row_count = r.sample_group_size.size
        return r.apply(lambda row: (row.sample_group_size / np.float64(row.population_group_size)) / np.float64(sample_row_count), axis=1).sum()
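    # Per-record reading of the line above: each sample group g contributes
    # (sample_g / pop_g) / n, i.e. the expected number of correct
    # re-identifications divided by the sample row count n.

# Minimal self-contained demo. This is a hedged sketch: the column names and
# values below are made up for illustration and are not part of the framework.
if __name__ == '__main__':
    df = pd.DataFrame({
        'patient_id': list(range(6)),
        'age': [30, 30, 30, 45, 45, 60],
        'zip': ['37203', '37203', '37203', '37212', '37212', '37215']
    })
    # Evaluation mode: a known set of attributes, with an assumed population size
    print(df.risk.evaluate(cols=['age', 'zip'], pop_size=1000))
    # Experimental mode: random policies over the non-identifier columns
    print(df.risk.explore(id='patient_id', num_runs=3))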