  1. """
  2. Health Information Privacy Lab
  3. Brad. Malin, Weiyi Xia, Steve L. Nyemba
  4. This framework computes re-identification risk of a dataset assuming the data being shared can be loaded into a dataframe (pandas)
  5. The framework will compute the following risk measures:
  6. - marketer
  7. - prosecutor
  8. - pitman
  9. References :
  10. https://www.scb.se/contentassets/ff271eeeca694f47ae99b942de61df83/applying-pitmans-sampling-formula-to-microdata-disclosure-risk-assessment.pdf
  11. This framework integrates pandas (for now) as an extension and can be used in two modes :
  12. Experimental mode
  13. Here the assumption is that we are not sure of the attributes to be disclosed, the framework will explore a variety of combinations and associate risk measures every random combinations
  14. Evaluation mode
  15. The evaluation mode assumes the set of attributes given are known and thus will evaluate risk for a subset of attributes.
  16. features :
  17. - determine viable fields (quantifiable in terms of uniqueness). This is a way to identify fields that can act as identifiers.
  18. - explore and evaluate risk of a sample dataset against a known population dataset
  19. - explore and evaluate risk on a sample dataset
  20. Usage:
  21. from pandas_risk import *
  22. mydataframe = pd.DataFrame('/myfile.csv')
  23. resp = mydataframe.risk.evaluate(id=<name of patient field>,num_runs=<number of runs>,cols=[])
  24. resp = mydataframe.risk.explore(id=<name of patient field>,num_runs=<number of runs>,cols=[])
  25. @TODO:
  26. - Provide a selected number of fields and risk will be computed for those fields.
  27. - include journalist risk
  28. """
import pandas as pd
import numpy as np
import logging
import json
from datetime import datetime
import sys
from itertools import combinations
@pd.api.extensions.register_dataframe_accessor("risk")
class deid:
    """
    This class is a de-identification class that computes risk (marketer, prosecutor, ...) given a pandas dataframe
    """
    def __init__(self, df):
        self._df = df.fillna(' ')
        #
        # Get the distribution of the values so we know how unique the fields are
        #
        values = df.apply(lambda col: col.unique().size / df.shape[0])
        self._dinfo = dict(zip(df.columns.tolist(), values))
        self.init(sample=self._df)
    def init(self, **_args):
        _sample = _args['sample'] if 'sample' in _args else self._df
        #
        # 'columns' and 'cols' are accepted interchangeably (the public methods document 'cols')
        #
        _columns = _args['columns'] if 'columns' in _args else _args.get('cols', [])
        if _columns:
            self._compute = Compute(sample=_sample, columns=_columns)
        else:
            self._compute = Compute(sample=_sample)
        self._pcompute = Population()
    def explore(self, **args):
        """
        This function performs experimentation by generating random policies (combinations of attributes) and evaluating the risk associated with each of them.
        :pop|sample   dataframe with population or sample reference
        :field_count  number of fields to randomly select
        :strict       if set, field_count is exact; otherwise field counts range from 2 to field_count
        :num_runs     number of runs (5 by default)
        """
        pop = args['pop'] if 'pop' in args else None
        if 'pop_size' in args:
            pop_size = np.float64(args['pop_size'])
        else:
            pop_size = -1
        #
        # Policies will be generated with a number of runs
        #
        RUNS = args['num_runs'] if 'num_runs' in args else 5
        sample = args['sample'] if 'sample' in args else pd.DataFrame(self._df)
        k = sample.columns.size if 'field_count' not in args else int(args['field_count']) + 1
        #
        # Remove fields that are fully unique: they function as identifiers
        #
        if 'id' in args:
            id = args['id']
            columns = list(set(sample.columns.tolist()) - set([id]))
        else:
            columns = sample.columns.tolist()
        #
        # If columns are not specified we derive them from self._dinfo:
        # given the distribution, all fields with a uniqueness ratio < 1 are candidates
        #
        o = pd.DataFrame()
        columns = [key for key in self._dinfo if self._dinfo[key] < 1]
        _policy_count = 2 if 'policy_count' not in args else int(args['policy_count'])
        _index = 0
        for size in np.arange(2, len(columns)):
            policies = list(combinations(columns, size))
            policies = np.array(policies)[np.random.choice(len(policies), _policy_count)].tolist()
            for cols in policies:
                flag = 'Policy_' + str(_index)
                r = self.evaluate(sample=sample, cols=cols, flag=flag)
                r['attributes'] = ','.join(cols)
                #
                # 0/1 mask of the columns this policy uses
                #
                _mask = pd.DataFrame(1 * sample.columns.isin(cols)).T
                _mask.columns = sample.columns
                o = pd.concat([o, r.join(_mask)])
                _index += 1
        #
        # We rename flags to policies and number them adequately; the 'attributes' column summarizes the columns each policy uses
        #
        o.index = np.arange(o.shape[0]).astype(np.int64)
        o = o.rename(columns={'flag': 'policies'})
        return o
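    #
    # Sketch of explore() output: one row per random policy with the measures from
    # evaluate(), a 0/1 mask of the columns used, an 'attributes' summary of those
    # columns, and the flag renamed to 'policies' (e.g. Policy_0, Policy_1, ...)
    #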
    def evaluate(self, **_args):
        _measure = {}
        self.init(**_args)
        _names = ['marketer', 'journalist', 'prosecutor']  # + (['pitman'] if 'pop_size' in _args else [])
        for label in _names:
            _pointer = getattr(self, label)
            _measure[label] = _pointer(**_args)
        #
        # Carry the caller's flag through so explore() can rename it to 'policies'
        #
        _measure['flag'] = _args['flag'] if 'flag' in _args else 'UNFLAGGED'
        _measure['fields'] = self._compute.cache['count']['fields']
        _measure['groups'] = self._compute.cache['count']['groups']
        _measure['rows'] = self._compute.cache['count']['rows']
        if 'attr' in _args:
            _measure = dict(_args['attr'], **_measure)
        return pd.DataFrame([_measure])
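    #
    # Sketch of the one-row frame evaluate() returns (values illustrative only,
    # here for a 5-row sample grouped on 2 quasi-identifiers yielding 4 groups):
    #
    #        marketer  journalist  prosecutor       flag  fields  groups  rows
    #     0       0.8         0.6         1.0  UNFLAGGED       2       4     5
    #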
    def _evaluate(self, **args):
        """
        This function evaluates the risk associated with either a population or a sample dataset
        :sample sample dataset
        :pop    population dataset
        :cols   list of columns of interest or policies
        :flag   user-provided flag for the context of the evaluation
        """
        if 'sample' in args:
            sample = pd.DataFrame(args['sample'])
        else:
            sample = pd.DataFrame(self._df)
        if not args or 'cols' not in args:
            cols = [key for key in self._dinfo if self._dinfo[key] < 1]
        else:
            cols = args['cols']
        flag = 'UNFLAGGED' if 'flag' not in args else args['flag']
        #
        # @TODO: auto-select the columns, i.e. remove the columns that would have the effect of an identifier
        #
        r = {"flag": flag}
        handle_sample = Compute()
        xi = sample.groupby(cols, as_index=False).count().values
        handle_sample.set('groups', xi)
        if 'pop_size' in args:
            pop_size = np.float64(args['pop_size'])
        else:
            pop_size = -1
        #
        # The following conditional only determines the labels to be returned
        # @TODO: Find a more elegant way of doing this
        #
        if 'pop' in args:
            label_market = 'sample marketer'
            label_prosec = 'sample prosecutor'
            label_groupN = 'sample group count'
            label_unique = 'sample journalist'  # 'sample unique ratio'
        else:
            label_market = 'marketer'
            label_prosec = 'prosecutor'
            label_groupN = 'group count'
            label_unique = 'journalist'  # 'unique ratio'
        if pop_size > 0:
            handle_sample.set('pop_size', pop_size)
            r['pitman risk'] = handle_sample.pitman()
        r[label_market] = handle_sample.marketer()
        r[label_unique] = handle_sample.unique_ratio()
        r[label_prosec] = handle_sample.prosecutor()
        r[label_groupN] = len(xi)
        if 'pop' in args:
            xi = pd.DataFrame({"sample_group_size": sample.groupby(cols).size()}).reset_index()
            yi = pd.DataFrame({"population_group_size": args['pop'].groupby(cols).size()}).reset_index()
            merged_groups = pd.merge(xi, yi, on=cols, how='inner')
            handle_population = Population()
            handle_population.set('merged_groups', merged_groups)
            r['pop. marketer'] = handle_population.marketer()
            r['pitman risk'] = handle_population.pitman()
            r['pop. group size'] = np.unique(yi.population_group_size).size
        #
        # At this point we have the columns for the sample, the population, or both
        #
        r['field count'] = len(cols)
        return pd.DataFrame([r])
    def marketer(self, **_args):
        """
        This function delegates the computation of the marketer risk of a given dataset or sample
        :sample  optional sample dataset
        :columns optional columns of the dataset; if none are provided an inference is made using the non-unique columns
        """
        if 'pop' not in _args:
            if 'sample' in _args or 'columns' in _args:
                self.init(**_args)
            _handler = self._compute
        else:
            #
            # Computing population estimates for the population
            #
            self._pcompute.init(**_args)
            _handler = self._pcompute
        return _handler.marketer()
    def journalist(self, **_args):
        """
        This function delegates the computation of the journalist risk of a given dataset or sample
        :sample  optional sample dataset
        :columns optional columns of the dataset; if none are provided an inference is made using the non-unique columns
        """
        if 'pop' not in _args:
            if 'sample' in _args or 'columns' in _args:
                self.init(**_args)
            _handler = self._compute
        else:
            self._pcompute.init(**_args)
            _handler = self._pcompute
        return _handler.journalist()
    def prosecutor(self, **_args):
        """
        This function delegates the computation of the prosecutor risk of a given dataset or sample
        :sample  optional sample dataset
        :columns optional columns of the dataset; if none are provided an inference is made using the non-unique columns
        """
        if 'pop' not in _args:
            if 'sample' in _args or 'columns' in _args:
                self.init(**_args)
            _handler = self._compute
        else:
            self._pcompute.init(**_args)
            _handler = self._pcompute
        return _handler.prosecutor()
    def pitman(self, **_args):
        if 'population' not in _args:
            pop_size = int(_args['pop_size'])
            self._compute.set('pop_size', pop_size)
            _handler = self._compute
        else:
            self._pcompute.init(**_args)
            _handler = self._pcompute
        return _handler.pitman()
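    #
    # Sketch: pitman risk requires the (approximate) population size, e.g.
    #   mydataframe.risk.pitman(pop_size=1000)
    #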
class Risk:
    """
    This class is an abstraction of how we chose to structure the risk computation, i.e. in 2 subclasses:
        - Compute       computes risk associated with a sample dataset only
        - Population    computes risk associated with a population
    """
    def __init__(self):
        self.cache = {}
    def set(self, key, value):
        self.cache[key] = value
class Compute(Risk):
    """
    This class computes risk for the sample dataset: the marketer and prosecutor risks are computed by default.
    It can optionally add pitman risk if the population size is known.
    """
    def __init__(self, **_args):
        super().__init__()
        self._sample = _args['sample'] if 'sample' in _args else pd.DataFrame()
        self._columns = _args['columns'] if 'columns' in _args else None
        self.cache['count'] = {'groups': 0, 'fields': 0, 'rows': 0}
        if not self._columns:
            values = self._sample.apply(lambda col: col.unique().size / self._sample.shape[0])
            self._dinfo = dict(zip(self._sample.columns.tolist(), values))
            self._columns = [key for key in self._dinfo if self._dinfo[key] < 1]
        #
        # At this point we have all the columns that are valid candidates even if the user didn't specify them
        #
        self.cache['count']['fields'] = len(self._columns)
        if self._sample.shape[0] > 0 and self._columns:
            _groups = self._sample.groupby(self._columns, as_index=False).count().values
            self.set('groups', _groups)
    def set(self, key, value):
        #
        # 'groups' may arrive either as raw grouped rows (where the last column is
        # the group size) or as the group sizes themselves; normalize to a flat
        # array of sizes and keep the cached counts in sync
        #
        if key == 'groups':
            value = np.array([_g[-1] for _g in value]) if np.ndim(value) > 1 else np.array(value)
            self.cache['count']['groups'] = value.size
            self.cache['count']['rows'] = np.sum(value)
        super().set(key, value)
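    #
    # Sketch of the cache after construction, e.g. for a 5-row sample grouped on
    # 2 candidate columns yielding 4 groups:
    #   self.cache['count']  -> {'groups': 4, 'fields': 2, 'rows': 5}
    #   self.cache['groups'] -> array of group sizes, e.g. [2, 1, 1, 1]
    #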
    def marketer(self):
        """
        Computes the marketer risk for the sample dataset: number of groups / number of rows
        """
        group_count = self.cache['count']['groups']
        row_count = self.cache['count']['rows']
        return group_count / np.float64(row_count)
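    #
    # Worked example: a 5-row sample whose quasi-identifiers form 4 distinct
    # groups has a marketer risk of 4 / 5 = 0.8, the expected proportion of
    # records a marketer attack would correctly re-identify
    #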
    def prosecutor(self):
        """
        The prosecutor risk is 1 over the smallest group size;
        it indicates whether there is at least one record that is unique
        """
        groups = self.cache['groups']
        return 1 / np.float64(np.min(groups))
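    #
    # Worked example: if the smallest group holds a single record, the
    # prosecutor risk is 1 / 1 = 1.0, i.e. at least one record is unique
    #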
    def unique_ratio(self):
        """
        The ratio of records that are unique with respect to the selected columns
        """
        groups = self.cache['groups']
        row_count = self.cache['count']['rows']
        return np.sum(groups[groups == 1]) / np.float64(row_count)
    def journalist(self):
        return self.unique_ratio()
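    #
    # Worked example: 3 singleton groups in a 5-row sample give a unique ratio
    # (the journalist risk, as used here) of 3 / 5 = 0.6
    #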
    def pitman(self):
        """
        This function approximates the pitman de-identification risk based on pitman sampling;
        it requires 'pop_size' to have been set in the cache
        """
        groups = self.cache['groups']
        si = groups[groups == 1].size
        u = len(groups)
        alpha = np.divide(si, np.float64(u))
        row_count = self.cache['count']['rows']
        f = np.divide(row_count, np.float64(self.cache['pop_size']))
        return np.power(f, 1 - alpha)
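    #
    # Worked example: with 2 singletons among 4 groups, alpha = 2 / 4 = 0.5;
    # a 5-row sample drawn from a 1000-person population gives f = 5 / 1000
    # and an approximate risk of 0.005 ** 0.5 ~= 0.07
    #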
class Population(Compute):
    """
    This class computes risk for datasets that have population information associated with them;
    the computation includes pitman risk (it requires minimal information about the population).
    The cache entry :merged_groups {sample_group_size, population_group_size} is a merged dataset
    with the group sizes of both the population and the sample.
    """
    def __init__(self, **_args):
        super().__init__(**_args)
    def init(self, **_args):
        #
        # Refresh the sample and the candidate columns when they are provided
        #
        if 'sample' in _args or 'columns' in _args:
            super().__init__(**_args)
        #
        # The population dataframe is accepted under either key ('population' or 'pop')
        #
        _pop = _args['population'] if 'population' in _args else _args['pop']
        xi = pd.DataFrame({"sample_group_size": self._sample.groupby(self._columns).size()}).reset_index()
        yi = pd.DataFrame({"population_group_size": _pop.groupby(self._columns).size()}).reset_index()
        merged_groups = pd.merge(xi, yi, on=self._columns, how='inner')
        self.set('merged_groups', merged_groups)
    def set(self, key, value):
        super().set(key, value)
        if key == 'merged_groups':
            super().set('pop_size', np.float64(value.population_group_size.sum()))
            super().set('groups', value.sample_group_size)
    def marketer(self):
        """
        Computes the population-based marketer risk: the sum over sample groups of
        (sample group size / population group size), averaged over the sample rows.
        Requires cache['merged_groups'].
        """
        r = self.cache['merged_groups']
        sample_row_count = r.sample_group_size.sum()
        #
        # @TODO: make sure the above line is size (not sum)
        # sample_row_count = r.sample_group_size.size
        #
        return r.apply(lambda row: (row.sample_group_size / np.float64(row.population_group_size)) / np.float64(sample_row_count), axis=1).sum()
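#
# Minimal usage sketch. The dataframe below is illustrative only: any dataframe
# with a few quasi-identifier columns (here hypothetical age/gender/zip values)
# can be used the same way.
#
if __name__ == '__main__':
    _df = pd.DataFrame({'age': [30, 30, 41, 41, 52],
                        'gender': ['F', 'F', 'M', 'F', 'M'],
                        'zip': ['37212', '37212', '37203', '37212', '37203']})
    #
    # Evaluate a fixed set of columns, then explore random policies
    #
    print(_df.risk.evaluate(cols=['age', 'gender']))
    print(_df.risk.explore(policy_count=2))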