pandas_risk.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
  1. """
  2. Health Information Privacy Lab
  3. Steve L. Nyemba & Brad. Malin
  4. This is an extension to the pandas data-frame that will perform a risk assessment on a variety of attributes
  5. This implementation puts the responsibility on the user of the framework to join datasets and load the final results into a pandas data-frame.
  6. The code will randomly select fields and compute the risk (marketer and prosecutor) and perform a given number of runs.
  7. Usage:
  8. from pandas_risk import *
  9. mydataframe = pd.read_csv('/myfile.csv')
  10. risk = mydataframe.deid.risk(id=<name of patient field>,num_runs=<number of runs>)
  11. @TODO:
  12. - Provide a selected number of fields and risk will be computed for those fields.
  13. - include journalist risk
  14. """
  15. import pandas as pd
  16. import numpy as np
  17. @pd.api.extensions.register_dataframe_accessor("deid")
  18. class deid :
  19. """
  20. This class is a deidentification class that will compute risk (marketer, prosecutor) given a pandas dataframe
  21. """
  22. def __init__(self,df):
  23. self._df = df
  24. def risk(self,**args):
  25. """
  26. @param id name of patient field
  27. @params num_runs number of runs (default will be 100)
  28. @params quasi_id list of quasi identifiers to be used (this will only perform a single run)
  29. """
  30. id = args['id']
  31. if 'quasi_id' in args :
  32. num_runs = 1
  33. columns = list(set(args['quasi_id'])- set(id) )
  34. else :
  35. num_runs = args['num_runs'] if 'num_runs' in args else 100
  36. columns = list(set(self._df.columns) - set([id]))
  37. r = pd.DataFrame()
  38. k = len(columns)
  39. N = self._df.shape[0]
  40. tmp = self._df.fillna(' ')
  41. np.random.seed(1)
  42. for i in range(0,num_runs) :
  43. #
  44. # let's chose a random number of columns and compute marketer and prosecutor risk
  45. # Once the fields are selected we run a groupby clause
  46. #
  47. if 'quasi_id' not in args :
  48. if 'field_count' in args :
  49. #
  50. # We chose to limit how many fields we passin
  51. n = np.random.randint(2,int(args['field_count'])) #-- number of random fields we are picking
  52. else :
  53. n = np.random.randint(2,k) #-- number of random fields we are picking
  54. ii = np.random.choice(k,n,replace=False)
  55. cols = np.array(columns)[ii].tolist()
  56. policy = np.zeros(k)
  57. policy [ii] = 1
  58. policy = pd.DataFrame(policy).T
  59. else:
  60. cols = columns
  61. policy = np.ones(k)
  62. policy = pd.DataFrame(policy).T
  63. n = len(cols)
  64. policy.columns = columns
  65. N = tmp.shape[0]
  66. x_ = tmp.groupby(cols).size().values
  67. # print [id,i,n,k,self._df.groupby(cols).count()]
  68. r = r.append(
  69. pd.DataFrame(
  70. [
  71. {
  72. "group_count":x_.size,
  73. "patient_count":N,
  74. "field_count":n,
  75. "marketer": x_.size / np.float64(np.sum(x_)),
  76. "prosecutor":1 / np.float64(np.min(x_))
  77. }
  78. ]
  79. ).join(policy)
  80. )
  81. # g_size = x_.size
  82. # n_ids = np.float64(np.sum(x_))
  83. # sql = """
  84. # SELECT COUNT(g_size) as group_count, :patient_count as patient_count,SUM(g_size) as rec_count, COUNT(g_size)/SUM(g_size) as marketer, 1/ MIN(g_size) as prosecutor, :n as field_count
  85. # FROM (
  86. # SELECT COUNT(*) as g_size,:key,:fields
  87. # FROM :full_name
  88. # GROUP BY :fields
  89. # """.replace(":n",str(n)).replace(":fields",",".join(cols)).replace(":key",id).replace(":patient_count",str(N))
  90. # r.append(self._df.query(sql.replace("\n"," ").replace("\r"," ") ))
  91. return r
  92. # df = pd.read_gbq("select * from deid_risk.risk_30k",private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json')
  93. # r = df.deid.risk(id='person_id',num_runs=200)
  94. # print(r[['field_count','patient_count','marketer','prosecutor']])