pandas_risk.py 2.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. """
  2. Health Information Privacy Lab
  3. Steve L. Nyemba & Brad. Malin
  4. This is an extension to the pandas data-frame that will perform a risk assessment on a variety of attributes
  5. This implementation puts the responsibility on the user of the framework to join datasets and load the final results into a pandas data-frame.
  6. The code will randomly select fields and compute the risk (marketer and prosecutor) and perform a given number of runs.
  7. Usage:
  8. from pandas_risk import *
  9. mydataframe = pd.read_csv('/myfile.csv')
  10. risk = mydataframe.deid.risk(id=<name of patient field>,num_runs=<number of runs>)
  11. @TODO:
  12. - Provide a selected number of fields and risk will be computed for those fields.
  13. - include journalist risk
  14. """
  15. import pandas as pd
  16. import numpy as np
  17. @pd.api.extensions.register_dataframe_accessor("deid")
  18. class deid :
  19. """
  20. This class is a deidentification class that will compute risk (marketer, prosecutor) given a pandas dataframe
  21. """
  22. def __init__(self,df):
  23. self._df = df
  24. def risk(self,**args):
  25. """
  26. @param id name of patient field
  27. @params num_runs number of runs (default will be 100)
  28. """
  29. id = args['id']
  30. num_runs = args['num_runs'] if 'num_runs' in args else 100
  31. r = pd.DataFrame()
  32. columns = list(set(self._df.columns) - set([id]))
  33. k = len(columns)
  34. for i in range(0,num_runs) :
  35. #
  36. # let's chose a random number of columns and compute marketer and prosecutor risk
  37. # Once the fields are selected we run a groupby clause
  38. #
  39. n = np.random.randint(2,k) #-- number of random fields we are picking
  40. ii = np.random.choice(k,n,replace=False)
  41. cols = np.array(columns)[ii].tolist()
  42. x_ = self._df.groupby(cols).count()[id].values
  43. r = r.append(
  44. pd.DataFrame(
  45. [
  46. {
  47. "selected":n,
  48. "marketer": x_.size / np.float64(np.sum(x_)),
  49. "prosecutor":1 / np.float64(np.min(x_))
  50. }
  51. ]
  52. )
  53. )
  54. g_size = x_.size
  55. n_ids = np.float64(np.sum(x_))
  56. return r
  57. import pandas as pd
  58. import numpy as np
  59. from io import StringIO
  60. csv = """
  61. id,sex,age,profession,drug_test
  62. 1,M,37,doctor,-
  63. 2,F,28,doctor,+
  64. 3,M,37,doctor,-
  65. 4,M,28,doctor,+
  66. 5,M,28,doctor,-
  67. 6,M,37,doctor,-
  68. """
  69. f = StringIO()
  70. f.write(unicode(csv))
  71. f.seek(0)
  72. df = pd.read_csv(f)
  73. print df.deid.risk(id='id',num_runs=1)