
Merge branch 'dev' of hiplab/privacykit into master

Steve Nyemba 2 years ago
parent
commit
c167058c23
3 changed files with 180 additions and 50 deletions
  1. README.md (+9 -7)
  2. privacykit/risk.py (+168 -40)
  3. setup.py (+3 -3)

+ 9 - 7
README.md

@@ -3,6 +3,8 @@
 This framework computes re-identification risk of a dataset by extending pandas. It works like a pandas **add-on**.
 The framework will compute the following risk measures: marketer, prosecutor, journalist and pitman risk. References for the risk measures can be found on [http://ehealthinformation.ca](http://www.ehealthinformation.ca/wp-content/uploads/2014/08/2009-De-identification-PA-whitepaper1.pdf) and [https://www.scb.se/contentassets](https://www.scb.se/contentassets/ff271eeeca694f47ae99b942de61df83/applying-pitmans-sampling-formula-to-microdata-disclosure-risk-assessment.pdf)
 
+
+
 There are two modes available :
     
 **explore:**
@@ -16,10 +18,10 @@ Here the assumption is that we are clear on the sets of attributes to be used an
 
 ### Four risk measures are computed :
 
-    - Marketer risk
-    - Prosecutor risk
-    - Journalist risk
-    - Pitman Risk
+- Marketer risk
+- Prosecutor risk
+- Journalist risk
+- Pitman Risk [Video tutorial, by Dr. Weiyi Xia](https://www.loom.com/share/173e109ecac64d37a54f09b103bc6681) and [Publication by Dr. Nobuaki Hoshino](https://www.scb.se/contentassets/ff271eeeca694f47ae99b942de61df83/applying-pitmans-sampling-formula-to-microdata-disclosure-risk-assessment.pdf)
 
 ### Usage:
 
@@ -27,19 +29,19 @@ Install this package using pip as follows :
 
 Stable :
     
-    pip install git+https://hiplab.mc.vanderbilt.edu/git/steve/deid-risk.git
+    pip install git+https://dev.the-phi.com/git/healthcareio/privacykit.git@release
     
 Latest Development (not fully tested):
     
-    pip install git+https://hiplab.mc.vanderbilt.edu/git/steve/deid-risk.git@risk
+    pip install git+https://dev.the-phi.com/git/healthcareio/privacykit.git@dev
     
 The framework will depend on pandas and numpy (for now). Below is a basic sample to get started quickly.
 
 
     import numpy as np
     import pandas as pd
-    import risk
+    import privacykit
 
     mydf = pd.DataFrame({"x":np.random.choice( np.random.randint(1,10),50),"y":np.random.choice( np.random.randint(1,10),50),"z":np.random.choice( np.random.randint(1,10),50),"r":np.random.choice( np.random.randint(1,10),50)  })
     print (mydf.risk.evaluate())
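As a quick sanity check of both modes, here is a minimal sketch (assuming the `release` build above is installed; the dataframe mirrors the README sample, and the accessor methods are the ones added in privacykit/risk.py below):

    import numpy as np
    import pandas as pd
    import privacykit

    # each column draws 50 values from a small pool, so rows repeat
    mydf = pd.DataFrame({k: np.random.choice(np.random.randint(1, 10), 50)
                         for k in ["x", "y", "z", "r"]})

    print(mydf.risk.evaluate())   # one row: marketer, journalist, prosecutor + group/row counts
    print(mydf.risk.marketer())   # a single measure, delegated to the Compute handler
    print(mydf.risk.explore())    # explore mode: one row of measures per sampled column policy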

+ 168 - 40
privacykit/risk.py

@@ -43,6 +43,10 @@ from datetime import datetime
 import sys
 
 from itertools import combinations
+# class Compute:
+#     pass
+# class Population(Compute):
+#     pass
 
 @pd.api.extensions.register_dataframe_accessor("risk")
 class deid :
@@ -57,6 +61,16 @@ class deid :
         #
         values = df.apply(lambda col: col.unique().size / df.shape[0])
         self._dinfo = dict(zip(df.columns.tolist(),values))
+        # self.sample = self._df
+        self.init(sample=self._df)
+    def init(self,**_args):
+        _sample = _args['sample'] if 'sample' in _args else self._df
+        _columns = [] if 'columns' not in _args else _args['columns']
+        if _columns :
+            self._compute = Compute(sample = _sample,columns=_columns)
+        else:
+            self._compute = Compute(sample=_sample)
+        self._pcompute = Population()
 
     def explore(self,**args):
         """
@@ -107,40 +121,45 @@ class deid :
         for size in np.arange(2,len(columns)) :
             p = list(combinations(columns,size))
             p = (np.array(p)[ np.random.choice( len(p), _policy_count)].tolist())
-            flag = 'Policy_'+str(_index)
-            _index += 1
+            
+            
             for cols in p :
+                flag = 'Policy_'+str(_index)
                 r = self.evaluate(sample=sample,cols=cols,flag = flag)
                 p =  pd.DataFrame(1*sample.columns.isin(cols)).T
                 p.columns = sample.columns
                 o = pd.concat([o,r.join(p)])
-        
-           
-        # for i in np.arange(RUNS):
-        #     if 'strict' not in args or ('strict' in args and args['strict'] is False):
-        #         n = np.random.randint(2,k)
-        #     else:
-        #         n = args['field_count']
-        #     cols = np.random.choice(columns,n,replace=False).tolist()            
-        #     params = {'sample':sample,'cols':cols}
-        #     if pop is not None :
-        #         params['pop'] = pop
-        #     if pop_size > 0  :
-        #         params['pop_size'] = pop_size
 
 
-        #     #
-        #     # let's put the policy in place
-        #     p =  pd.DataFrame(1*sample.columns.isin(cols)).T
-        #     p.columns = sample.columns
-        #     # o = o.append(r.join(p))
-        #     o = pd.concat([o,r.join(p)])
+                o['attributes'] = ','.join(cols)
+                # o['attr'] = ','.join(r.apply())
+                _index += 1
+        #
+        # We rename flags to policies and number them adequately; we also add a column summarizing the attributes (attr)
+        #
+
+
 
             
         o.index = np.arange(o.shape[0]).astype(np.int64)
-
+        o = o.rename(columns={'flag':'policies'})
         return o
-    def evaluate(self, **args):
+    def evaluate(self,**_args):
+        _measure = {}
+
+        self.init(**_args)
+        _names = ['marketer','journalist','prosecutor'] #+ (['pitman'] if 'pop_size' in _args else [])
+        for label in _names :
+            _pointer = getattr(self,label)
+            _measure[label] = _pointer(**_args)
+        
+        _measure['fields'] = self._compute.cache['count']['fields']
+        _measure['groups'] = self._compute.cache['count']['groups']
+        _measure['rows'] = self._compute.cache['count']['rows']
+        if 'attr' in _args :
+            _measure = dict(_args['attr'],**_measure)
+
+        return pd.DataFrame([_measure])
+    def _evaluate(self, **args):
         """
         """
         This function has the ability to evaluate risk associated with either a population or a sample dataset
         This function has the ability to evaluate risk associated with either a population or a sample dataset
         :sample sample dataset
         :sample sample dataset
@@ -170,7 +189,7 @@ class deid :
         r = {"flag":flag}
         r = {"flag":flag}
         # if sample :
         # if sample :
         
         
-        handle_sample   = Sample()        
+        handle_sample   = Compute()        
         xi              = sample.groupby(cols,as_index=False).count().values
         
         handle_sample.set('groups',xi)
@@ -226,7 +245,83 @@ class deid :
         #
         r['field count'] = len(cols)
         return pd.DataFrame([r])
+    
+    def marketer(self,**_args):
+        """
+        This function delegates the calls to compute marketer risk of a given dataset or sample
+        :sample     optional sample dataset
+        :columns    optional columns of the dataset; if none is provided, an inference will be made using non-unique columns
+        """
+        if 'pop' not in _args :
+            if not 'sample' in _args and not 'columns' in _args :
+                # _handler =  self._compute
+                pass
+            else:
+                
+                self.init(**_args)
+                # _handler = Compute(**_args)
+            _handler =  self._compute
 
+        else:
+            #
+            # Computing population estimates for the population
+            self._pcompute.init(**_args)
+            _handler = self._pcompute
+        return _handler.marketer()
+    def journalist(self,**_args):
+        """
+        This function delegates the calls to compute journalist risk of a given dataset or sample
+        :sample     optional sample dataset
+        :columns    optional columns of the dataset; if none is provided, an inference will be made using non-unique columns
+        """
+        if 'pop' not in _args :
+            if not 'sample' in _args and not 'columns' in _args :
+                _handler =  self._compute
+            else:
+                self.init(**_args)
+                # _handler = Compute(**_args)
+            _handler = self._compute
+                # return _compute.journalist()
+        else:
+            self._pcompute.init(**_args)
+            _handler = self._pcompute
+        return _handler.journalist()
+    def prosecutor(self,**_args):
+        """
+        This function delegates the calls to compute prosecutor risk of a given dataset or sample
+        :sample     optional sample dataset
+        :columns    optional columns of the dataset; if none is provided, an inference will be made using non-unique columns
+        """
+        if 'pop' not in _args :
+            if not 'sample' in _args and not 'columns' in _args :
+                # _handler =  self._compute
+                pass
+            else:
+                self.init(**_args)
+                # _handler = Compute(**_args)
+            _handler =  self._compute
+                
+        else:
+            self._pcompute.init(**_args)
+            _handler = self._pcompute
+        return _handler.prosecutor()
+    def pitman(self,**_args):
+        
+        if 'population' not in _args :
+            pop_size = int(_args['pop_size'])
+            self._compute.set('pop_size',pop_size)
+            _handler = self._compute
+        else:
+            self._pcompute.init(**_args)
+            _handler = self._pcompute
+        
+        return _handler.pitman()
+        
+        # xi = pd.DataFrame({"sample_group_size":sample.groupby(cols,as_index=False).count()}).reset_index()
+        # yi = pd.DataFrame({"population_group_size":args['pop'].groupby(cols,as_index=False).size()}).reset_index()
+        # merged_groups = pd.merge(xi,yi,on=cols,how='inner')
+        # handle_population= Population()            
+        # handle_population.set('merged_groups',merged_groups)
 class Risk :
     """
     This class is an abstraction of how we chose to structure risk computation, i.e. in 2 subclasses:
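Of the four delegate methods added above, only `pitman` needs extra input. A hedged usage sketch (the `pop_size` keyword follows the added `deid.pitman`; the population figure here is made up for illustration):

    import numpy as np
    import pandas as pd
    import privacykit

    mydf = pd.DataFrame({"x": np.random.choice(np.random.randint(1, 10), 50),
                         "y": np.random.choice(np.random.randint(1, 10), 50)})
    # pitman additionally needs the (approximate) size of the population the sample came from
    print(mydf.risk.pitman(pop_size=100000))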
@@ -240,24 +335,44 @@ class Risk :
             self.cache[id] = {}
         self.cache[key] = value
 
-class Sample(Risk):
+class Compute(Risk):
     """
     This class will compute risk for the sample dataset: the marketer and prosecutor risk are computed by default.
     This class can optionally add pitman risk if the population size is known.
     """
-    def __init__(self):
-        Risk.__init__(self)
+    def __init__(self,**_args):
+        super().__init__()
+        self._sample = _args['sample'] if 'sample' in _args else pd.DataFrame()
+        self._columns= _args['columns'] if 'columns' in _args else None
+        self.cache['count']  = {'groups':0,'fields':0,'rows':0}
+        if not self._columns :
+            values = self._sample.apply(lambda col: col.unique().size / self._sample.shape[0])            
+            self._dinfo = dict(zip(self._sample.columns.tolist(),values))
+            self._columns = [key for key in self._dinfo if self._dinfo[key] < 1]
+        #
+        # At this point we have all the columns that are valid candidates even if the user didn't specify them
+        self.cache['count']['fields'] = len(self._columns)
+        if self._sample.shape[0] > 0 and self._columns:
+            _groups = self._sample.groupby(self._columns,as_index=False).count().values
+            self.set('groups',_groups)
+    
+            self.cache['count']['groups']  = len(_groups)
+            self.cache['count']['rows']    = np.sum([_g[-1] for _g in _groups])
+            
     def marketer(self):
         """
         computing marketer risk for sample dataset
         """
         
-            
+        
         groups = self.cache['groups']
         # group_count = groups.size
         # row_count   = groups.sum()
-        group_count = len(groups)
-        row_count = np.sum([_g[-1] for _g in groups])
+        # group_count = len(groups)
+        group_count = self.cache['count']['groups']
+        # row_count = np.sum([_g[-1] for _g in groups])
+        row_count = self.cache['count']['rows']
         return group_count / np.float64(row_count)
 
     def prosecutor(self):
@@ -272,40 +387,52 @@ class Sample(Risk):
     def unique_ratio(self):
         groups = self.cache['groups']
         # row_count = groups.sum()
-        row_count = np.sum([_g[-1] for _g in groups])
+        # row_count = np.sum([_g[-1] for _g in groups])
+        row_count = self.cache['count']['rows']
         # return groups[groups == 1].sum() / np.float64(row_count)
         values = [_g[-1] for _g in groups if _g[-1] == 1]
         
         return np.sum(values) / np.float64(row_count)
-
+    def journalist(self):
+        return self.unique_ratio()
     def pitman(self):
         """
         This function will approximate pitman de-identification risk based on pitman sampling
         """
+        
         groups = self.cache['groups']
         si = groups[groups == 1].size
         # u = groups.size
         u = len(groups)
         alpha = np.divide(si , np.float64(u) )
-        row_count = np.sum([_g[-1] for _g in groups])
+        # row_count = np.sum([_g[-1] for _g in groups])
+        row_count = self.cache['count']['rows']
+
         # f = np.divide(groups.sum(), np.float64(self.cache['pop_size']))
         f = np.divide(row_count, np.float64(self.cache['pop_size']))
         return np.power(f,1-alpha)
 
-class Population(Sample):
+class Population(Compute):
     """
     This class will compute risk for datasets that have population information or datasets associated with them.
     This computation includes pitman risk (it requires minimal information about population)
     """
-    def __init__(self,**args):
-        Sample.__init__(self)
+    def __init__(self,**_args):
+        super().__init__(**_args)
+
+    def init(self,**_args):
+        xi = pd.DataFrame({"sample_group_size":self._sample.groupby(self._columns,as_index=False).count()}).reset_index()
+        yi = pd.DataFrame({"population_group_size":_args['population'].groupby(self._columns,as_index=False).size()}).reset_index()
+        merged_groups = pd.merge(xi,yi,on=self._columns,how='inner')                   
+        self.set('merged_groups',merged_groups)
 
     def set(self,key,value):
-        Sample.set(self,key,value)
+        super().set(key,value)
         if key == 'merged_groups' :
                
-            Sample.set(self,'pop_size',np.float64(value.population_group_size.sum()) )
-            Sample.set(self,'groups',value.sample_group_size)
+            super().set('pop_size',np.float64(value.population_group_size.sum()) )
+            super().set('groups',value.sample_group_size)
     """
     """
     This class will measure risk and account for the existance of a population
     This class will measure risk and account for the existance of a population
     :merged_groups {sample_group_size, population_group_size} is a merged dataset with group sizes of both population and sample
     :merged_groups {sample_group_size, population_group_size} is a merged dataset with group sizes of both population and sample
@@ -314,6 +441,7 @@ class Population(Sample):
         """
         """
         This function requires
         This function requires
         """
         """
+        
         r = self.cache['merged_groups']
         sample_row_count = r.sample_group_size.sum() 
         #

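To make the new control flow easier to follow: the accessor methods (marketer, journalist, prosecutor, pitman) only pick a handler, Compute for a sample or Population when population data is supplied, and the handler derives every measure from one cached groupby. Below is a condensed, self-contained sketch of that pattern; it is an illustration under those assumptions, not the shipped class:

    import numpy as np
    import pandas as pd

    class ComputeSketch:
        """Illustrative only: cache group sizes once, derive each measure from the cache."""
        def __init__(self, sample: pd.DataFrame, columns=None):
            if columns is None:
                # candidate quasi-identifiers: columns that are not unique per row
                ratios  = sample.apply(lambda col: col.unique().size / sample.shape[0])
                columns = [c for c in sample.columns if ratios[c] < 1]
            sizes = sample.groupby(columns).size()      # one entry per equivalence group
            self.cache = {'groups': len(sizes),
                          'rows': int(sizes.sum()),
                          'unique_rows': int((sizes == 1).sum())}

        def marketer(self):      # number of groups over number of rows
            return self.cache['groups'] / np.float64(self.cache['rows'])

        def journalist(self):    # share of rows that sit alone in their group
            return self.cache['unique_rows'] / np.float64(self.cache['rows'])

Caching the counts once is what lets the new evaluate() report fields, groups, and rows alongside the measures without re-grouping for every call.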
+ 3 - 3
setup.py

@@ -4,11 +4,11 @@ This is a build file for the
 from setuptools import setup, find_packages
  
 setup(
-    name = "risk",
-    version = "0.8.1",
+    name = "privacykit",
+    version = "0.9.0",
     author = "Healthcare/IO - The Phi Technology LLC & Health Information Privacy Lab",
     author_email = "info@the-phi.com",
     license = "MIT",
-    packages=['risk'],
+    packages=['privacykit'],
     install_requires = ['numpy','pandas']
     )
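Because both the distribution and the package were renamed, a quick way to confirm an environment picked up the new name (the version shown assumes the 0.9.0 bump in this commit):

    from importlib.metadata import version
    import privacykit             # replaces the old `import risk`
    print(version('privacykit'))  # expected: 0.9.0 per this commit's setup.py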