浏览代码

modular features, documentation new version

Steve Nyemba 2 年之前
父节点
当前提交
7f8754b5f1
共有 3 个文件被更改,包括 166 次插入23 次删除
  1. 6 4
      README.md
  2. 159 18
      privacykit/risk.py
  3. 1 1
      setup.py

+ 6 - 4
README.md

@@ -3,6 +3,8 @@
 This framework computes re-identification risk of a dataset by extending pandas. It works like a pandas **add-on** 
 This framework computes re-identification risk of a dataset by extending pandas. It works like a pandas **add-on** 
 The framework will compute the following risk measures: marketer, prosecutor, journalist and Pitman risk. References for the risk measures can be found at [http://ehealthinformation.ca](http://www.ehealthinformation.ca/wp-content/uploads/2014/08/2009-De-identification-PA-whitepaper1.pdf) and [https://www.scb.se/contentassets](https://www.scb.se/contentassets/ff271eeeca694f47ae99b942de61df83/applying-pitmans-sampling-formula-to-microdata-disclosure-risk-assessment.pdf).
 The framework will compute the following risk measures: marketer, prosecutor, journalist and Pitman risk. References for the risk measures can be found at [http://ehealthinformation.ca](http://www.ehealthinformation.ca/wp-content/uploads/2014/08/2009-De-identification-PA-whitepaper1.pdf) and [https://www.scb.se/contentassets](https://www.scb.se/contentassets/ff271eeeca694f47ae99b942de61df83/applying-pitmans-sampling-formula-to-microdata-disclosure-risk-assessment.pdf).
 
 
+
+
 There are two modes available :
 There are two modes available :
     
     
 **explore:**
 **explore:**
@@ -16,10 +18,10 @@ Here the assumption is that we are clear on the sets of attributes to be used an
 
 
 ### Four risk measures are computed :
 ### Four risk measures are computed :
 
 
-    - Marketer risk
-    - Prosecutor risk
-    - Journalist risk
-    - Pitman Risk
+- Marketer risk
+- Prosecutor risk
+- Journalist risk
+- Pitman Risk: [Video tutorial by Dr. Weiyi Xia](https://www.loom.com/share/173e109ecac64d37a54f09b103bc6681) and [Publication by Dr. Nobuaki Hoshino](https://www.scb.se/contentassets/ff271eeeca694f47ae99b942de61df83/applying-pitmans-sampling-formula-to-microdata-disclosure-risk-assessment.pdf)
 
 
 ### Usage:
 ### Usage:
 
 

+ 159 - 18
privacykit/risk.py

@@ -43,6 +43,10 @@ from datetime import datetime
 import sys
 import sys
 
 
 from itertools import combinations
 from itertools import combinations
+# class Compute:
+#     pass
+# class Population(Compute):
+#     pass
 
 
 @pd.api.extensions.register_dataframe_accessor("risk")
 @pd.api.extensions.register_dataframe_accessor("risk")
 class deid :
 class deid :
@@ -57,6 +61,16 @@ class deid :
         #
         #
         values = df.apply(lambda col: col.unique().size / df.shape[0])
         values = df.apply(lambda col: col.unique().size / df.shape[0])
         self._dinfo = dict(zip(df.columns.tolist(),values))
         self._dinfo = dict(zip(df.columns.tolist(),values))
+        # self.sample = self._df
+        self.init(sample=self._df)
+    def init(self,**_args):
+        _sample = _args['sample'] if 'sample' in _args else self._df
+        _columns = [] if 'columns' not in _args else _args['columns']
+        if _columns :
+            self._compute = Compute(sample = _sample,columns=_columns)
+        else:
+            self._comput = Compute(sample=_sample)
+        self._pcompute= Population()  
 
 
     def explore(self,**args):
     def explore(self,**args):
         """
         """
@@ -115,7 +129,9 @@ class deid :
                 p =  pd.DataFrame(1*sample.columns.isin(cols)).T
                 p =  pd.DataFrame(1*sample.columns.isin(cols)).T
                 p.columns = sample.columns
                 p.columns = sample.columns
                 o = pd.concat([o,r.join(p)])
                 o = pd.concat([o,r.join(p)])
-                o['attr'] = ','.join(cols)
+
+                o['attributes'] = ','.join(cols)
+                # o['attr'] = ','.join(r.apply())
                 _index += 1
                 _index += 1
         #
         #
         # We rename flags to policies and adequately number them, we also have a column to summarize the attributes attr
         # We rename flags to policies and adequately number them, we also have a column to summarize the attributes attr
@@ -127,7 +143,23 @@ class deid :
         o.index = np.arange(o.shape[0]).astype(np.int64)
         o.index = np.arange(o.shape[0]).astype(np.int64)
         o = o.rename(columns={'flag':'policies'})
         o = o.rename(columns={'flag':'policies'})
         return o
         return o
-    def evaluate(self, **args):
+    def evaluate(self,**_args):
+        _measure = {}
+
+        self.init(**_args)
+        _names = ['marketer','journalist','prosecutor'] #+ (['pitman'] if 'pop_size' in _args else [])
+        for label in _names :
+            _pointer = getattr(self,label)
+            _measure[label] = _pointer(**_args)
+        
+        _measure['fields'] = self._compute.cache['count']['fields']
+        _measure['groups'] = self._compute.cache['count']['groups']
+        _measure['rows'] = self._compute.cache['count']['rows']
+        if 'attr' in _args :
+            _measure = dict(_args['attr'],**_measure)
+
+        return pd.DataFrame([_measure])
+    def _evaluate(self, **args):
         """
         """
         This function has the ability to evaluate risk associated with either a population or a sample dataset
         This function has the ability to evaluate risk associated with either a population or a sample dataset
         :sample sample dataset
         :sample sample dataset
@@ -157,7 +189,7 @@ class deid :
         r = {"flag":flag}
         r = {"flag":flag}
         # if sample :
         # if sample :
         
         
-        handle_sample   = Sample()        
+        handle_sample   = Compute()        
         xi              = sample.groupby(cols,as_index=False).count().values
         xi              = sample.groupby(cols,as_index=False).count().values
         
         
         handle_sample.set('groups',xi)
         handle_sample.set('groups',xi)
@@ -213,7 +245,83 @@ class deid :
         #
         #
         r['field count'] = len(cols)
         r['field count'] = len(cols)
         return pd.DataFrame([r])
         return pd.DataFrame([r])
+    
+    def marketer(self,**_args):
+        """
+        This function delegates the calls to compute marketer risk of a given dataset or sample
+        :sample     optional sample dataset
+        :columns    optional columns of the dataset; if none is provided, an inference will be made using non-unique columns
+        """
+        if 'pop' not in _args :
+            if not 'sample' in _args and not 'columns' in _args :
+                # _handler =  self._compute
+                pass
+            else:
+                
+                self.init(**_args)
+                # _handler = Compute(**_args)
+            _handler =  self._compute
 
 
+        else:
+            #
+            # Computing population estimates for the population
+            self._pcompute.init(**_args)
+            handler = self._pcompute
+        return _handler.marketer()
+    def journalist(self,**_args):
+        """
+        This function delegates the calls to compute journalist risk of a given dataset or sample
+        :sample     optional sample dataset
+        :columns    optional columns of the dataset; if none is provided, an inference will be made using non-unique columns
+        """
+        if 'pop' not in _args :
+            if not 'sample' in _args and not 'columns' in _args :
+                _handler =  self._compute
+            else:
+                self.init(**_args)
+                # _handler = Compute(**_args)
+            _handler = self._compute
+                # return _compute.journalist()
+        else:
+            self._pcompute.init(**_args)
+            _handler = self._pcompute
+        return _handler.journalist()
+    def prosecutor(self,**_args):
+        """
+        This function delegates the calls to compute prosecutor risk of a given dataset or sample
+        :sample     optional sample dataset
+        :columns    optional columns of the dataset; if none is provided, an inference will be made using non-unique columns
+        """
+        if 'pop' not in _args :
+            if not 'sample' in _args and not 'columns' in _args :
+                # _handler =  self._compute
+                pass
+            else:
+                self.init(**_args)
+                # _handler = Compute(**_args)
+            _handler =  self._compute
+                
+        else:
+            self._pcompute.init(**_args)
+            _handler = self._pcompute
+        return _handler.prosecutor()
+    def pitman(self,**_args):
+        
+        if 'population' not in _args :
+            pop_size = int(_args['pop_size'])
+            self._compute.set('pop_size',pop_size)
+            _handler =  self._compute;
+        else:
+            self._pcompute.init(**_args)
+            _handler = self._pcompute
+        
+        return _handler.pitman()
+        
+        # xi = pd.DataFrame({"sample_group_size":sample.groupby(cols,as_index=False).count()}).reset_index()
+        # yi = pd.DataFrame({"population_group_size":args['pop'].groupby(cols,as_index=False).size()}).reset_index()
+        # merged_groups = pd.merge(xi,yi,on=cols,how='inner')
+        # handle_population= Population()            
+        # handle_population.set('merged_groups',merged_groups)
 class Risk :
 class Risk :
     """
     """
     This class is an abstraction of how we chose to structure risk computation i.e in 2 sub classes:
     This class is an abstraction of how we chose to structure risk computation i.e in 2 sub classes:
@@ -227,24 +335,44 @@ class Risk :
             self.cache[id] = {}
             self.cache[id] = {}
         self.cache[key] = value
         self.cache[key] = value
 
 
-class Sample(Risk):
+class Compute(Risk):
     """
     """
     This class will compute risk for the sample dataset: the marketer and prosecutor risk are computed by default.
     This class will compute risk for the sample dataset: the marketer and prosecutor risk are computed by default.
     This class can optionally add pitman risk if the population size is known.
     This class can optionally add pitman risk if the population size is known.
     """
     """
-    def __init__(self):
-        Risk.__init__(self)
+    def __init__(self,**_args):
+        super().__init__()
+        self._sample = _args['sample'] if 'sample' in _args else pd.DataFrame()
+        self._columns= _args['columns'] if 'columns' in _args else None
+        self.cache['count']  = {'groups':0,'fields':0,'rows':0}
+        if not self._columns :
+            values = self._sample.apply(lambda col: col.unique().size / self._sample.shape[0])            
+            self._dinfo = dict(zip(self._sample.columns.tolist(),values))
+            self._columns = [key for key in self._dinfo if self._dinfo[key] < 1]
+        #
+        # At this point we have all the columns that are valid candidates even if the user didn't specify them
+        self.cache['count']['fields'] = len(self._columns)
+        if self._sample.shape[0] > 0 and self._columns:
+            _sample = _args ['sample']
+            _groups = self._sample.groupby(self._columns,as_index=False).count().values
+            self.set('groups',_groups)
+    
+            self.cache['count']['groups']  = len(_groups)
+            self.cache['count']['rows']    = np.sum([_g[-1] for _g in _groups])
+            
     def marketer(self):
     def marketer(self):
         """
         """
         computing marketer risk for sample dataset
         computing marketer risk for sample dataset
         """
         """
         
         
-            
+        
         groups = self.cache['groups']
         groups = self.cache['groups']
         # group_count = groups.size
         # group_count = groups.size
         # row_count   = groups.sum()
         # row_count   = groups.sum()
-        group_count = len(groups)
-        row_count = np.sum([_g[-1] for _g in groups])
+        # group_count = len(groups)
+        group_count = self.cache['count']['groups']
+        # row_count = np.sum([_g[-1] for _g in groups])
+        row_count = self.cache['count']['rows']
         return group_count / np.float64(row_count)
         return group_count / np.float64(row_count)
 
 
     def prosecutor(self):
     def prosecutor(self):
@@ -259,40 +387,52 @@ class Sample(Risk):
     def unique_ratio(self):
     def unique_ratio(self):
         groups = self.cache['groups']        
         groups = self.cache['groups']        
         # row_count = groups.sum()
         # row_count = groups.sum()
-        row_count = np.sum([_g[-1] for _g in groups])
+        # row_count = np.sum([_g[-1] for _g in groups])
+        row_count = self.cache['count']['rows']
         # return groups[groups == 1].sum() / np.float64(row_count)
         # return groups[groups == 1].sum() / np.float64(row_count)
         values = [_g[-1] for _g in groups if _g[-1] == 1]
         values = [_g[-1] for _g in groups if _g[-1] == 1]
         
         
         return np.sum(values) / np.float64(row_count)
         return np.sum(values) / np.float64(row_count)
-
+    def journalist(self):
+        return self.unique_ratio()
     def pitman(self):
     def pitman(self):
         """
         """
         This function will approximate pitman de-identification risk based on pitman sampling
         This function will approximate pitman de-identification risk based on pitman sampling
         """
         """
+        
         groups = self.cache['groups']
         groups = self.cache['groups']
+        print (self.cache['pop_size'])
         si = groups[groups == 1].size
         si = groups[groups == 1].size
         # u = groups.size
         # u = groups.size
         u = len(groups)
         u = len(groups)
         alpha = np.divide(si , np.float64(u) )
         alpha = np.divide(si , np.float64(u) )
-        row_count = np.sum([_g[-1] for _g in groups])
+        # row_count = np.sum([_g[-1] for _g in groups])
+        row_count = self.cache['count']['rows']
+
         # f = np.divide(groups.sum(), np.float64(self.cache['pop_size']))
         # f = np.divide(groups.sum(), np.float64(self.cache['pop_size']))
         f = np.divide(row_count, np.float64(self.cache['pop_size']))
         f = np.divide(row_count, np.float64(self.cache['pop_size']))
         return np.power(f,1-alpha)
         return np.power(f,1-alpha)
 
 
-class Population(Sample):
+class Population(Compute):
     """
     """
     This class will compute risk for datasets that have population information or datasets associated with them.
     This class will compute risk for datasets that have population information or datasets associated with them.
     This computation includes pitman risk (it requires minimal information about population)
     This computation includes pitman risk (it requires minimal information about population)
     """
     """
-    def __init__(self,**args):
-        Sample.__init__(self)
+    def __init__(self,**_args):
+        super().__init__(**_args)
+
+    def init(self,**_args):
+        xi = pd.DataFrame({"sample_group_size":self._sample.groupby(self._columns,as_index=False).count()}).reset_index()
+        yi = pd.DataFrame({"population_group_size":_args['population'].groupby(self._columns,as_index=False).size()}).reset_index()
+        merged_groups = pd.merge(xi,yi,on=self._columns,how='inner')                   
+        self.set('merged_groups',merged_groups)
 
 
     def set(self,key,value):
     def set(self,key,value):
-        Sample.set(self,key,value)
+        self.set(self,key,value)
         if key == 'merged_groups' :  
         if key == 'merged_groups' :  
                
                
-            Sample.set(self,'pop_size',np.float64(value.population_group_size.sum()) )
-            Sample.set(self,'groups',value.sample_group_size)
+            self.set(self,'pop_size',np.float64(value.population_group_size.sum()) )
+            self.set(self,'groups',value.sample_group_size)
     """
     """
     This class will measure risk and account for the existence of a population
     This class will measure risk and account for the existence of a population
     :merged_groups {sample_group_size, population_group_size} is a merged dataset with group sizes of both population and sample
     :merged_groups {sample_group_size, population_group_size} is a merged dataset with group sizes of both population and sample
@@ -301,6 +441,7 @@ class Population(Sample):
         """
         """
         This function requires
         This function requires
         """
         """
+        
         r = self.cache['merged_groups']
         r = self.cache['merged_groups']
         sample_row_count = r.sample_group_size.sum() 
         sample_row_count = r.sample_group_size.sum() 
         #
         #

+ 1 - 1
setup.py

@@ -5,7 +5,7 @@ from setuptools import setup, find_packages
  
  
 setup(
 setup(
     name = "privacykit",
     name = "privacykit",
-    version = "0.8.1",
+    version = "0.9.0",
     author = "Healthcare/IO - The Phi Technology LLC & Health Information Privacy Lab",
     author = "Healthcare/IO - The Phi Technology LLC & Health Information Privacy Lab",
     author_email = "info@the-phi.com",
     author_email = "info@the-phi.com",
     license = "MIT",
     license = "MIT",