@@ -0,0 +1,324 @@
+"""
+    Health Information Privacy Lab
+    Brad Malin, Weiyi Xia, Steve L. Nyemba
+
+    This framework computes the re-identification risk of a dataset, assuming the data being shared can be loaded into a (pandas) dataframe.
+    The framework computes the following risk measures:
+        - marketer
+        - prosecutor
+        - pitman
+
+    References :
+        https://www.scb.se/contentassets/ff271eeeca694f47ae99b942de61df83/applying-pitmans-sampling-formula-to-microdata-disclosure-risk-assessment.pdf
+
+    This framework integrates pandas (for now) as an extension and can be used in two modes :
+
+    Experimental mode
+        Here we assume the attributes to be disclosed are not known in advance: the framework explores a variety of random attribute combinations (policies) and associates a risk measure with each one.
+
+    Evaluation mode
+        Here we assume the set of attributes is known: the framework evaluates the risk for that subset of attributes.
+
+    Features :
+        - determine viable fields (quantifiable in terms of uniqueness), i.e. identify fields that can act as identifiers
+        - explore and evaluate the risk of a sample dataset against a known population dataset
+        - explore and evaluate the risk of a sample dataset on its own
+
+    Usage:
+        from pandas_risk import *
+
+        mydataframe = pd.read_csv('/myfile.csv')
+        resp = mydataframe.risk.evaluate(id=<name of patient field>,num_runs=<number of runs>,cols=[])
+        resp = mydataframe.risk.explore(id=<name of patient field>,num_runs=<number of runs>,cols=[])
+
+    @TODO:
+        - Provide a selected number of fields so risk is computed only for those fields.
+        - Include journalist risk.
+"""
+import pandas as pd
+import numpy as np
+import logging
+import json
+from datetime import datetime
+import sys
+
+from itertools import combinations
+
+@pd.api.extensions.register_dataframe_accessor("risk")
+class deid :
+    """
+    This class is a de-identification class that computes risk (marketer, prosecutor) given a pandas dataframe.
+    """
+    def __init__(self,df):
+        self._df = df.fillna(' ')
+        #
+        # Let's get the distribution of the values so we know how unique the fields are
+        #
+        values = df.apply(lambda col: col.unique().size / df.shape[0])
+        self._dinfo = dict(zip(df.columns.tolist(),values))
+
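+    #
+    # A minimal sketch (hypothetical data) of what the distribution above yields;
+    # fields whose ratio is 1.0 are unique per row and behave as identifiers:
+    #
+    #   df = pd.DataFrame({'id':['u1','u2','u3','u4'],'sex':['M','F','M','F']})
+    #   df.risk._dinfo  # -> {'id': 1.0, 'sex': 0.5}
+    #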
+    def explore(self,**args):
+        """
+        This function experiments by generating random policies (combinations of attributes)
+        and evaluating the risk associated with each of them.
+
+        :pop|sample   dataframe with the population or sample reference
+        :field_count  number of fields to randomly select
+        :strict       if set, field_count is exact; otherwise the count ranges from 2 to field_count
+        :num_runs     number of runs (5 by default)
+        """
+        pop = args['pop'] if 'pop' in args else None
+        if 'pop_size' in args :
+            pop_size = np.float64(args['pop_size'])
+        else:
+            pop_size = -1
+
+        #
+        # Policies will be generated with a number of runs
+        #
+        RUNS = args['num_runs'] if 'num_runs' in args else 5
+        sample = args['sample'] if 'sample' in args else pd.DataFrame(self._df)
+        k = sample.columns.size if 'field_count' not in args else int(args['field_count']) + 1
+        #
+        # remove fields that are unique, they function as identifiers
+        #
+        if 'id' in args :
+            id = args['id']
+            columns = list(set(sample.columns.tolist()) - set([id]))
+        else:
+            columns = sample.columns.tolist()
+
+        # Given the uniqueness distribution, only fields with a ratio < 1 are kept
+        # (fields with a ratio of 1 behave as identifiers).
+        o = pd.DataFrame()
+        columns = [key for key in columns if self._dinfo[key] < 1]
+        _policy_count = 2 if 'policy_count' not in args else int(args['policy_count'])
+        _policies = []
+        _index = 0
+        for size in np.arange(2,len(columns)) :
+            p = list(combinations(columns,size))
+            p = np.array(p)[np.random.choice(len(p),_policy_count)].tolist()
+            flag = 'Policy_'+str(_index)
+            _index += 1
+            for cols in p :
+                r = self.evaluate(sample=sample,cols=cols,flag=flag)
+                #
+                # record the policy as a 0/1 mask over the columns
+                #
+                mask = pd.DataFrame(1*sample.columns.isin(cols)).T
+                mask.columns = sample.columns
+                o = pd.concat([o,r.join(mask)])
+
+        # for i in np.arange(RUNS):
+        #     if 'strict' not in args or ('strict' in args and args['strict'] is False):
+        #         n = np.random.randint(2,k)
+        #     else:
+        #         n = args['field_count']
+        #     cols = np.random.choice(columns,n,replace=False).tolist()
+        #     params = {'sample':sample,'cols':cols}
+        #     if pop is not None :
+        #         params['pop'] = pop
+        #     if pop_size > 0 :
+        #         params['pop_size'] = pop_size
+        #     r = self.evaluate(**params)
+        #     #
+        #     # let's put the policy in place
+        #     p = pd.DataFrame(1*sample.columns.isin(cols)).T
+        #     p.columns = sample.columns
+        #     o = pd.concat([o,r.join(p)])
+
+        o.index = np.arange(o.shape[0]).astype(np.int64)
+        return o
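+    #
+    # Hypothetical exploration run: each row of the output is one evaluated policy,
+    # with a 0/1 indicator per column showing policy membership.
+    #
+    #   df = pd.read_csv('sample.csv')                        # assumed sample dataset
+    #   scores = df.risk.explore(id='patient_id',policy_count=3)
+    #   scores[['flag','marketer','prosecutor','group count']]
+    #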
+    def evaluate(self, **args):
+        """
+        This function evaluates the risk associated with either a population or a sample dataset.
+        :sample  sample dataset
+        :pop     population dataset
+        :cols    list of columns of interest (i.e. a policy)
+        :flag    user-provided flag giving the context of the evaluation
+        """
+        if 'sample' in args :
+            sample = pd.DataFrame(args['sample'])
+        else:
+            sample = pd.DataFrame(self._df)
+
+        if 'cols' in args :
+            cols = args['cols']
+        else:
+            cols = [key for key in self._dinfo if self._dinfo[key] < 1]
+
+        flag = 'UNFLAGGED' if 'flag' not in args else args['flag']
+        #
+        # @TODO: auto select the columns i.e. remove the columns that will have the effect of an identifier
+        #
+        r = {"flag":flag}
+        handle_sample = Sample()
+        xi = sample.groupby(cols,as_index=False).count().values
+        handle_sample.set('groups',xi)
+        if 'pop_size' in args :
+            pop_size = np.float64(args['pop_size'])
+        else:
+            pop_size = -1
+        #
+        # -- The following conditional block determines the labels that will be returned.
+        # @TODO: Find a more elegant way of doing this.
+        #
+        if 'pop' in args :
+            label_market = 'sample marketer'
+            label_prosec = 'sample prosecutor'
+            label_groupN = 'sample group count'
+            label_unique = 'sample journalist' #'sample unique ratio'
+        else:
+            label_market = 'marketer'
+            label_prosec = 'prosecutor'
+            label_groupN = 'group count'
+            label_unique = 'journalist' #'unique ratio'
+        if pop_size > 0 :
+            handle_sample.set('pop_size',pop_size)
+            r['pitman risk'] = handle_sample.pitman()
+        r[label_market] = handle_sample.marketer()
+        r[label_unique] = handle_sample.unique_ratio()
+        r[label_prosec] = handle_sample.prosecutor()
+        r[label_groupN] = len(xi)
+
+        if 'pop' in args :
+            xi = pd.DataFrame({"sample_group_size":sample.groupby(cols).size()}).reset_index()
+            yi = pd.DataFrame({"population_group_size":args['pop'].groupby(cols).size()}).reset_index()
+            merged_groups = pd.merge(xi,yi,on=cols,how='inner')
+            handle_population = Population()
+            handle_population.set('merged_groups',merged_groups)
+
+            r['pop. marketer'] = handle_population.marketer()
+            r['pitman risk'] = handle_population.pitman()
+            r['pop. group size'] = np.unique(yi.population_group_size).size
+        #
+        # At this point we have the measures for the sample, the population, or both
+        #
+        r['field count'] = len(cols)
+        return pd.DataFrame([r])
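+# Sketch of an evaluation against a known population (file and column names are
+# assumptions); the groups merged on the policy columns drive the population measures:
+#
+#   sample = pd.read_csv('sample.csv')
+#   population = pd.read_csv('population.csv')
+#   r = sample.risk.evaluate(cols=['age','sex','zip'],pop=population)
+#   r[['pop. marketer','pitman risk','sample marketer']]
+#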
+
+class Risk :
+    """
+    This class is an abstraction of how we chose to structure the risk computation, i.e. in 2 subclasses:
+        - Sample        computes risk associated with a sample dataset only
+        - Population    computes risk associated with a population
+    """
+    def __init__(self):
+        self.cache = {}
+    def set(self,key,value):
+        # store a computed artifact (e.g. group sizes, population size) for later use
+        self.cache[key] = value
+
+class Sample(Risk):
+    """
+    This class computes risk for a sample dataset: the marketer and prosecutor risks are computed by default,
+    and pitman risk can optionally be added when the population size is known.
+    """
+    def __init__(self):
+        Risk.__init__(self)
+
+    def marketer(self):
+        """
+        This function computes the marketer risk for the sample dataset, i.e.
+        the number of equivalence groups over the number of rows.
+        """
+        groups = self.cache['groups']
+        group_count = len(groups)
+        row_count = np.sum([_g[-1] for _g in groups])
+        return group_count / np.float64(row_count)
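+    # Worked example (hypothetical groups): with equivalence-group sizes [3,2,1]
+    # there are 3 groups over 6 rows, so the marketer risk is 3/6 = 0.5, i.e. the
+    # expected share of records re-identified by matching on the policy.
+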
+    def prosecutor(self):
+        """
+        The prosecutor risk is 1 over the smallest group size;
+        it indicates whether there is at least one record that is unique.
+        """
+        groups = self.cache['groups']
+        _min = np.min([_g[-1] for _g in groups])
+        return 1 / np.float64(_min)
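+    # Worked example: with group sizes [3,2,1] the smallest group holds a single
+    # record, so the prosecutor risk is 1/1 = 1.0 (at least one record is unique).
+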
+    def unique_ratio(self):
+        """
+        This function computes the proportion of sample records that are unique,
+        i.e. that fall in an equivalence group of size 1.
+        """
+        groups = self.cache['groups']
+        row_count = np.sum([_g[-1] for _g in groups])
+        values = [_g[-1] for _g in groups if _g[-1] == 1]
+        return np.sum(values) / np.float64(row_count)
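+    # Worked example: with group sizes [3,2,1] one of the 6 records is unique,
+    # so the unique ratio (reported under the 'journalist' label) is 1/6, about 0.167.
+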
+    def pitman(self):
+        """
+        This function approximates the pitman re-identification risk based on pitman sampling,
+        i.e. f**(1-alpha), where f is the sampling fraction and alpha is the share of
+        equivalence groups that are singletons. It requires 'pop_size' to have been set.
+        """
+        groups = self.cache['groups']
+        si = len([_g for _g in groups if _g[-1] == 1])
+        u = len(groups)
+        alpha = np.divide(si,np.float64(u))
+        row_count = np.sum([_g[-1] for _g in groups])
+        f = np.divide(row_count,np.float64(self.cache['pop_size']))
+        return np.power(f,1-alpha)
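+    # Worked example (hypothetical numbers): group sizes [3,2,1] give si=1 singleton
+    # among u=3 groups (alpha=1/3); with pop_size=60 the sampling fraction is
+    # f = 6/60 = 0.1, so the pitman risk is 0.1**(2/3), about 0.215.
+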
+class Population(Sample):
+    """
+    This class computes risk for datasets that have population information (or an associated
+    population dataset). The computation includes pitman risk, which requires minimal
+    information about the population.
+
+    :merged_groups  dataframe {sample_group_size, population_group_size}, i.e. the group
+                    sizes of the sample and the population merged on the policy columns
+    """
+    def __init__(self,**args):
+        Sample.__init__(self)
+
+    def set(self,key,value):
+        Sample.set(self,key,value)
+        if key == 'merged_groups' :
+            Sample.set(self,'pop_size',np.float64(value.population_group_size.sum()))
+            #
+            # hand the sample group sizes to the Sample logic in the row-like shape it
+            # expects (the group size is the last element of each row)
+            #
+            Sample.set(self,'groups',value[['sample_group_size']].values)
+
+    def marketer(self):
+        """
+        This function requires 'merged_groups' to be set; it computes the marketer risk as
+        the sum over groups of (sample group size / population group size), divided by the sample size.
+        """
+        r = self.cache['merged_groups']
+        sample_row_count = r.sample_group_size.sum()
+        #
+        # @TODO : make sure the above line is size (not sum)
+        # sample_row_count = r.sample_group_size.size
+        return r.apply(lambda row: (row.sample_group_size / np.float64(row.population_group_size)) / np.float64(sample_row_count),axis=1).sum()
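+# Worked example (hypothetical merged groups): sample group sizes [2,1] matched to
+# population group sizes [10,5], with 3 sample rows in total, give a population
+# marketer risk of ((2/10) + (1/5)) / 3, about 0.133.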