Refactored, including population risk assessment

Steve L. Nyemba -- The Architect committed 6 years ago
commit c3066408c9
2 changed files with 241 additions and 87 deletions:
  1. src/pandas_risk.py   +97  -4
  2. src/risk.py          +144 -83

+ 97 - 4
src/pandas_risk.py

@@ -22,16 +22,108 @@
 """
 """
 import pandas as pd
 import pandas as pd
 import numpy as np
 import numpy as np
-
+import time
 @pd.api.extensions.register_dataframe_accessor("deid")
 class deid :
     """
         This class is a de-identification accessor that computes risk (marketer, prosecutor) for a given pandas dataframe
     """
     def __init__(self,df):
-        self._df = df
+        self._df = df.fillna(' ')
+    def explore(self,**args):
+        """
+        This function experiments by generating random policies (combinations of attributes)
+        and is intended to explore a variety of policies and evaluate their associated risk.
+
+        @param pop      data-frame with the population reference (optional)
+        @param sample   data-frame with the sample data (defaults to the accessor's dataframe)
+        @param id       key field that uniquely identifies a patient/customer ...
+        """
+        # id = args['id']
+        pop= args['pop'] if 'pop' in args else None
+        # if 'columns' in args :
+        #     cols = args['columns']
+        #     params = {"sample":args['data'],"cols":cols}
+        #     if pop is not None :
+        #         params['pop'] = pop
+        #     return self.evaluate(**params)
+        # else :
+        #
+        # Policies will be generated with a number of runs
+        #
+        RUNS = args['num_runs'] if 'num_runs' in args else 5
+        
+        sample = args['sample'] if 'sample' in args else pd.DataFrame(self._df)
+        
+        k = sample.columns.size -1 if 'field_count' not in args else int(args['field_count'])
+        columns = list(set(sample.columns.tolist()) - set([args['id']] if 'id' in args else []))
+        o = pd.DataFrame()
+        # pop = args['pop'] if 'pop' in args else None
+        for i in np.arange(RUNS):
+            n = np.random.randint(2,k)
+            
+            cols = np.random.choice(columns,n,replace=False).tolist()            
+            params = {'sample':sample,'cols':cols}
+            if pop is not None :
+                params['pop'] = pop
+            r = self.evaluate(**params)
+            #
+            # let's put the policy in place
+            p =  pd.DataFrame(1*sample.columns.isin(cols)).T
+            p.columns = sample.columns
+            o = o.append(r.join(p))
+            
+        o.index = np.arange(o.shape[0]).astype(np.int64)
+
+        return o
+    def evaluate(self,**args) :
+        """
+        This function computes the marketer risk; if a population is provided it evaluates the marketer risk relative to both the population and the sample
+        @param sample   data-frame with the data to be processed (defaults to the accessor's dataframe)
+        @param cols     columns (policy) to be considered
+        @param pop      population dataset
+        @param flag     user defined flag (no computation use)
+        """
+        if (args and 'sample' not in args) or not args :
+            x_i = pd.DataFrame(self._df)
+        elif args and 'sample' in args :
+            x_i = args['sample']
+        if (args and 'cols' not in args) or not args :
+            cols = x_i.columns.tolist()
+            # cols = self._df.columns.tolist()
+        elif args and 'cols' in args :
+            cols = args['cols']
+        flag = args['flag'] if 'flag' in args else 'UNFLAGGED'
+        # if args and 'sample' in args :
+            
+        #     x_i     = pd.DataFrame(self._df)
+        # else :
+        #     cols    = args['cols'] if 'cols' in args else self._df.columns.tolist()
+        # x_i     = x_i.groupby(cols,as_index=False).size().values 
+        x_i_values = x_i.groupby(cols,as_index=False).size().values
+        SAMPLE_GROUP_COUNT = x_i_values.size
+        SAMPLE_FIELD_COUNT = len(cols)
+        SAMPLE_POPULATION  = x_i_values.sum()
+        
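+        # marketer risk: number of equivalence classes / number of records ; prosecutor risk: 1 / size of the smallest class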
+        SAMPLE_MARKETER    = SAMPLE_GROUP_COUNT / np.float64(SAMPLE_POPULATION)
+        SAMPLE_PROSECUTOR  = 1/ np.min(x_i_values).astype(np.float64)
+        if 'pop' in args :
+            Yi = args['pop']            
+            y_i= pd.DataFrame({"group_size":Yi.groupby(cols,as_index=False).size()}).reset_index()
+            # y_i['group'] = pd.DataFrame({"group_size":args['pop'].groupby(cols,as_index=False).size().values}).reset_index()
+            # x_i = pd.DataFrame({"group_size":x_i.groupby(cols,as_index=False).size().values}).reset_index()
+            x_i = pd.DataFrame({"group_size":x_i.groupby(cols,as_index=False).size()}).reset_index()
+            SAMPLE_RATIO = int(100 * x_i.size/args['pop'].shape[0])
+            r = pd.merge(x_i,y_i,on=cols,how='inner')
+            r['marketer'] = r.apply(lambda row: (row.group_size_x / np.float64(row.group_size_y)) /np.sum(x_i.group_size) ,axis=1)
+            r['sample %'] = np.repeat(SAMPLE_RATIO,r.shape[0])
+            r['tier'] = np.repeat(flag,r.shape[0])
+            r['sample marketer'] =  np.repeat(SAMPLE_MARKETER,r.shape[0])
+            r = r.groupby(['sample %','tier','sample marketer'],as_index=False).sum()[['sample %','marketer','sample marketer','tier']]
+        else:
+            r = pd.DataFrame({"marketer":[SAMPLE_MARKETER],"prosecutor":[SAMPLE_PROSECUTOR],"field_count":[SAMPLE_FIELD_COUNT],"group_count":[SAMPLE_GROUP_COUNT]})
+        return r
     
-    def risk(self,**args):
+    def _risk(self,**args):
         """
         """
             @param  id          name of patient field            
             @param  id          name of patient field            
             @params num_runs    number of runs (default will be 100)
             @params num_runs    number of runs (default will be 100)
@@ -50,7 +142,7 @@ class deid :
         k = len(columns)
         N = self._df.shape[0]
         tmp = self._df.fillna(' ')
-        np.random.seed(1)
+        np.random.seed(int(time.time()) )
         for i in range(0,num_runs) :
             
             #
@@ -85,6 +177,7 @@ class deid :
                     [
                         {
                             "group_count":x_.size,
+                            
                             "patient_count":N,
                             "field_count":n,
                             "marketer": x_.size / np.float64(np.sum(x_)),

+ 144 - 83
src/risk.py

@@ -146,7 +146,7 @@ class utils :
         
         return " ".join(SQL).replace(":fields"," , ".join(fields))
 
-class risk :
+class SQLRisk :
     """
     """
         This class will handle the creation of an SQL query that computes marketer and prosecutor risk (for now)
         This class will handle the creation of an SQL query that computes marketer and prosecutor risk (for now)
     """
     """
@@ -186,102 +186,163 @@ class risk :
     

         
+class UtilHandler :
+    def __init__(self,**args) :
+        """
+            @param path         path to the service account file
+            @param dataset      input dataset name
+            @param key_field    key field that uniquely identifies a patient (e.g. person_id)
+            @param key_table    name of the table that anchors the processing; it is moved to the front of the table list
+
+        """
+        self.path        = args['path']
+        self.client      = bq.Client.from_service_account_json(self.path)
+        dataset   = args['dataset']
+        self.key         = args['key_field']

-if 'action' in SYS_ARGS and  SYS_ARGS['action'] in ['create','compute','migrate'] :
+        self.mytools = utils(client = self.client)
+        self.tables  = self.mytools.get_tables(dataset=dataset,client=self.client,key=self.key)
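+        # make sure the table that contains the key field (key_table) is processed first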
+        index = [ self.tables.index(item) for item in self.tables if item['name'] == args['key_table']] [0]
+        if index != 0 :
+            first = self.tables[0]
+            aux = self.tables[index]
+            self.tables[0] = aux
+            self.tables[index] = first
+        if 'filter' in args :
+            self.tables = [item for item in self.tables if item['name'] in args['filter']]

-    path = SYS_ARGS['path']
-    client = bq.Client.from_service_account_json(path)
-    i_dataset = SYS_ARGS['i_dataset']
-    key = SYS_ARGS['key'] 

-    mytools = utils(client = client)
-    tables = mytools.get_tables(dataset=i_dataset,client=client,key=key)
-    # print len(tables)
-    # tables = tables[:6]
+    def create_table(self,**args):
+        """
+            @param path     absolute path of the file in which to save the CREATE statement

-    if SYS_ARGS['action'] == 'create' :
-        #usage:
-        #   create --i_dataset <in dataset> --key <patient id> --o_dataset <out dataset> --table <table|file> [--file] --path <bq JSON account file>
-        #
-        create_sql = mytools.get_sql(tables=tables,key=key) #-- The create statement
-        o_dataset = SYS_ARGS['o_dataset']
-        table = SYS_ARGS['table']
-        if 'file' in SYS_ARGS :
-            f = open(table+'.sql','w')
+        """
+        create_sql = self.mytools.get_sql(tables=self.tables,key=self.key) #-- The create statement
+        # o_dataset = SYS_ARGS['o_dataset']
+        # table = SYS_ARGS['table']
+        if 'path' in args:
+            f = open(args['path'],'w')
             f.write(create_sql)
             f.close()
-        else:
-            job = bq.QueryJobConfig()
-            job.destination = client.dataset(o_dataset).table(table)
-            job.use_query_cache = True
-            job.allow_large_results = True 
-            job.priority = 'BATCH'
-            job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
+        return create_sql
+    def migrate_tables(self,**args):
+        """
+            This function migrates the candidate tables to another dataset.
+            The goal of the migration is to reduce each candidate table so that it represents a patient only by her quasi-identifiers.
+            @param dataset  target dataset
+        """
+        o_dataset = args['dataset'] if 'dataset' in args else None
+        p = []
+        for table in self.tables:
+            sql = " ".join(["SELECT ",",".join(table['fields']) ," FROM (",self.mytools.get_filtered_table(table,self.key),") as ",table['name']])        
+            p.append(sql)
+            if o_dataset :
+                job = bq.QueryJobConfig()
+                job.destination = self.client.dataset(o_dataset).table(table['name'])
+                job.use_query_cache = True
+                job.allow_large_results = True 
+                job.priority = 'INTERACTIVE'
+                job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)

-            r = client.query(create_sql,location='US',job_config=job) 
+                r = self.client.query(sql,location='US',job_config=job) 
+
+                print [table['full_name'],' ** ',r.job_id,' ** ',r.state]
+        return p
+
+# if 'action' in SYS_ARGS and  SYS_ARGS['action'] in ['create','compute','migrate'] :
+
+#     path = SYS_ARGS['path']
+#     client = bq.Client.from_service_account_json(path)
+#     i_dataset = SYS_ARGS['i_dataset']
+#     key = SYS_ARGS['key'] 
+
+#     mytools = utils(client = client)
+#     tables = mytools.get_tables(dataset=i_dataset,client=client,key=key)
+#     # print len(tables)
+#     # tables = tables[:6]
+
+#     if SYS_ARGS['action'] == 'create' :
+#         #usage:
+#         #   create --i_dataset <in dataset> --key <patient id> --o_dataset <out dataset> --table <table|file> [--file] --path <bq JSON account file>
+#         #
+#         create_sql = mytools.get_sql(tables=tables,key=key) #-- The create statement
+#         o_dataset = SYS_ARGS['o_dataset']
+#         table = SYS_ARGS['table']
+#         if 'file' in SYS_ARGS :
+#             f = open(table+'.sql','w')
+#             f.write(create_sql)
+#             f.close()
+#         else:
+#             job = bq.QueryJobConfig()
+#             job.destination = client.dataset(o_dataset).table(table)
+#             job.use_query_cache = True
+#             job.allow_large_results = True 
+#             job.priority = 'BATCH'
+#             job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
+
+#             r = client.query(create_sql,location='US',job_config=job) 

-            print [r.job_id,' ** ',r.state]
-    elif SYS_ARGS['action'] == 'migrate' :
-        #
-        #
+#             print [r.job_id,' ** ',r.state]
+#     elif SYS_ARGS['action'] == 'migrate' :
+#         #
+#         #

-        o_dataset = SYS_ARGS['o_dataset']
-        for table in tables:
-            sql = " ".join(["SELECT ",",".join(table['fields']) ," FROM (",mytools.get_filtered_table(table,key),") as ",table['name']])
-            print ""
-            print sql
-            print ""
-            # job = bq.QueryJobConfig()
-            # job.destination = client.dataset(o_dataset).table(table['name'])
-            # job.use_query_cache = True
-            # job.allow_large_results = True 
-            # job.priority = 'INTERACTIVE'
-            # job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
+#         o_dataset = SYS_ARGS['o_dataset']
+#         for table in tables:
+#             sql = " ".join(["SELECT ",",".join(table['fields']) ," FROM (",mytools.get_filtered_table(table,key),") as ",table['name']])
+#             print ""
+#             print sql
+#             print ""
+#             # job = bq.QueryJobConfig()
+#             # job.destination = client.dataset(o_dataset).table(table['name'])
+#             # job.use_query_cache = True
+#             # job.allow_large_results = True 
+#             # job.priority = 'INTERACTIVE'
+#             # job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)

-            # r = client.query(sql,location='US',job_config=job) 
+#             # r = client.query(sql,location='US',job_config=job) 

-            # print [table['full_name'],' ** ',r.job_id,' ** ',r.state]
+#             # print [table['full_name'],' ** ',r.job_id,' ** ',r.state]


-        pass
-    else:
-        #
-        #
-        tables  = [tab for tab in tables if tab['name'] == SYS_ARGS['table'] ]  
-        limit   = int(SYS_ARGS['limit']) if 'limit' in SYS_ARGS else 1
-        if tables :            
-            risk= risk()
-            df  = pd.DataFrame()
-            dfs = pd.DataFrame()
-            np.random.seed(1)
-            for i in range(0,limit) :
-                r = risk.get_sql(key=SYS_ARGS['key'],table=tables[0])
-                sql = r['sql']
-                dfs = dfs.append(r['stream'],sort=True)
-                df = df.append(pd.read_gbq(query=sql,private_key=path,dialect='standard').join(dfs))
-                # df = df.join(dfs,sort=True)
-                df.to_csv(SYS_ARGS['table']+'.csv')
-                # dfs.to_csv(SYS_ARGS['table']+'_stream.csv') 
-                print [i,' ** ',df.shape[0],pd.DataFrame(r['stream']).shape]
-                time.sleep(2)
+#         pass
+#     else:
+#         #
+#         #
+#         tables  = [tab for tab in tables if tab['name'] == SYS_ARGS['table'] ]  
+#         limit   = int(SYS_ARGS['limit']) if 'limit' in SYS_ARGS else 1
+#         if tables :            
+#             risk= risk()
+#             df  = pd.DataFrame()
+#             dfs = pd.DataFrame()
+#             np.random.seed(1)
+#             for i in range(0,limit) :
+#                 r = risk.get_sql(key=SYS_ARGS['key'],table=tables[0])
+#                 sql = r['sql']
+#                 dfs = dfs.append(r['stream'],sort=True)
+#                 df = df.append(pd.read_gbq(query=sql,private_key=path,dialect='standard').join(dfs))
+#                 # df = df.join(dfs,sort=True)
+#                 df.to_csv(SYS_ARGS['table']+'.csv')
+#                 # dfs.to_csv(SYS_ARGS['table']+'_stream.csv') 
+#                 print [i,' ** ',df.shape[0],pd.DataFrame(r['stream']).shape]
+#                 time.sleep(2)


-else:
-    print 'ERROR'
-    pass
+# else:
+#     print 'ERROR'
+#     pass

-# r = risk(path='/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json', i_dataset='raw',o_dataset='risk_o',o_table='mo')
-# tables = r.get_tables('raw','person_id')
-# sql = r.get_sql(tables=tables[:3],key='person_id')
-# #
-# # let's post this to a designated location
-# #
-# f = open('foo.sql','w')
-# f.write(sql)
-# f.close()
-# r.get_sql(tables=tables,key='person_id')
-# p = r.compute()
-# print p
-# p.to_csv("risk.csv")
-# r.write('foo.sql')
+# # r = risk(path='/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json', i_dataset='raw',o_dataset='risk_o',o_table='mo')
+# # tables = r.get_tables('raw','person_id')
+# # sql = r.get_sql(tables=tables[:3],key='person_id')
+# # #
+# # # let's post this to a designated location
+# # #
+# # f = open('foo.sql','w')
+# # f.write(sql)
+# # f.close()
+# # r.get_sql(tables=tables,key='person_id')
+# # p = r.compute()
+# # print p
+# # p.to_csv("risk.csv")
+# # r.write('foo.sql')
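
Since the command-line block above is now commented out, here is a hedged sketch of how the new UtilHandler class might drive the same create/migrate flow (the service-account path and the key table name are placeholders; the dataset names follow the commented examples):

    from risk import UtilHandler   # assumes src/risk.py is importable as a module

    handler = UtilHandler(path='/path/to/service-account.json',   # placeholder credentials file
                          dataset='raw',
                          key_field='person_id',
                          key_table='person')

    # write the CREATE statement to disk and keep a copy of the SQL
    create_sql = handler.create_table(path='person.sql')

    # reduce each candidate table to its quasi-identifiers and copy it into the
    # target dataset (one BigQuery job per table); the SELECT statements are returned
    statements = handler.migrate_tables(dataset='risk_o')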