@@ -146,7 +146,7 @@ class utils :
        return " ".join(SQL).replace(":fields"," , ".join(fields))

-class risk :
+class SQLRisk :
    """
        This class will handle the creation of an SQL query that computes marketer and prosecutor risk (for now)
    """
@@ -186,102 +186,163 @@ class risk :
+class UtilHandler :
+    def __init__(self,**args) :
+        """
+            @param path         path to the service account file
+            @param dataset      input dataset name
+            @param key_field    key field (e.g. person_id)
+            @param key_table    name of the table holding the key field; it is moved to the front of the table list
+        """
+        self.path = args['path']
+        self.client = bq.Client.from_service_account_json(self.path)
+        dataset = args['dataset']
+        self.key = args['key_field']
-if 'action' in SYS_ARGS and SYS_ARGS['action'] in ['create','compute','migrate'] :
+        self.mytools = utils(client = self.client)
+        self.tables = self.mytools.get_tables(dataset=dataset,client=self.client,key=self.key)
+        index = [self.tables.index(item) for item in self.tables if item['name'] == args['key_table']][0]
+        if index != 0 :
+            first = self.tables[0]
+            aux = self.tables[index]
+            self.tables[0] = aux
+            self.tables[index] = first
+        if 'filter' in args :
+            self.tables = [item for item in self.tables if item['name'] in args['filter']]
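+
+    # NOTE (editor's sketch, not part of the original change): hypothetical usage of the
+    # handler; the keyword names come from __init__ above, while the values ('raw',
+    # 'person', 'deid', the .json path, the .sql path) are placeholders only:
+    #
+    #   handler    = UtilHandler(path='service-account.json', dataset='raw',
+    #                            key_field='person_id', key_table='person')
+    #   create_sql = handler.create_table(path='create_person.sql')
+    #   queries    = handler.migrate_tables(dataset='deid')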
-    path = SYS_ARGS['path']
-    client = bq.Client.from_service_account_json(path)
-    i_dataset = SYS_ARGS['i_dataset']
-    key = SYS_ARGS['key']
-    mytools = utils(client = client)
-    tables = mytools.get_tables(dataset=i_dataset,client=client,key=key)
-    # print len(tables)
-    # tables = tables[:6]
+    def create_table(self,**args):
+        """
+            @param path     absolute path of the file to which the create statement is written
-    if SYS_ARGS['action'] == 'create' :
-        #usage:
-        # create --i_dataset <in dataset> --key <patient id> --o_dataset <out dataset> --table <table|file> [--file] --path <bq JSON account file>
-        #
-        create_sql = mytools.get_sql(tables=tables,key=key) #-- The create statement
-        o_dataset = SYS_ARGS['o_dataset']
-        table = SYS_ARGS['table']
-        if 'file' in SYS_ARGS :
-            f = open(table+'.sql','w')
+        """
+        create_sql = self.mytools.get_sql(tables=self.tables,key=self.key) #-- The create statement
+        # o_dataset = SYS_ARGS['o_dataset']
+        # table = SYS_ARGS['table']
+        if 'path' in args:
+            f = open(args['path'],'w')
            f.write(create_sql)
            f.close()
-        else:
-            job = bq.QueryJobConfig()
-            job.destination = client.dataset(o_dataset).table(table)
-            job.use_query_cache = True
-            job.allow_large_results = True
-            job.priority = 'BATCH'
-            job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
+        return create_sql
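+
+    # NOTE (editor's sketch): when no 'path' argument is given, the statement is only
+    # returned; it could still be submitted through the client held by the handler, e.g.
+    #
+    #   create_sql = handler.create_table()
+    #   handler.client.query(create_sql, location='US').result()   # blocks until the job finishes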
+    def migrate_tables(self,**args):
+        """
+            This function migrates tables from one dataset to another.
+            Migration reduces each candidate table to the quasi-identifiers that represent a patient.
+            @param dataset      target dataset
+        """
+        o_dataset = args['dataset'] if 'dataset' in args else None
+        p = []
+        for table in self.tables:
+            sql = " ".join(["SELECT ",",".join(table['fields'])," FROM (",self.mytools.get_filtered_table(table,self.key),") as ",table['name']])
+            p.append(sql)
+            if o_dataset :
+                job = bq.QueryJobConfig()
+                job.destination = self.client.dataset(o_dataset).table(table['name'])
+                job.use_query_cache = True
+                job.allow_large_results = True
+                job.priority = 'INTERACTIVE'
+                job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
-            r = client.query(create_sql,location='US',job_config=job)
+                r = self.client.query(sql,location='US',job_config=job)
+
+                print [table['full_name'],' ** ',r.job_id,' ** ',r.state]
+        return p
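+
+    # NOTE (editor's sketch): each entry of the returned list has the form
+    #   SELECT <fields> FROM ( <filtered source table> ) as <table name>
+    # and, when a target dataset is given, the same statement is also submitted as a
+    # day-partitioned query job writing to <dataset>.<table name>; r.state is read right
+    # after submission, so it will typically still be 'PENDING' or 'RUNNING'.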
+
+# if 'action' in SYS_ARGS and SYS_ARGS['action'] in ['create','compute','migrate'] :
+
+#     path = SYS_ARGS['path']
+#     client = bq.Client.from_service_account_json(path)
+#     i_dataset = SYS_ARGS['i_dataset']
+#     key = SYS_ARGS['key']
+
+#     mytools = utils(client = client)
+#     tables = mytools.get_tables(dataset=i_dataset,client=client,key=key)
+#     # print len(tables)
+#     # tables = tables[:6]
+
+#     if SYS_ARGS['action'] == 'create' :
+#         #usage:
+#         # create --i_dataset <in dataset> --key <patient id> --o_dataset <out dataset> --table <table|file> [--file] --path <bq JSON account file>
+#         #
+#         create_sql = mytools.get_sql(tables=tables,key=key) #-- The create statement
+#         o_dataset = SYS_ARGS['o_dataset']
+#         table = SYS_ARGS['table']
+#         if 'file' in SYS_ARGS :
+#             f = open(table+'.sql','w')
+#             f.write(create_sql)
+#             f.close()
+#         else:
+#             job = bq.QueryJobConfig()
+#             job.destination = client.dataset(o_dataset).table(table)
+#             job.use_query_cache = True
+#             job.allow_large_results = True
+#             job.priority = 'BATCH'
+#             job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
+
+#             r = client.query(create_sql,location='US',job_config=job)

-            print [r.job_id,' ** ',r.state]
-    elif SYS_ARGS['action'] == 'migrate' :
-        #
-        #
+#             print [r.job_id,' ** ',r.state]
+#     elif SYS_ARGS['action'] == 'migrate' :
+#         #
+#         #
-        o_dataset = SYS_ARGS['o_dataset']
-        for table in tables:
-            sql = " ".join(["SELECT ",",".join(table['fields'])," FROM (",mytools.get_filtered_table(table,key),") as ",table['name']])
-            print ""
-            print sql
-            print ""
-            # job = bq.QueryJobConfig()
-            # job.destination = client.dataset(o_dataset).table(table['name'])
-            # job.use_query_cache = True
-            # job.allow_large_results = True
-            # job.priority = 'INTERACTIVE'
-            # job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
+#         o_dataset = SYS_ARGS['o_dataset']
+#         for table in tables:
+#             sql = " ".join(["SELECT ",",".join(table['fields'])," FROM (",mytools.get_filtered_table(table,key),") as ",table['name']])
+#             print ""
+#             print sql
+#             print ""
+#             # job = bq.QueryJobConfig()
+#             # job.destination = client.dataset(o_dataset).table(table['name'])
+#             # job.use_query_cache = True
+#             # job.allow_large_results = True
+#             # job.priority = 'INTERACTIVE'
+#             # job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)

-            # r = client.query(sql,location='US',job_config=job)
+#             # r = client.query(sql,location='US',job_config=job)

-            # print [table['full_name'],' ** ',r.job_id,' ** ',r.state]
+#             # print [table['full_name'],' ** ',r.job_id,' ** ',r.state]

-        pass
-    else:
-        #
-        #
-        tables = [tab for tab in tables if tab['name'] == SYS_ARGS['table'] ]
-        limit = int(SYS_ARGS['limit']) if 'limit' in SYS_ARGS else 1
-        if tables :
-            risk= risk()
-            df = pd.DataFrame()
-            dfs = pd.DataFrame()
-            np.random.seed(1)
-            for i in range(0,limit) :
-                r = risk.get_sql(key=SYS_ARGS['key'],table=tables[0])
-                sql = r['sql']
-                dfs = dfs.append(r['stream'],sort=True)
-                df = df.append(pd.read_gbq(query=sql,private_key=path,dialect='standard').join(dfs))
-                # df = df.join(dfs,sort=True)
-                df.to_csv(SYS_ARGS['table']+'.csv')
-                # dfs.to_csv(SYS_ARGS['table']+'_stream.csv')
-                print [i,' ** ',df.shape[0],pd.DataFrame(r['stream']).shape]
-                time.sleep(2)
+#         pass
+#     else:
+#         #
+#         #
+#         tables = [tab for tab in tables if tab['name'] == SYS_ARGS['table'] ]
+#         limit = int(SYS_ARGS['limit']) if 'limit' in SYS_ARGS else 1
+#         if tables :
+#             risk= risk()
+#             df = pd.DataFrame()
+#             dfs = pd.DataFrame()
+#             np.random.seed(1)
+#             for i in range(0,limit) :
+#                 r = risk.get_sql(key=SYS_ARGS['key'],table=tables[0])
+#                 sql = r['sql']
+#                 dfs = dfs.append(r['stream'],sort=True)
+#                 df = df.append(pd.read_gbq(query=sql,private_key=path,dialect='standard').join(dfs))
+#                 # df = df.join(dfs,sort=True)
+#                 df.to_csv(SYS_ARGS['table']+'.csv')
+#                 # dfs.to_csv(SYS_ARGS['table']+'_stream.csv')
+#                 print [i,' ** ',df.shape[0],pd.DataFrame(r['stream']).shape]
+#                 time.sleep(2)

-else:
-    print 'ERROR'
-    pass
+# else:
+#     print 'ERROR'
+#     pass

-# r = risk(path='/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json', i_dataset='raw',o_dataset='risk_o',o_table='mo')
-# tables = r.get_tables('raw','person_id')
-# sql = r.get_sql(tables=tables[:3],key='person_id')
-# #
-# # let's post this to a designated location
-# #
-# f = open('foo.sql','w')
-# f.write(sql)
-# f.close()
-# r.get_sql(tables=tables,key='person_id')
-# p = r.compute()
-# print p
-# p.to_csv("risk.csv")
-# r.write('foo.sql')
+# # r = risk(path='/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json', i_dataset='raw',o_dataset='risk_o',o_table='mo')
+# # tables = r.get_tables('raw','person_id')
+# # sql = r.get_sql(tables=tables[:3],key='person_id')
+# # #
+# # # let's post this to a designated location
+# # #
+# # f = open('foo.sql','w')
+# # f.write(sql)
+# # f.close()
+# # r.get_sql(tables=tables,key='person_id')
+# # p = r.compute()
+# # print p
+# # p.to_csv("risk.csv")
+# # r.write('foo.sql')