|
@@ -65,9 +65,36 @@ else:
|
|
|
import mongo
|
|
|
import s3
|
|
|
import sql
|
|
|
+import psycopg2 as pg
|
|
|
+import mysql.connector as my
|
|
|
+from google.cloud import bigquery as bq
|
|
|
+import nzpy as nz #--- netezza drivers
|
|
|
+import os
|
|
|
|
|
|
-
|
|
|
+RDBMS = {
|
|
|
+
|
|
|
+ "postgresql":{"port":"5432","driver":pg},
|
|
|
+ "redshift":{"port":"5432","driver":pg},
|
|
|
+ "netezza":{"port":"5480","driver":nz},
|
|
|
+ "mysql":{"port":"3306","driver":my},
|
|
|
+ "mariadb":{"port":"3306","driver":my},
|
|
|
+ "mongodb":{"port":"27017","class":{"read"}},
|
|
|
+ "couchdb":{"port":"5984"}
|
|
|
+}
|
|
|
class factory :
|
|
|
+	TYPE = {"sql":{"providers":["postgresql","mysql","netezza","bigquery","mariadb","redshift"]}}
|
|
|
+ PROVIDERS = {
|
|
|
+ "file":{"class":{"read":disk.DiskReader,"write":disk.DiskWriter}},
|
|
|
+ "sqlite":{"class":{"read":disk.SQLiteReader,"write":disk.SQLiteWriter}},
|
|
|
+ "postgresql":{"port":5432,"host":"localhost","database":os.environ['USER'],"driver":pg,"default":{"type":"VARCHAR"}},
|
|
|
+ "redshift":{"port":5432,"host":"localhost","database":os.environ['USER'],"driver":pg,"default":{"type":"VARCHAR"}},
|
|
|
+ "bigquery":{"class":{"read":sql.BQReader,"write":sql.BQWriter}},
|
|
|
+ "mysql":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"}},
|
|
|
+ "mariadb":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"}},
|
|
|
+ "mongo":{"port":27017,"host":"localhost","class":{"read":mongo.MongoReader,"write":mongo.MongoWriter}},
|
|
|
+ "couch":{"port":5984,"host":"localhost","class":{"read":couch.CouchReader,"write":couch.CouchWriter}},
|
|
|
+ "netezza":{"port":5480,"driver":nz,"default":{"type":"VARCHAR(256)"}}}
|
|
|
+
|
|
|
@staticmethod
|
|
|
def instance(**args):
|
|
|
"""
|
|
@@ -99,131 +126,30 @@ class factory :
|
|
|
return anObject
|
|
|
|
|
|
import time
|
|
|
-
|
|
|
-
|
|
|
-# class Reader:
|
|
|
-# def __init__(self):
|
|
|
-# self.nrows = 0
|
|
|
-# self.xchar = None
|
|
|
-
|
|
|
-# def row_count(self):
|
|
|
-# content = self.read()
|
|
|
-# return np.sum([1 for row in content])
|
|
|
-# def delimiter(self,sample):
|
|
|
-# """
|
|
|
-# This function determines the most common delimiter from a subset of possible delimiters.
|
|
|
-# It uses a statistical approach (distribution) to guage the distribution of columns for a given delimiter
|
|
|
-
|
|
|
-# :sample sample string/content expecting matrix i.e list of rows
|
|
|
-# """
|
|
|
-
|
|
|
-# m = {',':[],'\t':[],'|':[],'\x3A':[]}
|
|
|
-# delim = m.keys()
|
|
|
-# for row in sample:
|
|
|
-# for xchar in delim:
|
|
|
-# if row.split(xchar) > 1:
|
|
|
-# m[xchar].append(len(row.split(xchar)))
|
|
|
-# else:
|
|
|
-# m[xchar].append(0)
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-# #
|
|
|
-# # The delimiter with the smallest variance, provided the mean is greater than 1
|
|
|
-# # This would be troublesome if there many broken records sampled
|
|
|
-# #
|
|
|
-# m = {id: np.var(m[id]) for id in m.keys() if m[id] != [] and int(np.mean(m[id]))>1}
|
|
|
-# index = m.values().index( min(m.values()))
|
|
|
-# xchar = m.keys()[index]
|
|
|
-
|
|
|
-# return xchar
|
|
|
-# def col_count(self,sample):
|
|
|
-# """
|
|
|
-# This function retirms the number of columns of a given sample
|
|
|
-# @pre self.xchar is not None
|
|
|
-# """
|
|
|
-
|
|
|
-# m = {}
|
|
|
-# i = 0
|
|
|
-
|
|
|
-# for row in sample:
|
|
|
-# row = self.format(row)
|
|
|
-# id = str(len(row))
|
|
|
-# #id = str(len(row.split(self.xchar)))
|
|
|
-
|
|
|
-# if id not in m:
|
|
|
-# m[id] = 0
|
|
|
-# m[id] = m[id] + 1
|
|
|
-
|
|
|
-# index = m.values().index( max(m.values()) )
|
|
|
-# ncols = int(m.keys()[index])
|
|
|
+def instance(provider,context,**_args):
|
|
|
+ """
|
|
|
+
|
|
|
+ @param provider {file,sqlite,postgresql,redshift,bigquery,netezza,mongo,couch ...}
|
|
|
+ @param context read|write|rw
|
|
|
+	@param _args	argument to go with the datastore (username,password,host,port ...)
|
|
|
+ """
|
|
|
+ _id = context if context in ['read','write'] else None
|
|
|
+ if _id :
|
|
|
+ args = {'provider':_id}
|
|
|
+ for key in factory.PROVIDERS[provider] :
|
|
|
+ if key == 'class' :
|
|
|
+ continue
|
|
|
+ value = factory.PROVIDERS[provider][key]
|
|
|
+ args[key] = value
|
|
|
+ #
|
|
|
+ #
|
|
|
+ args = dict(args,**_args)
|
|
|
|
|
|
-
|
|
|
-# return ncols;
|
|
|
-# def format (self,row):
|
|
|
-# """
|
|
|
-# This function will clean records of a given row by removing non-ascii characters
|
|
|
-# @pre self.xchar is not None
|
|
|
-# """
|
|
|
-
|
|
|
-# if isinstance(row,list) == False:
|
|
|
-# #
|
|
|
-# # We've observed sometimes fields contain delimiter as a legitimate character, we need to be able to account for this and not tamper with the field values (unless necessary)
|
|
|
-# cols = self.split(row)
|
|
|
-# #cols = row.split(self.xchar)
|
|
|
-# else:
|
|
|
-# cols = row ;
|
|
|
-# return [ re.sub('[^\x00-\x7F,\n,\r,\v,\b,]',' ',col.strip()).strip().replace('"','') for col in cols]
|
|
|
-
|
|
|
-# def split (self,row):
|
|
|
-# """
|
|
|
-# This function performs a split of a record and tries to attempt to preserve the integrity of the data within i.e accounting for the double quotes.
|
|
|
-# @pre : self.xchar is not None
|
|
|
-# """
|
|
|
-
|
|
|
-# pattern = "".join(["(?:^|",self.xchar,")(\"(?:[^\"]+|\"\")*\"|[^",self.xchar,"]*)"])
|
|
|
-# return re.findall(pattern,row.replace('\n',''))
|
|
|
-
|
|
|
-
|
|
|
-# class Writer:
|
|
|
-
|
|
|
-# def format(self,row,xchar):
|
|
|
-# if xchar is not None and isinstance(row,list):
|
|
|
-# return xchar.join(row)+'\n'
|
|
|
-# elif xchar is None and isinstance(row,dict):
|
|
|
-# row = json.dumps(row)
|
|
|
-# return row
|
|
|
-# """
|
|
|
-# It is important to be able to archive data so as to insure that growth is controlled
|
|
|
-# Nothing in nature grows indefinitely neither should data being handled.
|
|
|
-# """
|
|
|
-# def archive(self):
|
|
|
-# pass
|
|
|
-# def flush(self):
|
|
|
-# pass
|
|
|
-
|
|
|
-# class factory :
|
|
|
-# @staticmethod
|
|
|
-# def instance(**args):
|
|
|
-
|
|
|
-# source = args['type']
|
|
|
-# params = args['args']
|
|
|
-# anObject = None
|
|
|
-
|
|
|
-# if source in ['HttpRequestReader','HttpSessionWriter']:
|
|
|
-# #
|
|
|
-# # @TODO: Make sure objects are serializable, be smart about them !!
|
|
|
-# #
|
|
|
-# aClassName = ''.join([source,'(**params)'])
|
|
|
-
|
|
|
+ # print (provider in factory.PROVIDERS)
|
|
|
+ if 'class' in factory.PROVIDERS[provider]:
|
|
|
+ pointer = factory.PROVIDERS[provider]['class'][_id]
|
|
|
+ else:
|
|
|
+ pointer = sql.SQLReader if _id == 'read' else sql.SQLWriter
|
|
|
+ return pointer(**args)
|
|
|
|
|
|
-# else:
|
|
|
-
|
|
|
-# stream = json.dumps(params)
|
|
|
-# aClassName = ''.join([source,'(**',stream,')'])
|
|
|
-# try:
|
|
|
-# anObject = eval( aClassName)
|
|
|
-# #setattr(anObject,'name',source)
|
|
|
-# except Exception,e:
|
|
|
-# print ['Error ',e]
|
|
|
-# return anObject
|
|
|
+ return None
|