__init__.py 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693
"""
(c) 2019 Healthcare/IO 1.0
Vanderbilt University Medical Center, Health Information Privacy Laboratory
https://hiplab.mc.vanderbilt.edu/healthcareio

Authors:
    Khanhly Nguyen,
    Steve L. Nyemba <steve.l.nyemba@vanderbilt.edu>

License:
    MIT, terms are available at https://opensource.org/licenses/MIT

This parser was originally written by Khanhly Nguyen for her internship. It is intended to parse x12 835, 837 and other formats, provided the appropriate configuration is supplied.

USAGE :
    - COMMAND LINE
    - EMBEDDED
"""
  15. import hashlib
  16. import json
  17. import os
  18. import sys
  19. from itertools import islice
  20. from multiprocessing import Process
  21. import transport
  22. import jsonmerge
  23. import copy
  24. class void :
  25. pass
  26. class Formatters :
  27. def __init__(self):
  28. # self.config = config
  29. self.get = void()
  30. self.get.config = self.get_config
  31. self.parse = void()
  32. self.parse.sv3 = self.sv3
  33. self.parse.sv2 = self.sv2
  34. self.sv2_parser = self.sv2
  35. self.sv3_parser = self.sv3
  36. self.sv3_parse = self.sv3
  37. self.format_proc = self.procedure
  38. self.format_diag = self.diagnosis
  39. self.parse.procedure = self.procedure
  40. self.parse.diagnosis = self.diagnosis
  41. self.parse.date = self.date
  42. self.format_date = self.date
  43. self.format_pos = self.pos
  44. self.format_time = self.time
  45. def split(self,row,sep='*',prefix='HI') :
  46. """
  47. This function is designed to split an x12 row and
  48. """
  49. value = []
  50. if row.startswith(prefix) is False:
  51. for row_value in row.replace('~','').split(sep) :
  52. if '>' in row_value and not row_value.startswith('HC'):
  53. # if row_value.startswith('HC') or row_value.startswith('AD'):
  54. if row_value.startswith('AD'):
  55. value += row_value.split('>')[:2]
  56. pass
  57. else:
  58. value += [row_value]
  59. # value += row_value.split('>') if row.startswith('CLM') is False else [row_value]
  60. else :
  61. value.append(row_value.replace('\n',''))
  62. value = [xchar.replace('\r','') for xchar in value] #row.replace('~','').split(sep)
  63. else:
  64. value = [ [prefix]+ self.split(item,'>') for item in row.replace('~','').split(sep)[1:] ]
  65. return value if type(value) == list and type(value[0]) != list else value[0]
  66. def get_config(self,config,row):
  67. """
  68. This function will return the meaningfull parts of the configuration for a given item
  69. """
  70. _row = list(row) if type(row[0]) == str else list(row[0])
  71. _info = config[_row[0]] if _row[0] in config else {}
  72. _rinfo = {}
  73. key = None
  74. if '@ref' in _info:
  75. keys = list(set(_row) & set(_info['@ref'].keys()))
  76. if keys :
  77. _rinfo = {}
  78. for key in keys :
  79. _rinfo = jsonmerge.merge(_rinfo,_info['@ref'][key])
  80. return _rinfo
  81. # key = key[0]
  82. # return _info['@ref'][key]
  83. else:
  84. return {}
  85. if not _info and 'SIMILAR' in config:
  86. #
  87. # Let's look for the nearest key using the edit distance
  88. if _row[0] in config['SIMILAR'] :
  89. key = config['SIMILAR'][_row[0]]
  90. _info = config[key]
  91. return _info
  92. def hash(self,value):
  93. salt = os.environ['HEALTHCAREIO_SALT'] if 'HEALTHCAREIO_SALT' in os.environ else ''
  94. _value = str(value)+ salt
  95. if sys.version_info[0] > 2 :
  96. return hashlib.md5(_value.encode('utf-8')).hexdigest()
  97. else:
  98. return hashlib.md5(_value).hexdigest()
  99. def suppress (self,value):
  100. return 'N/A'
  101. def date(self,value):
  102. value = value if type(value) != list else "-".join(value)
  103. if len(value) > 8 or '-' in value:
  104. #
  105. # This is the case of a thru date i.e the first part should be provided in a 435 entry
  106. #
  107. fdate = "-".join([value[:8][:4],value[:8][4:6],value[:8][6:8]])
  108. tdate = "-".join([value[9:][:4],value[9:][4:6],value[9:][6:8]])
  109. return {"from":fdate,"to":tdate}
  110. if len(value) == 8 :
  111. year = value[:4]
  112. month = value[4:6]
  113. day = value[6:]
  114. return "-".join([year,month,day])[:10] #{"year":year,"month":month,"day":day}
  115. elif len(value) == 6 :
  116. year = '20' + value[:2]
  117. month = value[2:4]
  118. day = value[4:]
  119. elif value.isnumeric() and len(value) >= 10:
  120. #
  121. # Here I a will assume we have a numeric vale
  122. year = value[:4]
  123. month= value[4:6]
  124. day = value[6:8]
  125. else:
  126. #
  127. # We have a date formatting issue
  128. return value
  129. return "-".join([year,month,day])
  130. def time(self,value):
  131. pass
  132. def sv3(self,value):
  133. if '>' in value [1]:
  134. terms = value[1].split('>')
  135. return {'type':terms[0],'code':terms[1],"amount":float(value[2])}
  136. else:
  137. return {"code":value[2],"type":value[1],"amount":float(value[3])}
  138. def sv2(self,value):
  139. #
  140. # @TODO: Sometimes there's a suffix (need to inventory all the variations)
  141. #
  142. if '>' in value or ':' in value:
  143. xchar = '>' if '>' in value else ':'
  144. _values = value.split(xchar)
  145. modifier = {}
  146. if len(_values) > 2 :
  147. modifier= {"code":_values[2]}
  148. if len(_values) > 3 :
  149. modifier['type'] = _values[3]
  150. _value = {"code":_values[1],"type":_values[0]}
  151. if modifier :
  152. _value['modifier'] = modifier
  153. return _value
  154. else:
  155. return value
  156. def procedure(self,value):
  157. for xchar in [':','<','|','>'] :
  158. if xchar in value and len(value.split(xchar)) > 1 :
  159. #_value = {"type":value.split(':')[0].strip(),"code":value.split(':')[1].strip()}
  160. _value = {"type":value.split(xchar)[0].strip(),"code":value.split(xchar)[1].strip()}
  161. if len(value.split(xchar)) >2 :
  162. index = 1;
  163. for modifier in value.split(xchar)[2:] :
  164. _value['modifier_'+str(index)] = modifier
  165. index += 1
  166. break
  167. else:
  168. _value = str(value)
  169. return _value
  170. def diagnosis(self,value):
  171. return [ {"code":item[2], "type":item[1]} for item in value if len(item) > 1]
  172. def pos(self,value):
  173. """
  174. formatting place of service information within a segment (REF)
  175. @TODO: In order to accomodate the other elements they need to be specified in the configuration
  176. Otherwise it causes problems on export
  177. """
  178. xchar = '>' if '>' in value else ':'
  179. x = value.split(xchar)
  180. x = {"place_of_service":x[0],"indicator":x[1],"frequency":x[2]} if len(x) == 3 else {"place_of_service":x[0],"indicator":None,"frequency":None}
  181. return x
  182. class Parser (Process):
  183. def __init__(self,path):
  184. """
  185. :path path of the configuration file (it can be absolute)
  186. """
  187. Process.__init__(self)
  188. self.utils = Formatters()
  189. self.get = void()
  190. self.get.value = self.get_map
  191. self.get.default_value = self.get_default_value
  192. _config = json.loads(open(path).read())
  193. self._custom_config = self.get_custom(path)
  194. self.config = _config['parser']
  195. self.store = _config['store']
  196. self.cache = {}
  197. self.files = []
  198. self.set = void()
  199. self.set.files = self.set_files
  200. self.emit = void()
  201. self.emit.pre = None
  202. self.emit.post = None
  203. def get_custom(self,path) :
  204. """
  205. :path path of the configuration file (it can be absolute)
  206. """
  207. #
  208. #
  209. _path = path.replace('config.json','')
  210. if _path.endswith(os.sep) :
  211. _path = _path[:-1]
  212. _config = {}
  213. _path = os.sep.join([_path,'custom'])
  214. if os.path.exists(_path) :
  215. files = os.listdir(_path)
  216. if files :
  217. fullname = os.sep.join([_path,files[0]])
  218. _config = json.loads ( (open(fullname)).read() )
  219. return _config
  220. def set_files(self,files):
  221. self.files = files
  222. def get_map(self,row,config,version=None):
  223. # label = config['label'] if 'label' in config else None
  224. handler = Formatters()
  225. if 'map' not in config and hasattr(handler,config['apply']):
  226. pointer = getattr(handler,config['apply'])
  227. object_value = pointer(row)
  228. return object_value
  229. #
  230. # Pull the goto configuration that skips rows
  231. #
  232. omap = config['map'] if not version or version not in config else config[version]
  233. anchors = config['anchors'] if 'anchors' in config else []
  234. rewrite = config['rewrite'] if 'rewrite' in config else {}
  235. if type(row[0]) == str:
  236. object_value = {}
  237. for key in omap :
  238. index = omap[key]
  239. if anchors and set(anchors) & set(row):
  240. _key = list(set(anchors) & set(row))[0]
  241. aindex = row.index(_key)
  242. index = aindex + index
  243. if index < len(row) :
  244. value = row[index]
  245. if 'cast' in config and key in config['cast'] and value.strip() != '' :
  246. if config['cast'][key] in ['float','int']:
  247. try:
  248. value = eval(config['cast'][key])(value)
  249. except Exception as e:
  250. pass
  251. #
  252. # Sometimes shit hits the fan when the anchor is missing
  253. # This is typical but using the hardened function helps circumvent this (SV2,SV3)
  254. #
  255. elif hasattr(handler,config['cast'][key]):
  256. pointer = getattr(handler,config['cast'][key])
  257. value = pointer(value)
  258. else:
  259. print ("Missing Pointer ",key,config['cast'])
  260. if type(value) == dict :
  261. for objkey in value :
  262. if type(value[objkey]) == dict :
  263. continue
  264. if 'syn' in config and value[objkey] in config['syn'] :
  265. # value[objkey] = config['syn'][ value[objkey]]
  266. pass
  267. if key in rewrite :
  268. _key = rewrite[key]
  269. if _key in value :
  270. value = value[_key]
  271. else:
  272. value = ""
  273. value = {key:value} if key not in value else value
  274. else:
  275. if 'syn' in config and value in config['syn'] :
  276. # value = config['syn'][value]
  277. pass
  278. if type(value) == dict :
  279. # object_value = dict(object_value, **value)
  280. object_value = jsonmerge.merge(object_value, value)
  281. else:
  282. object_value[key] = value
  283. else:
  284. #
  285. # we are dealing with a complex object
  286. object_value = []
  287. for row_item in row :
  288. value = self.get.value(row_item,config,version)
  289. object_value.append(value)
  290. #
  291. # We need to add the index of the object it matters in determining the claim types
  292. #
  293. # object_value.append( list(get_map(row_item,config,version)))
  294. # object_value = {label:object_value}
  295. return object_value
  296. def set_cache(self,tmp,_info) :
  297. """
  298. insert into cache a value that the, these are in reference to a loop
  299. """
  300. if 'cache' in _info :
  301. key = _info['cache']['key']
  302. value=_info['cache']['value']
  303. field = _info['cache']['field']
  304. if value in tmp :
  305. self.cache [key] = {field:tmp[value]}
  306. pass
  307. def get_cache(self,row) :
  308. """
  309. retrieve cache element for a current
  310. """
  311. key = row[0]
  312. return self.cache[key] if key in self.cache else {}
  313. def apply(self,content,_code) :
  314. """
  315. :content content of a file i.e a segment with the envelope
  316. :_code 837 or 835 (helps get the appropriate configuration)
  317. """
  318. util = Formatters()
  319. # header = default_value.copy()
  320. value = {}
  321. for row in content[:] :
  322. row = util.split(row.replace('\n','').replace('~',''))
  323. _info = util.get.config(self.config[_code][0],row)
  324. if self._custom_config and _code in self._custom_config:
  325. _cinfo = util.get.config(self._custom_config[_code],row)
  326. else:
  327. _cinfo = {}
  328. if _info or _cinfo:
  329. try:
  330. _info = jsonmerge.merge(_info,_cinfo)
  331. tmp = self.get.value(row,_info)
  332. if not tmp :
  333. continue
  334. #
  335. # At this point we have the configuration and the row parsed into values
  336. # We should check to see if we don't have anything in the cache to be added to it
  337. #
  338. if row[0] in self.cache :
  339. tmp = jsonmerge.merge(tmp,self.get_cache(row))
  340. if 'label' in _info :
  341. label = _info['label']
  342. if type(tmp) == list :
  343. value[label] = tmp if label not in value else value[label] + tmp
  344. else:
  345. # if 'DTM' in row :
  346. # print ([label,tmp,label in value])
  347. if label not in value :
  348. value[label] = []
  349. value[label].append(tmp)
  350. # if label not in value:
  351. # value[label] = [tmp]
  352. # else:
  353. # value[label].append(tmp)
  354. if '_index' not in tmp :
  355. #
  356. # In case we asked it to be overriden, then this will not apply
  357. # X12 occasionally requires references to other elements in a loop (alas)
  358. #
  359. tmp['_index'] = len(value[label]) -1
  360. elif 'field' in _info :
  361. name = _info['field']
  362. # value[name] = tmp
  363. # value = jsonmerge.merge(value,{name:tmp})
  364. value = dict(value,**{name:tmp})
  365. else:
  366. value = dict(value,**tmp)
  367. pass
  368. except Exception as e :
  369. print (e.args[0])
  370. # print ('__',(dir(e.args)))
  371. pass
  372. #
  373. # At this point the object is completely built,
  374. # if there ar any attributes to be cached it will be done here
  375. #
  376. if 'cache' in _info :
  377. self.set_cache(tmp,_info)
  378. return value if value else {}
  379. def get_default_value(self,content,_code):
  380. util = Formatters()
  381. TOP_ROW = content[1].split('*')
  382. SUBMITTED_DATE = util.parse.date(TOP_ROW[4])
  383. CATEGORY= content[2].split('*')[1].strip()
  384. VERSION = content[1].split('*')[-1].replace('~','').replace('\n','')
  385. SENDER_ID = TOP_ROW[2]
  386. row = util.split(content[3])
  387. _info = util.get_config(self.config[_code][0],row)
  388. value = self.get.value(row,_info,VERSION) if _info else {}
  389. value['category'] = {"setid": _code,"version":'X'+VERSION.split('X')[1],"id":VERSION.split('X')[0].strip()}
  390. value["submitted"] = SUBMITTED_DATE
  391. value['sender_id'] = SENDER_ID
  392. value = dict(value,**self.apply(content,_code))
  393. # Let's parse this for default values
  394. return value #jsonmerge.merge(value,self.apply(content,_code))
  395. def read(self,filename) :
  396. """
  397. :formerly get_content
  398. This function returns the of the EDI file parsed given the configuration specified. it is capable of identifying a file given the content
  399. :section loop prefix (HL, CLP)
  400. :config configuration with formatting rules, labels ...
  401. :filename location of the file
  402. """
  403. # section = section if section else config['SECTION']
  404. logs = []
  405. claims = []
  406. _code = 'UNKNOWN'
  407. try:
  408. self.cache = {}
  409. file = open(filename.strip())
  410. file = file.read().split('CLP')
  411. _code = '835'
  412. section = 'CLP'
  413. if len(file) == 1 :
  414. file = file[0].split('CLM') #.split('HL')
  415. _code = '837'
  416. section = 'CLM' #'HL'
  417. INITIAL_ROWS = file[0].split(section)[0].split('\n')
  418. if len(INITIAL_ROWS) == 1 :
  419. INITIAL_ROWS = INITIAL_ROWS[0].split('~')
  420. # for item in file[1:] :
  421. # item = item.replace('~','\n')
  422. # print (INITIAL_ROWS)
  423. DEFAULT_VALUE = self.get.default_value(INITIAL_ROWS,_code)
  424. DEFAULT_VALUE['name'] = filename.strip()
  425. file = section.join(file).split('\n')
  426. if len(file) == 1:
  427. file = file[0].split('~')
  428. #
  429. # In the initial rows, there's redundant information (so much for x12 standard)
  430. # index 1 identifies file type i.e CLM for claim and CLP for remittance
  431. segment = []
  432. index = 0;
  433. _toprows = []
  434. _default = None
  435. for row in file :
  436. row = row.replace('\r','')
  437. # if not segment and not row.startswith(section):
  438. # _toprows += [row]
  439. if row.startswith(section) and not segment:
  440. segment = [row]
  441. continue
  442. elif segment and not row.startswith(section):
  443. segment.append(row)
  444. if len(segment) > 1 and row.startswith(section):
  445. #
  446. # process the segment somewhere (create a thread maybe?)
  447. #
  448. _claim = self.apply(segment,_code)
  449. if _claim :
  450. _claim['index'] = index #len(claims)
  451. # claims.append(dict(DEFAULT_VALUE,**_claim))
  452. #
  453. # schema = [ {key:{"mergeStrategy":"append" if list( type(_claim[key])) else "overwrite"}} for key in _claim.keys()] # if type(_claim[key]) == list]
  454. # _schema = set(DEFAULT_VALUE.keys()) - schema
  455. # if schema :
  456. # schema = {"properties":dict.fromkeys(schema,{"mergeStrategy":"append"})}
  457. # else:
  458. # schema = {"properties":{}}
  459. # schema = jsonmerge.merge(schema['properties'],dict.fromkeys(_schema,{"mergeStrategy":"overwrite"}))
  460. schema = {"properties":{}}
  461. for attr in _claim.keys() :
  462. schema['properties'][attr] = {"mergeStrategy": "append" if type(_claim[attr]) == list else "overwrite" }
  463. merger = jsonmerge.Merger(schema)
  464. _baseclaim = None
  465. _baseclaim = merger.merge(_baseclaim,copy.deepcopy(DEFAULT_VALUE))
  466. _claim = merger.merge(_baseclaim,_claim)
  467. # _claim = merger.merge(DEFAULT_VALUE.copy(),_claim)
  468. claims.append( _claim)
  469. segment = [row]
  470. index += 1
  471. pass
  472. #
  473. # Handling the last claim found
  474. if segment and segment[0].startswith(section) :
  475. # default_claim = dict({"name":index},**DEFAULT_VALUE)
  476. claim = self.apply(segment,_code)
  477. if claim :
  478. claim['index'] = len(claims)
  479. # schema = [key for key in claim.keys() if type(claim[key]) == list]
  480. # if schema :
  481. # schema = {"properties":dict.fromkeys(schema,{"mergeStrategy":"append"})}
  482. # else:
  483. # print (claim.keys())
  484. # schema = {}
  485. #
  486. # @TODO: Fix merger related to schema (drops certain fields ... NOT cool)
  487. # merger = jsonmerge.Merger(schema)
  488. # top_row_claim = self.apply(_toprows,_code)
  489. # claim = merger.merge(claim,self.apply(_toprows,_code))
  490. # claims.append(dict(DEFAULT_VALUE,**claim))
  491. schema = {"properties":{}}
  492. for attr in claim.keys() :
  493. schema['properties'][attr] = {"mergeStrategy": "append" if type(claim[attr]) == list else "overwrite" }
  494. merger = jsonmerge.Merger(schema)
  495. _baseclaim = None
  496. _baseclaim = merger.merge(_baseclaim,copy.deepcopy(DEFAULT_VALUE))
  497. claim = merger.merge(_baseclaim,claim)
  498. claims.append(claim)
  499. # claims.append(merger.merge(DEFAULT_VALUE.copy(),claim))
  500. if type(file) != list :
  501. file.close()
  502. # x12_file = open(filename.strip(),errors='ignore').read().split('\n')
  503. except Exception as e:
  504. logs.append ({"parse":_code,"completed":False,"name":filename,"msg":e.args[0]})
  505. return [],logs,None
  506. rate = 0 if len(claims) == 0 else (1 + index)/len(claims)
  507. logs.append ({"parse":"claims" if _code == '837' else 'remits',"completed":True,"name":filename,"rate":rate})
  508. # self.finish(claims,logs,_code)
  509. return claims,logs,_code
  510. def run(self):
  511. if self.emit.pre :
  512. self.emit.pre()
  513. for filename in self.files :
  514. content,logs,_code = self.read(filename)
  515. self.finish(content,logs,_code)
  516. def finish(self,content,logs,_code) :
  517. args = self.store
  518. _args = json.loads(json.dumps(self.store))
  519. if args['type'] == 'mongo.MongoWriter' :
  520. args['args']['doc'] = 'claims' if _code == '837' else 'remits'
  521. _args['args']['doc'] = 'logs'
  522. else:
  523. args['args']['table'] = 'claims' if _code == '837' else 'remits'
  524. _args['args']['table'] = 'logs'
  525. if content :
  526. writer = transport.factory.instance(**args)
  527. writer.write(content)
  528. writer.close()
  529. if logs :
  530. logger = transport.factory.instance(**_args)
  531. logger.write(logs)
  532. logger.close()
  533. if self.emit.post :
  534. self.emit.post(content,logs)