code
/
parser
mirror of https://hiplab.mc.vanderbilt.edu/git/hiplab/parser


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543
							"""
    (c) 2019 Healthcare/IO 1.0
    Vanderbilt University Medical Center, Health Information Privacy Laboratory
    https://hiplab.mc.vanderbilt.edu/healthcareio


    Authors:
        Khanhly Nguyen, 
        Steve L. Nyemba<steve.l.nyemba@vanderbilt.edu>

    License:
        MIT, terms are available at https://opensource.org/licenses/MIT

    This parser was originally written by Khanhly Nguyen during her internship. It is intended to parse x12 835 and 837 files, and other x12 formats provided the appropriate configuration is supplied.
    USAGE :
        - COMMAND LINE
        
        - EMBEDDED
"""
import hashlib
import json
import os
import sys
from itertools import islice
from multiprocessing import Process
import transport
import jsonmerge
class void:
    """Empty attribute container; instances are used as ad-hoc namespaces (e.g. ``obj.get.config``)."""
class Formatters :
    """
    Helpers that split raw x12 rows into elements and format individual
    elements (dates, procedures, diagnoses, service lines ...).

    The methods are looked up by attribute name (see Parser.get_map, which
    resolves the 'apply' and 'cast' entries of the configuration with getattr),
    which is why several aliases of the same method are registered in __init__.
    """
    def __init__(self):
        # namespace exposing configuration lookup as self.get.config
        self.get = void()
        self.get.config = self.get_config

        # namespace exposing the element parsers as self.parse.<name>
        self.parse = void()
        self.parse.sv3 = self.sv3
        self.parse.sv2 = self.sv2
        self.parse.procedure = self.procedure
        self.parse.diagnosis = self.diagnosis
        self.parse.date = self.date

        # flat aliases of the same methods
        self.sv2_parse = self.sv2
        self.sv3_parse = self.sv3
        self.format_proc = self.procedure
        self.format_diag = self.diagnosis
        self.format_date = self.date
        self.format_pos = self.pos
        self.format_time = self.time

    def split(self,row,sep='*',prefix='HI') :
        """
        This function is designed to split an x12 row into its elements

        :row        raw row (segment) of an x12 file
        :sep        element separator
        :prefix     segment identifier whose elements are all composites (sub-elements separated by '>')
        :return     list of elements (strings) with '~', '\\n' and '\\r' removed

        Rows that do not start with the prefix are split on the separator; composite
        elements ('>' inside) are expanded in place, except that :
            - composites starting with HC or AD only contribute their first 2 sub-elements
            - composites of a CLM row are kept as-is (un-split)
        Rows that start with the prefix have every element turned into [prefix, sub-elements ...]

        NOTE(review): for a prefix row only the FIRST composite is returned (value[0]),
        the other composites are dropped; this is the original behavior and is kept
        unchanged, confirm whether the full list was intended.
        Raises IndexError when a prefix row has no element after the prefix.
        """
        if row.startswith(prefix) :
            value = [ [prefix] + self.split(item,'>') for item in row.replace('~','').split(sep)[1:] ]
        else:
            value = []
            is_claim = row.startswith('CLM')
            for row_value in row.replace('~','').split(sep) :
                if '>' not in row_value :
                    value.append(row_value.replace('\n',''))
                elif row_value.startswith(('HC','AD')) :
                    value += row_value.split('>')[:2]
                elif is_claim :
                    value.append(row_value)
                else:
                    value += row_value.split('>')
            value = [xchar.replace('\r','') for xchar in value]

        return value[0] if isinstance(value[0],list) else value

    def get_config(self,config,row):
        """
        This function will return the meaningful parts of the configuration for a given item

        :config     configuration keyed by segment identifier, it may hold a 'SIMILAR' entry
                    mapping a segment identifier onto the identifier of another entry
        :row        split row (list of strings) or list of split rows (the first one is used)
        :return     configuration of the segment or {} if there is none

        If the entry found has an '@ref' attribute, the configuration returned is the
        one associated with the first element of the row that is a key of '@ref'.
        """
        _row = list(row) if isinstance(row[0],str) else list(row[0])
        segment_id = _row[0]
        _info = config[segment_id] if segment_id in config else {}

        if '@ref' in _info:
            refs = _info['@ref']
            #
            # The first matching element in row order is used, so the outcome does not
            # depend on set ordering when several elements match
            for item in _row :
                if item in refs :
                    return refs[item]
            return {}

        if not _info and 'SIMILAR' in config and segment_id in config['SIMILAR'] :
            #
            # fall back on the entry declared as similar to this segment
            _info = config[config['SIMILAR'][segment_id]]
        return _info

    def hash(self,value):
        """
        Returns the md5 hex-digest of the value, salted with the environment
        variable HEALTHCAREIO_SALT (empty salt if the variable is not set)

        NOTE(review): md5 is a weak hash; it is kept because changing the algorithm
        would change every value produced so far.
        """
        salt = os.environ.get('HEALTHCAREIO_SALT','')
        _value = str(value) + salt
        if sys.version_info[0] > 2 :
            return hashlib.md5(_value.encode('utf-8')).hexdigest()
        else:
            return hashlib.md5(_value).hexdigest()

    def suppress (self,value):
        """
        Returns the constant 'N/A' regardless of the value provided
        """
        return 'N/A'

    def date(self,value):
        """
        Formats a date as YYYY-MM-DD

        :value  CCYYMMDD or YYMMDD (assumed to be in the 2000s); for a range 'from-to'
                or a value longer than 8 characters only the part before the first '-' is used
        :return formatted date or None if the value is neither 6 nor 8 characters long
        """
        if len(value) > 8 or '-' in value:
            value = value.split('-')[0]

        if len(value) == 8 :
            return "-".join([value[:4],value[4:6],value[6:]])
        if len(value) == 6 :
            #
            # two digit year, the century is not provided
            return "-".join(['20' + value[:2],value[2:4],value[4:]])
        return None

    def time(self,value):
        """
        Placeholder, not implemented (returns None)
        """
        pass

    def sv3(self,value):
        """
        Formats a service line given a split row

        :value  split row; the procedure is either a composite 'type>code' at index 1
                followed by the amount, or type, code, amount at index 1, 2, 3
        :return {type,code,amount} with amount as a float
        """
        if '>' in value [1]:
            terms = value[1].split('>')
            return {'type':terms[0],'code':terms[1],"amount":float(value[2])}
        return {"code":value[2],"type":value[1],"amount":float(value[3])}

    def sv2(self,value):
        """
        Formats a composite procedure 'type>code[>modifier code[>modifier type]]'
        (the separator is either '>' or ':')

        :value  element (string)
        :return {code,type[,modifier]} or the value unchanged if it is not a composite
        """
        #
        # @TODO: Sometimes there's a suffix (need to inventory all the variations)
        #
        if '>' not in value and ':' not in value :
            return value

        xchar = '>' if '>' in value else ':'
        _values = value.split(xchar)
        _value = {"code":_values[1],"type":_values[0]}
        if len(_values) > 2 :
            modifier = {"code":_values[2]}
            if len(_values) > 3 :
                modifier['type'] = _values[3]
            _value['modifier'] = modifier
        return _value

    def procedure(self,value):
        """
        Formats a procedure 'type:code' (or 'type<code')

        :value  element (string)
        :return {type,code} or the value as a string if no separator is found

        NOTE(review): the second separator is '<' whereas composites elsewhere in
        this class use '>'; kept as-is, confirm against the data.
        """
        for xchar in (':','<') :
            if xchar in value :
                terms = value.split(xchar)
                return {"type":terms[0].strip(),"code":terms[1].strip()}
        return str(value)

    def diagnosis(self,value):
        """
        Formats diagnosis codes

        :value  list of composites [prefix,type,code ...]; a single composite
                (flat list of strings) is accepted as well
        :return list of {code,type}, composites with fewer than 3 elements are skipped
        """
        if value and isinstance(value[0],str) :
            #
            # a single composite was provided, the same convention is used in get_config
            value = [value]
        return [ {"code":item[2], "type":item[1]} for item in value if len(item) > 2]

    def pos(self,value):
        """
            formatting place of service information within a segment (REF)
            @TODO: In order to accomodate the other elements they need to be specified in the configuration
                Otherwise it causes problems on export

            :value  'code' or composite 'code>indicator>frequency' (separator '>' or ':')
            :return the code i.e the first sub-element
        """
        xchar = '>' if '>' in value else ':'
        return value.split(xchar)[0]
class Parser (Process):
    """
    Process that parses a list of x12 files (835, 837 ...) given a configuration
    file and writes the claims/remits found and the parsing logs to the data-store
    described in the configuration.
    """
    #
    # casts of the configuration that map directly onto a builtin (used instead of eval)
    _NUMERIC_CASTS = {'float':float,'int':int}

    def __init__(self,path):
        """
            :path       path of the configuration file (it can be absolute)

            The configuration file is a JSON document with the attributes 'parser'
            (parsing rules by file type) and 'store' (data-store arguments)
        """
        Process.__init__(self)
        self.utils  = Formatters()
        self.get    = void()
        self.get.value = self.get_map
        self.get.default_value = self.get_default_value
        with open(path) as handle :
            _config = json.load(handle)
        self._custom_config = self.get_custom(path)
        self.config = _config['parser']
        self.store  = _config['store']

        self.files = []
        self.set = void()
        self.set.files = self.set_files
        #
        # optional callbacks: pre() is called before parsing, post(content,logs) after each file
        self.emit = void()
        self.emit.pre =  None
        self.emit.post = None

    def get_custom(self,path) :
        """
        Loads the custom configuration i.e the first file (in alphabetical order) of the
        folder 'custom' located next to the configuration file

        :path   path of the configuration file (it can be absolute)
        :return content of the custom configuration (JSON) or {} if there is none
        """
        _path = path.replace('config.json','')
        if _path.endswith(os.sep) :
            _path = _path[:-1]
        _path = os.sep.join([_path,'custom'])

        _config = {}
        if os.path.exists(_path) :
            #
            # os.listdir returns files in arbitrary order, sorting makes the choice repeatable
            files = sorted(os.listdir(_path))
            if files :
                fullname = os.sep.join([_path,files[0]])
                with open(fullname) as handle :
                    _config = json.load(handle)
        return _config

    def set_files(self,files):
        """
        :files  list of paths of the files to be parsed by run
        """
        self.files = files

    def get_map(self,row,config,version=None):
        """
        Returns the object described by the configuration for a split row

        :row        split row (list of strings) or list of split rows
        :config     configuration of the segment with the attributes :
                        apply   name of a method of Formatters applied to the whole row (used if there is no map)
                        map     {field:index of the element in the row}
                        anchors elements of the row, the indexes of the map are relative to the anchor found
                        cast    {field:'float'|'int'|name of a method of Formatters}
                        syn     {value:replacement value}
        :version    if provided and found in the configuration, config[version] is used instead of config['map']
        :return     dictionary (row) or list of the objects of each row (list of rows)
        """
        handler = Formatters()
        if 'map' not in config and hasattr(handler,config['apply']):
            return getattr(handler,config['apply'])(row)

        omap = config['map'] if not version or version not in config else config[version]
        anchors = config['anchors'] if 'anchors' in config else []

        if not isinstance(row[0],str) :
            #
            # we are dealing with a complex object i.e a list of rows
            return [self.get.value(row_item,config,version) for row_item in row]

        #
        # The position of the anchor neither depends on the field nor the index, it is computed once
        found = set(anchors) & set(row) if anchors else set()
        offset = row.index(list(found)[0]) if found else 0
        synonyms = config['syn'] if 'syn' in config else None

        object_value = {}
        for key in omap :
            index = offset + omap[key]
            if index >= len(row) :
                continue
            value = row[index]

            if 'cast' in config and key in config['cast'] and value.strip() != '' :
                cast = config['cast'][key]
                if cast in self._NUMERIC_CASTS :
                    value = self._NUMERIC_CASTS[cast](value)
                elif hasattr(handler,cast):
                    value = getattr(handler,cast)(value)
                else:
                    print ("Missing Pointer ",cast)

            if isinstance(value,dict) :
                #
                # synonyms are applied to the values of the object (nested objects are left alone)
                for objkey in value :
                    if isinstance(value[objkey],dict) :
                        continue
                    if synonyms is not None and value[objkey] in synonyms :
                        value[objkey] = synonyms[value[objkey]]
                #
                # the object is merged as-is if it already holds the field, otherwise it is stored under the field
                if key not in value :
                    value = {key:value}
                object_value = dict(object_value, **value)
            else:
                if synonyms is not None and value in synonyms :
                    value = synonyms[value]
                object_value[key] = value

        return object_value

    def apply(self,content,_code) :
        """
        Parses the rows of a segment and returns the object (claim/remit) found

        :content    content of a file i.e a segment with the envelope
        :_code  837 or 835 (helps get the appropriate configuration)
        :return dictionary, depending on the configuration of a row its value is :
                    - appended to the list found under 'label'
                    - merged under 'field'
                    - merged with the object itself

        A row that can not be processed is reported (print) and skipped
        """
        util   = Formatters()
        value = {}

        for row in content :
            row     = util.split(row.replace('\n','').replace('~',''))
            _info   = util.get.config(self.config[_code][0],row)
            if self._custom_config and _code in self._custom_config:
                _cinfo   = util.get.config(self._custom_config[_code],row)
            else:
                _cinfo = {}

            if not _info :
                continue

            try:
                _info = jsonmerge.merge(_info,_cinfo)
                tmp = self.get.value(row,_info)
                if not tmp :
                    continue

                if 'label' in _info :
                    label = _info['label']
                    if isinstance(tmp,list) :
                        value[label] = tmp if label not in value else value[label] + tmp
                    elif label not in value:
                        value[label] = [tmp]
                    else:
                        value[label].append(tmp)
                    #
                    # The index of the object matters in determining the claim types, it can
                    # only be set on a dictionary (lists and strings used to raise a TypeError here)
                    if isinstance(tmp,dict) :
                        tmp['_index'] = len(value[label]) -1
                elif 'field' in _info :
                    value = jsonmerge.merge(value,{_info['field']:tmp})
                else:
                    value = dict(value,**tmp)
            except Exception as e :
                #
                # best effort: the row is reported and the remainder of the segment is processed
                print ('__',e.args)

        return value if value else {}

    def get_default_value(self,content,_code):
        """
        Returns the values shared by all the claims/remits of a file, extracted from the envelope

        :content    first 4 rows of the file
        :_code      837 or 835 (helps get the appropriate configuration)
        :return     dictionary with the parsed 4th row (if configured), 'category', 'submitted'
                    and 'payer_id' (835) or 'provider_id' (any other code)

        Raises IndexError if there are fewer than 4 rows or the rows are shorter than expected
        """
        util = Formatters()
        TOP_ROW = content[1].split('*')
        CATEGORY= content[2].split('*')[1].strip()
        #
        # the last element of the 2nd row is the version, line endings (\r\n included) are removed
        VERSION         = TOP_ROW[-1].replace('~','').replace('\n','').replace('\r','')
        SUBMITTED_DATE  = util.parse.date(TOP_ROW[4])
        SENDER_ID       = TOP_ROW[2]
        row = util.split(content[3])

        _info = util.get_config(self.config[_code][0],row)

        value = self.get.value(row,_info,VERSION) if _info else {}
        version_parts = VERSION.split('X')
        value['category'] = {"setid": CATEGORY,"version":'X'+version_parts[1],"id":version_parts[0].strip()}
        value["submitted"] = SUBMITTED_DATE
        if _code== '835' :
            value['payer_id'] = SENDER_ID
        else:
            value['provider_id'] = SENDER_ID
        return value

    def read(self,filename) :
        """
        :formerly get_content
        This function returns the content of the EDI file parsed given the configuration specified.
        It is capable of identifying the type of a file given its content (3rd row)

        :filename   location of the file
        :return     claims,logs,code
                        - (list of claims/remits, logs, file type) on success
                        - ([],logs,None) if the file can not be parsed, the log holds the error message
                        - (None,logs,None) if the file has fewer than 3 rows
        """
        logs    = []
        claims  = []
        #
        # These are set before the file is opened so that the error handler and
        # the rate can always refer to them
        _code   = "unknown"
        index   = 0
        handle  = None
        try:
            handle = open(filename.strip(),errors='ignore')
            rows = handle
            INITIAL_ROWS = list(islice(handle,4))
            if len(INITIAL_ROWS) == 1 :
                #
                # The file is written on a single line, the rows are separated by ~
                # NOTE(review): in this case the first 4 rows are iterated over again below (kept as-is)
                rows = INITIAL_ROWS[0].split('~')
                INITIAL_ROWS = rows[:4]
            if len(INITIAL_ROWS) < 3 :
                return None,[{"name":filename,"completed":False}],None

            _code = INITIAL_ROWS[2].split('*')[1].strip()
            #
            # prefix of the row that starts a claim/remit
            section  = self.config[_code][0]['SECTION'].strip()
            DEFAULT_VALUE = self.get.default_value(INITIAL_ROWS,_code)
            DEFAULT_VALUE['name'] = filename.strip()

            segment = []
            _toprows = []
            for row in rows :
                row = row.replace('\r','')
                starts_section = row.startswith(section)
                if not segment :
                    #
                    # rows found before the first section are kept aside
                    if starts_section :
                        segment = [row]
                    else:
                        _toprows.append(row)
                elif not starts_section :
                    segment.append(row)
                elif len(segment) > 1 :
                    #
                    # a new section starts, the current segment is complete
                    _claim = self.apply(segment,_code)
                    if _claim :
                        _claim['index'] = index
                        claims.append(dict(DEFAULT_VALUE,**_claim))
                    segment = [row]
                    index += 1
                #
                # NOTE(review): a section row that immediately follows another section row is ignored (kept as-is)

            if not segment :
                logs.append ({"parse":_code,"completed":False,"name":filename,"msg":"no row starting with %s was found" % section})
                return [],logs,None
            #
            # Handling the last claim found, the rows found before the first section are merged into it
            claim = self.apply(segment,_code)
            if claim :
                claim['index'] = len(claims)
                claim = jsonmerge.merge(claim,self.apply(_toprows,_code))
                claims.append(dict(DEFAULT_VALUE,**claim))

        except Exception as e:
            msg = e.args[0] if e.args else str(e)
            logs.append ({"parse":_code,"completed":False,"name":filename,"msg":msg})
            return [],logs,None
        finally:
            if handle is not None :
                handle.close()

        rate = 0 if len(claims) == 0 else (1 + index)/len(claims)
        logs.append ({"parse":"claims" if _code == '837' else 'remits',"completed":True,"name":filename,"rate":rate})
        return claims,logs,_code

    def run(self):
        """
        Entry point of the process: parses every file registered (set.files) and stores
        the outcome, emit.pre is called once before parsing if it is set
        """
        if self.emit.pre :
            self.emit.pre()

        for filename in self.files :
            content,logs,_code = self.read(filename)
            self.finish(content,logs,_code)

    def finish(self,content,logs,_code) :
        """
        Writes the content and the logs to the data-store and calls emit.post(content,logs) if it is set

        :content    list of claims/remits (nothing is written if empty)
        :logs       list of logs (nothing is written if empty)
        :_code      file type, 837 is written to 'claims' any other value to 'remits'

        NOTE: the destination of the content is set on self.store itself (original behavior),
        the logs are written with a copy of it
        """
        args = self.store
        _args = json.loads(json.dumps(self.store))
        target = 'claims' if _code == '837' else 'remits'
        #
        # mongo writers expect a collection ('doc'), the others a 'table'
        attr = 'doc' if args['type'] == 'mongo.MongoWriter' else 'table'
        args['args'][attr] = target
        _args['args'][attr] = 'logs'

        if content      :
            writer = transport.factory.instance(**args)
            try:
                writer.write(content)
            finally:
                writer.close()
        if logs :
            logger = transport.factory.instance(**_args)
            try:
                logger.write(logs)
            finally:
                logger.close()
        if self.emit.post :
            self.emit.post(content,logs)


# p = Parser('/home/steve/.healthcareio/config.json')
# p.set.files(['../../data/small/claims/ssiUB1122042711220427127438.clm_191122T043504'])
# path = '../../data/small/claims/ssiUB1122042711220427127438.clm_191122T043504'
# path = '../../data/small/claims/problems-with-procs'
# path = '../../data/small/remits/1SG03927258.dat_181018T074559'

# _path = "../../data/small/remits/1TR21426701.dat_180703T074559"
# p.start()
# p.join()
# claims,logs = p.read(path)
# print (json.dumps(claims[3]))
# print (logs)