123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124 |
- """
- This file contains classes and functions that extract data from running processes (like top does) and store them into a data store of the calling code's choice
- dependencies:
- - ps and awk (on the os), which provide the top-like process listing
- @TODO:
- Test this thing on Windows to see if it works
- """
- import pandas as pd
- import numpy as np
- import subprocess
- import os
- import datetime
- # from transport import factory
- import sys
- import hashlib
class Util:
    """
    Formatting helpers that turn the delimited rows produced by the process
    listing command into normalized, delimited records.
    """

    def app(self, stream):
        """
        Format the application name; sometimes the name has parameters or OS separators.

        The boundary between the command and its arguments is guessed:
        if the first token is an existing path, the command is that token alone,
        otherwise the command is every token except the last one.

        :param stream: list of tokens making up the command line
        :return: [name, cmd, args] where name is the executable's base name,
                 cmd the command text and args the argument text
        """
        index = 1 if os.path.exists(" ".join(stream[:1])) else len(stream) - 1
        cmd = " ".join(stream[:index]) if index > 0 else " ".join(stream)

        # Always split on os.sep: mixing '/' with os.sep can yield lists of
        # different lengths on platforms where the two differ.
        basename = cmd.split(os.sep)[-1]
        if ' ' in basename:
            pieces = basename.split(' ')
            name = pieces[0]
            # NOTE(review): tokens in stream[index:] are not included in args on
            # this branch -- kept as-is, confirm whether that is intended.
            args = " ".join(pieces[1:])
        else:
            name = basename
            args = " ".join(stream[index:]) if index > 0 else ""

        return [name, cmd, args]

    def parse(self, rows, xchar=';'):
        """
        Parse the rows returned by the execution of the listing command.

        The first row is treated as the header and kept as-is; every other row
        is rebuilt as: the first five fields, the sixth field, then the
        [name, cmd, args] triple computed by ``app`` from the remaining fields.
        Rows that contain nothing but delimiters are dropped.

        :param rows: list of delimited strings, header first
        :param xchar: field delimiter
        :return: list of delimited strings (header included)
        """
        records = []
        TIME_INDEX = 5
        ARGS_INDEX = 6

        # enumerate() identifies the header by position; looking the row up by
        # value is quadratic and mistakes duplicates of row 0 for a header.
        for position, item in enumerate(rows):
            parts = item.split(xchar)
            if position != 0:
                row = parts[:TIME_INDEX]
                row.append(' '.join(parts[TIME_INDEX:ARGS_INDEX]))
                row += self.app(parts[ARGS_INDEX:])
            else:
                row = parts
            line = xchar.join(row).strip()
            # Use the actual delimiter so blank rows are dropped for any xchar
            if line.replace(xchar, ""):
                records.append(line)
        return records
-
-
def read(**args):
    """
    This function performs the actual read of process information by running
    ``ps`` and formatting its output.

    :param name:   (optional) comma separated application names to keep
    :param cols:   (optional) collection of column names to keep in the output
    :param logger: (optional) callable invoked as ``logger(data=<DataFrame>)``
                   with the formatted frame before it is returned
    @return list of records with the keys
            {pid,user,mem,cpu,status,started,name,cmd,args,date,time,node}
            (restricted to ``cols`` when provided), an empty list when there is
            nothing to report, or None when the read failed (the error is printed)
    """
    # Imported here so this function brings its own stdlib dependencies
    import csv
    import io

    cmd = "ps -eo pid,user,pmem,pcpu,stat,etime,args|awk 'OFS=\";\" {$1=$1; if($5 > 9) print }'"
    xchar = ";"
    columns = ['pid', 'user', 'mem', 'cpu', 'status', 'started', 'name', 'cmd', 'args']
    try:
        handler = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
        stream = handler.communicate()[0]
        # Decode the raw bytes instead of parsing their repr(), which leaves
        # "b'" / "'" artifacts on the first and last rows
        if isinstance(stream, bytes):
            stream = stream.decode('utf-8', 'replace')
        rows = stream.split('\n')

        formatter = Util()
        # The first parsed row is the ps header; the column names are supplied
        # explicitly below so every remaining row is kept as data
        records = formatter.parse(rows, xchar)[1:]
        if not records:
            return []

        now = datetime.datetime.now()
        df = pd.read_csv(
            io.StringIO("\n".join(records)),
            sep=xchar,
            header=None,
            names=columns,
            # command lines may contain quote characters; treat them literally
            quoting=csv.QUOTE_NONE,
        )
        df['date'] = now.strftime('%m-%d-%Y')
        df['time'] = now.strftime('%H:%M:%S')
        df['node'] = os.uname()[1]

        #
        # We should filter the name of the apps we are interested in here (returning the full logs)
        # @TODO: Add filter here to handle filter on different columns
        #
        if 'name' in args:
            wanted = [name.strip() for name in args['name'].split(',')]
            matches = [df[df['name'] == name] for name in wanted]
            matches = [frame for frame in matches if not frame.empty]
            # Keep the columns even when nothing matches so the steps below
            # still work and an empty list is returned
            df = (pd.concat(matches) if matches else df.iloc[0:0]).copy()

        #
        # For security reasons let's hash the args column (MD5 here; sha256 would also do)
        #
        df['args'] = [hashlib.md5(str(value).encode('utf-8')).hexdigest() for value in df['args'].tolist()]

        STATUS = {'R':'RUNNING','Z':'DEAD','D':'STASIS','S':'SLEEP','Sl':'SLEEP','Ss':'SLEEP','W':'PAGING','T':'DEAD'}
        # ps appends modifier characters to the state letter (e.g. "S+", "Ssl"):
        # try the exact value first, then fall back to the leading state letter
        df['status'] = df['status'].apply(
            lambda value: STATUS.get(value, STATUS.get(str(value)[:1], 'UNKNOWN'))
        )

        if args.get('cols'):
            requested = set(args['cols'])
            # Preserve the frame's column order (a set intersection does not)
            _cols = [column for column in df.columns if column in requested]
            if _cols:
                df = df[_cols]
        #
        # we return a list of objects (no data-frames)
        logger = args.get('logger')
        if logger is not None:
            logger(data=df)
        return df.to_dict(orient='records')
    except Exception as e:
        # Best-effort read: report the problem and let the caller see None
        print(e)
        return None
-
if __name__ == '__main__':
    # Executed directly as a script (not imported): read the process
    # information once and write the resulting records to standard output.
    records = read()
    print(records)
|