__init__.py 1.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263
  1. """
This module retrieves summary information about a folder: its total size
and a hash computed over the contents of its files, i.e. {files,size,hash}
  4. """
  5. import subprocess
  6. import sys
  7. import re
  8. import os
  9. import pandas as pd
  10. import io
  11. import datetime
  12. class Util :
  13. def size(self,stream):
  14. PATTERN = '(^.+)([A-Z]+$)'
  15. value,units = re.match('^(.+)([A-Z]+$)',stream).groups()
  16. value = float(value)
  17. if 'G' == units :
  18. value *= 1000
  19. elif 'K' == units:
  20. value /= 1000
  21. units = 'MB'
  22. return {"size":value,"units":units}
  23. def content(self,stream):
  24. return {"content":stream.split(' ')[0].strip()}
  25. def read(**args):
  26. """
  27. The path can also take in regular expressions
  28. """
  29. cmd = {"size":"du -sh :path","content":"find :path -type f -exec md5sum {} + | sort -z|md5sum"}
  30. r = {}
  31. util = Util()
  32. for key in cmd :
  33. _cmd = cmd[key]
  34. handler = subprocess.Popen(_cmd.replace(':path',args['path']),shell=True,stdout=subprocess.PIPE,encoding='utf-8')
  35. stream = handler.communicate()[0]
  36. if sys.version_info[0] > 2 :
  37. rows = str(stream).split('\n')
  38. else:
  39. rows = stream.split('\n')
  40. if key == 'size' :
  41. rows = rows[0]
  42. rows = util.size(rows.split('\t')[0])
  43. elif key == 'content' :
  44. #
  45. # There is a hash key that is generated and should be extracted
  46. rows = rows[0]
  47. rows = util.content(rows)
  48. r = dict(r, **rows)
  49. r['path'] = args['path']
  50. r['name'] = args['path'].split(os.sep)[-1:][0]
  51. r['node'] = os.uname()[1]
  52. r['date'] = datetime.datetime.now().strftime('%m-%d-%Y')
  53. r['time'] = datetime.datetime.now().strftime('%H:%M:%S')
  54. return pd.DataFrame([r])
  55. pass