Package checkm :: Module checkm
[hide private]
[frames] | [no frames]

Source Code for Module checkm.checkm

  1  #!/usr/bin/python 
  2  # -*- coding: utf-8 -*- 
  3   
  4  """Checksumming convenience classes 
  5   
  6  TODO! Sorry! 
  7   
  8                  [@]SourceFileOrURL  Alg     Digest  Length   ModTime   TargetFileOrURL 
  9  TOKEN NUMBER:    1                  2       3       4        5         6 
 10   
 11  """ 
 12   
 13  from __future__ import with_statement 
 14   
# Checkm column positions (0-based) -> field names.  A manifest line holds up
# to six whitespace-separated fields in this order (see module docstring).
COLUMNS = { 0:"SourceFileOrURL",
            1:"Alg",
            2:"Digest",
            3:"Length",
            4:"ModTime",
            5:"TargetFileOrURL",
            }
 22   
 23   
 24  import os, sys 
 25  from stat import * 
 26   
 27  import re 
 28   
 29  from collections import defaultdict 
 30   
 31  import hashlib 
 32   
 33  import codecs 
 34   
 35  import logging 
 36   
# NOTE(review): calling basicConfig at import time configures root logging at
# INFO level as a side effect for every program importing this module.
logging.basicConfig(level=logging.INFO)

# Module-wide logger used by all classes below.
logger = logging.getLogger('checkm')
 40   
class NotFound(Exception):
    """The item or directory was either not found, or not accessible."""

    def __init__(self, *arg, **kw):
        """Record the positional and keyword context describing what was
        missing, for later display."""
        self.context = (arg, kw)

    def __repr__(self):
        """Render the stored (args, kwargs) context tuple."""
        return str(self.context)

    def __str__(self):
        """Render the stored (args, kwargs) context tuple."""
        return str(self.context)
62
class CheckmReporter(object):
    """Creates checkm/bagit manifest files from directory scans and checks
    files on disk against existing manifests."""

    COLUMN_NAMES = [u'# [@]SourceFileOrURL',u'Alg',u'Digest',u'Length',u'ModTime']

    def __init__(self):
        # All report generation goes through a single scanner instance.
        self.scanner = CheckmScanner()

    def _get_max_len(self, report):
        """Return {column index: widest cell length} over the rows of *report*.

        Used to align columns when writing a manifest. Missing columns
        default to 0 (report rows may be ragged).
        """
        cols = defaultdict(lambda : 0)
        for line in report:
            for index in range(len(line)):
                if len(line[index]) > cols[index]:
                    cols[index] = len(line[index])
        return cols

    def _space_line(self, line, col_maxes):
        """Render *line* as one string, padding every cell to its column
        maximum plus four spaces."""
        spaced_line = []
        for index in range(len(line)):
            spaced_line.append(line[index])
            spaces = col_maxes[index] - len(line[index]) + 4
            spaced_line.append(u" " * spaces)
        return u"".join(spaced_line)

    def create_bagit_manifest(self, scan_directory, algorithm, recursive=False, delimiter=" ", filename=None):
        """Scan *scan_directory* and write a bagit-style manifest
        ('<digest><delimiter><path>' per file; directories skipped).

        @param filename: target path or a writable file-like object;
                         defaults to 'manifest-<algorithm>.txt'
        @return: the filename or file-like object written to
        """
        if not filename:
            filename = "manifest-%s.txt" % algorithm
        logger.info("Creating bagit manifest file(%s) for dir(%s) with Alg:%s" % (filename,
                                                                                  scan_directory,
                                                                                  algorithm))
        report = self.scanner.scan_directory(scan_directory, algorithm, recursive=recursive, columns=3)
        if hasattr(filename, 'write'):
            # File-like target: still need a conventional name so the manifest
            # itself can be filtered out of the scan results.
            faked_filename = "manifest-%s.txt" % algorithm
            for line in report:
                if line[2] != "d":  # skip directory entries
                    if os.path.abspath(line[0]) != os.path.abspath(faked_filename):
                        filename.write("%s%s%s\n" % (line[2], delimiter, line[0]))
                    else:
                        logger.info("Manifest file match - scan line ignored")
        else:
            with codecs.open(filename, encoding='utf-8', mode="w") as output:
                for line in report:
                    if line[2] != "d":
                        if os.path.abspath(line[0]) != os.path.abspath(filename):
                            output.write("%s%s%s\n" % (line[2], delimiter, line[0]))
                        else:
                            logger.info("Manifest file match - scan line ignored")
                output.write("\n")
        return filename

    def create_multilevel_checkm(self, top_directory, algorithm, checkm_filename, columns=3):
        """Walk *top_directory* bottom-up and write a *checkm_filename*
        manifest in every directory; each manifest also lists (prefixed
        with '@') the checkm file of each immediate subdirectory.

        @raise NotFound: if *top_directory* is not a directory
        """
        logger.info("Creating multilevel checkm files '(%s)' from top level directory(%s) with Alg:%s and columns:%s" % (checkm_filename, top_directory, algorithm, columns))
        if not os.path.isdir(top_directory):
            raise NotFound(top_directory=top_directory)
        # Gather directories and their subdirectories bottom-up, so child
        # manifests already exist when the parent references them.
        dir_list = [(root, dirnames) for (root, dirnames, _) in os.walk(top_directory, topdown=False)]
        dirs = dict(dir_list)
        for (dirname, _) in dir_list:
            logger.info('creating checkm file %s in %s' % (checkm_filename, dirname))
            with codecs.open(os.path.join(dirname, checkm_filename), encoding='utf-8', mode="w") as output:
                self.create_checkm_file(dirname,
                                        algorithm,
                                        os.path.join(dirname, checkm_filename),
                                        recursive=False,
                                        columns=columns,
                                        checkm_file=output)
                subdir_report = []
                for subdir in dirs[dirname]:
                    logger.info('Checking sub-checkm file and adding it to the list of hashes in %s' % dirname)
                    try:
                        line = self.scanner.scan_path(os.path.join(dirname, subdir, checkm_filename), algorithm, columns)
                        logger.info("Line - %s" % line)
                        line[0] = '@%s' % (line[0])
                        subdir_report.append(line)
                    except Exception as e:
                        # Was a bare debug 'print'; route diagnostics through the logger.
                        logger.error("Failed to scan %s %s %s: %s" % (dirname, subdir, checkm_filename, e))
                col_maxes = self._get_max_len(subdir_report)
                for line in subdir_report:
                    output.write('%s\n' % (self._space_line(line, col_maxes)))
                output.write('\n')

    def create_checkm_file(self, scan_directory, algorithm, checkm_filename, recursive=False, columns=3, checkm_file=None):
        """Scan *scan_directory* and write an aligned checkm manifest.

        If *checkm_file* is a writable object it is written to and returned;
        otherwise the manifest is created at *checkm_filename*.  The manifest
        file itself is excluded from its own listing.
        """
        logger.info("Creating checkm file for dir(%s) with Alg:%s and columns: %s" % (
                    scan_directory,
                    algorithm, columns))
        report = self.scanner.scan_directory(scan_directory, algorithm, recursive=recursive, columns=columns)
        col_maxes = self._get_max_len(report)
        if checkm_file is not None and hasattr(checkm_file, 'write'):
            checkm_file.write("%s \n" % (self._space_line(CheckmReporter.COLUMN_NAMES[:columns], col_maxes)))
            for line in report:
                if os.path.abspath(line[0]) != os.path.abspath(checkm_filename):
                    checkm_file.write("%s\n" % (self._space_line(line, col_maxes)))
                else:
                    logger.info("Manifest file match - scan line ignored")
            return checkm_file
        else:
            with codecs.open(checkm_filename, encoding='utf-8', mode="w") as output:
                output.write("%s \n" % (self._space_line(CheckmReporter.COLUMN_NAMES[:columns], col_maxes)))
                for line in report:
                    if os.path.abspath(line[0]) != os.path.abspath(checkm_filename):
                        output.write("%s\n" % (self._space_line(line, col_maxes)))
                    else:
                        logger.info("Manifest file match - scan line ignored")
                output.write("\n")

    def check_bagit_hashes(self, bagit_filename, algorithm=None):
        """Re-hash every file listed in a bagit manifest and compare digests.

        @param bagit_filename: manifest path or file-like object
        @param algorithm: hash name; if None it is derived from a
                          'manifest-<alg>.txt' style filename
        @return: {'pass': [paths...], 'fail': {path: (manifest_row, scan_row)}}
        """
        logger.info("Checking files against '%s' bagit manifest" % bagit_filename)
        if algorithm is None:
            if hasattr(bagit_filename, 'read'):
                raise Exception("Need to supply the algorithm when passing a filelike object instead of a filename")
            m = re.search(r"manifest-(?P<alg>[^\.]+)\.txt", bagit_filename)
            if m is not None:
                algorithm = m.groupdict()['alg']
            # NOTE(review): if the filename does not match, algorithm stays
            # None and scan_path will fail on getattr(hashlib, None).
        parser = BagitParser(bagit_filename)
        scanner = CheckmScanner()
        results = {'pass':[], 'fail':{}}
        for row in parser:
            if row:
                try:
                    scan_row = scanner.scan_path(row[1], algorithm, 3)
                    if row[0] != scan_row[2]:
                        logger.info("Failed original: %s" % row)
                        logger.info("Current scan: %s" % scan_row)
                        results['fail'][row[1]] = (row, scan_row)
                    else:
                        results['pass'].append(row[1])
                except NotFound:
                    scan_row = "File not found"
                    logger.info("Failed original: %s" % row)
                    logger.info("But file not found at this path.")
                    results['fail'][row[1]] = (row, scan_row)
        return results

    def check_checkm_hashes(self, scan_directory, checkm_filename, ignore_multilevel=True, columns=None):
        """Verify files against a checkm manifest, optionally following
        '@'-prefixed sub-manifests (multilevel mode).

        A manifest cell of '-' is treated as a wildcard and not compared.
        @return: {'pass': [...], 'fail': {path: (row, scan_row)}, 'include': [sub-manifests]}
        """
        def _check_files_against_parser(parser, columns=None):
            # Compare each parsed manifest row against a fresh scan of the path.
            scanner = CheckmScanner()
            results = {'pass':[], 'fail':{}, 'include':[]}
            for row in parser:
                if row:
                    try:
                        if row[0].startswith('@'):
                            # '@path' marks a nested checkm manifest to recurse into.
                            row[0] = row[0][1:]
                            results['include'].append(row[0])
                        if not columns:
                            columns = len(row)
                        scan_row = scanner.scan_path(row[0], row[1], columns)
                        nomatch = False
                        for expected, scanned in zip(row, scan_row):
                            if expected != "-" and expected != scanned:
                                nomatch = True
                        if nomatch:
                            logger.info("Failed original: %s" % row)
                            logger.info("Current scan: %s" % scan_row)
                            results['fail'][row[0]] = (row, scan_row)
                        else:
                            results['pass'].append(row[0])
                    except NotFound:
                        scan_row = "File not found"
                        logger.info("Failed original: %s" % row)
                        logger.info("But file not found at this path.")
                        results['fail'][row[0]] = (row, scan_row)
            return results

        logger.info("Checking files against %s checkm manifest" % checkm_filename)
        parser = CheckmParser(checkm_filename)
        results = _check_files_against_parser(parser, columns)
        if ignore_multilevel:
            return results
        else:
            # Shallow copy of the include list, as we will be pop'ing off items
            checkm_list = results['include'][:]
            while checkm_list:
                checkm_file = checkm_list.pop()
                parser = CheckmParser(checkm_file)
                additional_results = _check_files_against_parser(parser, columns)
                # Merge passes, newly discovered sub-manifests and failures
                results['pass'].extend(additional_results['pass'])
                results['include'].extend(additional_results['include'])
                checkm_list.extend(additional_results['include'])
                results['fail'].update(additional_results['fail'])
            return results
289
class BagitParser(object):
    """Parses a bagit manifest (two whitespace-separated columns per line:
    digest, path) into self.lines.  Lines starting with '#' are skipped."""

    def __init__(self, bagit_file=None):
        """@param bagit_file: path or file-like object; parsed immediately
        when given."""
        self.status = False
        self.lines = []
        if bagit_file:
            self.parse(bagit_file)

    def __iter__(self):
        """Iterate over the parsed [digest, path] rows."""
        class Bagit_iter:
            def __init__(self, lines):
                self.lines = lines
                self.last = 0
            def __iter__(self):
                return self
            def __next__(self):
                if self.last >= len(self.lines):  # threshold terminator
                    raise StopIteration
                self.last += 1
                return self.lines[self.last - 1]
            next = __next__  # Python 2 iterator protocol
        return Bagit_iter(self.lines)

    def parse(self, fileobj):
        """Parse *fileobj* (a path or a readable object) and return the rows
        (also stored on self.lines)."""
        if not hasattr(fileobj, "read"):
            with codecs.open(fileobj, encoding='utf-8', mode="r") as check_fh:
                self._parse_lines(check_fh)
        else:
            self._parse_lines(fileobj)
        return self.lines

    def _parse_lines(self, fh):
        """Read *fh* in 4 KB chunks, split into lines and tokenize each into
        [digest, path]."""
        self.lines = []  # clear the deck
        def _parse_line(line):
            # Skip comments; split into at most 2 columns; drop empty tokens.
            if line.startswith('#'):
                return
            tokens = [t for t in re.split(r"\s+", line, maxsplit=1) if t]
            logging.getLogger('checkm').info(tokens)
            if tokens:
                # bagit allows "digest *path" to flag binary mode - strip the '*'
                if len(tokens) > 1 and tokens[1].startswith("*"):
                    tokens[1] = tokens[1][1:].strip()
                self.lines.append(tokens)
        # BUG FIX: the original iterated *characters* of a single fh.read(0x1000)
        # call, truncating files over 4 KB and dropping a final unterminated line.
        line_buffer = ""
        while True:
            chunk = fh.read(0x1000)
            if not chunk:
                break
            line_buffer = line_buffer + chunk
            while "\n" in line_buffer:
                line, line_buffer = line_buffer.split("\n", 1)
                _parse_line(line)
        if line_buffer:
            # Final line without a trailing newline.
            _parse_line(line_buffer)
class CheckmParser(object):
    """Parses a checkm manifest (up to six whitespace-separated columns per
    line, see COLUMNS) into self.lines.  Lines starting with '#' are skipped."""

    def __init__(self, checkm_file=None):
        """@param checkm_file: path or file-like object; parsed immediately
        when given."""
        self.status = False
        self.lines = []
        if checkm_file:
            self.parse(checkm_file)

    def __iter__(self):
        """Iterate over the parsed token rows."""
        class Checkm_iter:
            def __init__(self, lines):
                self.lines = lines
                self.last = 0
            def __iter__(self):
                return self
            def __next__(self):
                if self.last >= len(self.lines):  # threshold terminator
                    raise StopIteration
                self.last += 1
                return self.lines[self.last - 1]
            next = __next__  # Python 2 iterator protocol
        return Checkm_iter(self.lines)

    def parse(self, checkm_file):
        """Parse *checkm_file* (a path or a readable object) and return the
        rows (also stored on self.lines).

        @raise NotFound: if a path is given but no file exists there
        """
        if not hasattr(checkm_file, "read"):
            if os.path.isfile(checkm_file):
                with codecs.open(checkm_file, encoding='utf-8', mode="r") as check_fh:
                    self._parse_lines(check_fh)
            else:
                raise NotFound(checkm_file=checkm_file)
        else:
            self._parse_lines(checkm_file)
        return self.lines

    def _parse_lines(self, fh):
        """Read *fh* in 4 KB chunks, split into lines and tokenize each row."""
        self.lines = []  # clear the deck
        def _parse_line(line):
            # Skip comments; 6 column max defn == 5 splits; drop empty tokens.
            if line.startswith('#'):
                return
            tokens = [t for t in re.split(r"\s+", line, maxsplit=5) if t]
            logging.getLogger('checkm').info(tokens)
            if tokens:
                self.lines.append(tokens)
        # BUG FIX: the original iterated *characters* of a single fh.read(0x1000)
        # call, truncating files over 4 KB and dropping a final unterminated line.
        line_buffer = ""
        while True:
            chunk = fh.read(0x1000)
            if not chunk:
                break
            line_buffer = line_buffer + chunk
            while "\n" in line_buffer:
                line, line_buffer = line_buffer.split("\n", 1)
                _parse_line(line)
        if line_buffer:
            # Final line without a trailing newline.
            _parse_line(line_buffer)
class CheckmScanner(object):
    """Scans files and directories, producing checkm report lines of the
    form [path, algorithm, digest-or-'d'[, length[, mtime]]]."""

    HASHTYPES = ['md5', 'sha1', 'sha224','sha256','sha384','sha512']

    def scan_local(self, directory_path, algorithm, columns=3):
        """Scan only the immediate children of *directory_path*.

        @return: a list of report lines, one per directory entry
        """
        report = []
        for item in os.listdir(directory_path):
            item_path = os.path.join(directory_path, item)
            report.append(self.scan_path(item_path, algorithm, columns))
        return report

    def scan_tree(self, directory_path, algorithm, columns):
        """Recursively scan *directory_path* in os.walk order.

        @raise NotFound: if the path does not exist
        """
        if not os.path.exists(directory_path):
            raise NotFound(directory_path=directory_path, recursive=True)
        report = []
        for (dirpath, dirnames, filenames) in os.walk(directory_path):
            for item_path in [os.path.join(dirpath, x) for x in dirnames + filenames]:
                report.append(self.scan_path(item_path, algorithm, columns))
        return report

    def scan_path(self, item_path, algorithm, columns):
        """Build one report line for *item_path*.

        Directories get the digest placeholder 'd'.  *columns* is clamped to
        a minimum of 3; 4 adds the byte length, 5 adds the mtime.
        @raise NotFound: if the item cannot be read
        @raise ValueError: if hashlib has no such *algorithm*
        """
        # Clamp invalid column counts (isinstance checked first so a non-int
        # never reaches the '<' comparison).
        if not isinstance(columns, int) or columns < 3:
            columns = 3
        try:
            line = [u"%s" % item_path, u"%s" % algorithm]
            if os.path.isdir(item_path):
                line.append(u'd')
            else:
                # An unknown algorithm raises AttributeError, translated below.
                hash_gen = getattr(hashlib, algorithm)()
                with open(item_path, 'rb') as fh:
                    logging.getLogger('checkm').info("Checking %s with algorithm %s" % (item_path, algorithm))
                    chunk = fh.read(1024*8)
                    while chunk:
                        hash_gen.update(chunk)
                        chunk = fh.read(1024*8)
                line.append(u"%s" % hash_gen.hexdigest())
            if columns > 3:
                # stat once and reuse for both Length and ModTime columns
                # NOTE(review): emitted for directories as well - confirm
                # against callers if directory rows must stay 3 columns.
                stat_info = os.stat(item_path)
                line.append(u"%s" % stat_info[ST_SIZE])
                if columns > 4:
                    line.append(u"%s" % stat_info[ST_MTIME])
            return line
        except (OSError, IOError):
            logging.getLogger('checkm').info("item exists? %s" % os.path.exists(item_path))
            raise NotFound(item_path=item_path)
        except AttributeError:
            raise ValueError("This tool cannot perform hashtype %s" % algorithm)

    def scan_directory(self, directory_path, algorithm, recursive=False, columns=3):
        """Dispatch to scan_tree (recursive) or scan_local.

        @raise NotFound: if the path does not exist
        """
        if not os.path.exists(directory_path):
            raise NotFound(directory_path=directory_path, recursive=recursive)
        if recursive:
            return self.scan_tree(directory_path, algorithm, columns)
        return self.scan_local(directory_path, algorithm, columns)
571