Package checkm :: Module checkm
[hide private]
[frames] | [no frames]

Source Code for Module checkm.checkm

  1  #!/usr/bin/python 
  2  # -*- coding: utf-8 -*- 
  3   
  4  """Checkm class library docs TODO 
  5   
  6   
  7                  [@]SourceFileOrURL  Alg     Digest  Length   ModTime   TargetFileOrURL 
  8  TOKEN NUMBER:    1                  2       3       4        5         6 
  9   
 10  """ 
 11   
 12  from __future__ import with_statement 
 13   
 14  COLUMNS = { 0:"SourceFileOrURL", 
 15              1:"Alg", 
 16              2:"Digest", 
 17              3:"Length", 
 18              4:"ModTime", 
 19              5:"TargetFileOrURL", 
 20              } 
 21   
 22   
 23  import os, sys 
 24  from stat import * 
 25   
 26  import re 
 27   
 28  from collections import defaultdict 
 29   
 30  import hashlib 
 31   
 32  import codecs 
 33   
 34  import logging 
 35   
 36  logging.basicConfig(level=logging.INFO) 
 37   
 38  logger = logging.getLogger('checkm') 
 39   
class NotFound(Exception):
    """The directory was not found, or is not accessible."""

    def __init__(self, *arg, **kw):
        # Remember exactly how the failure was reported: both the positional
        # and the keyword context (e.g. item_path=..., recursive=...).
        self.context = (arg, kw)

    def __repr__(self):
        # Render the stored (args, kwargs) tuple for debugging output.
        return str(self.context)
46
class CheckmReporter(object):
    """Creates checkm/bagit manifest files and validates files against them."""

    COLUMN_NAMES = [u'# [@]SourceFileOrURL', u'Alg', u'Digest', u'Length', u'ModTime']

    def __init__(self):
        # Scanner used to hash files when building manifests.
        self.scanner = CheckmScanner()

    def _get_max_len(self, report):
        """Return a dict of {column index: length of the widest value in that column}."""
        cols = defaultdict(int)
        for row in report:
            for index, value in enumerate(row):
                if len(value) > cols[index]:
                    cols[index] = len(value)
        return cols

    def _space_line(self, line, col_maxes):
        """Render one report row, padding each value to its column width plus 4 spaces."""
        parts = []
        for index, value in enumerate(line):
            parts.append(value)
            parts.append(u" " * (col_maxes[index] - len(value) + 4))
        return u"".join(parts)

    def _write_bagit_report(self, output, report, delimiter):
        """Write 'digest<delimiter>path' lines for every file row of a scan report."""
        # Bagit manifests list files only, so directory rows (digest == 'd') are skipped.
        for row in report:
            if row[2] != "d":
                output.write("%s%s%s\n" % (row[2], delimiter, row[0]))
        output.write("\n")

    def create_bagit_manifest(self, scan_directory, algorithm, recursive=False, delimiter=" ", filename=None):
        """Scan a directory and write a bagit-style manifest of its files.

        filename may be a path or a file-like object with a write() method;
        when omitted it defaults to 'manifest-<algorithm>.txt'.
        Returns filename (path string or the file-like object passed in).
        """
        if not filename:
            filename = "manifest-%s.txt" % algorithm
        logger.info("Creating bagit manifest file(%s) for dir(%s) with Alg:%s" % (filename,
                                                                                  scan_directory,
                                                                                  algorithm))
        report = self.scanner.scan_directory(scan_directory, algorithm, recursive=recursive, columns=3)
        if hasattr(filename, 'write'):
            self._write_bagit_report(filename, report, delimiter)
        else:
            with codecs.open(filename, encoding='utf-8', mode="w") as output:
                self._write_bagit_report(output, report, delimiter)
        return filename

    def _write_checkm_report(self, output, report, col_maxes, columns):
        """Write the aligned checkm header and rows to an open file-like object."""
        # NOTE: the header line keeps its historical trailing space before '\n'.
        output.write("%s \n" % (self._space_line(CheckmReporter.COLUMN_NAMES[:columns], col_maxes)))
        for row in report:
            output.write("%s\n" % (self._space_line(row, col_maxes)))
        output.write("\n")

    def create_checkm_file(self, scan_directory, algorithm, checkm_filename, recursive=False, columns=3):
        """Scan a directory and write a column-aligned checkm manifest.

        checkm_filename may be a path or a file-like object with write().
        Returns checkm_filename (previously returned None when given a path).
        """
        logger.info("Creating checkm file(%s) for dir(%s) with Alg:%s and columns: %s" % (checkm_filename,
                                                                                          scan_directory,
                                                                                          algorithm, columns))
        report = self.scanner.scan_directory(scan_directory, algorithm, recursive=recursive, columns=columns)
        col_maxes = self._get_max_len(report)
        if hasattr(checkm_filename, 'write'):
            self._write_checkm_report(checkm_filename, report, col_maxes, columns)
        else:
            with codecs.open(checkm_filename, encoding='utf-8', mode="w") as output:
                self._write_checkm_report(output, report, col_maxes, columns)
        return checkm_filename

    def check_bagit_hashes(self, bagit_filename, algorithm=None):
        """Re-hash every file listed in a bagit manifest and compare digests.

        When algorithm is None it is derived from a 'manifest-<alg>.txt' filename.
        Returns {'pass': [path, ...], 'fail': {path: (manifest_row, scan_row)}}.
        Raises ValueError when the algorithm cannot be determined.
        """
        logger.info("Checking files against '%s' bagit manifest" % bagit_filename)
        if algorithm is None:
            if hasattr(bagit_filename, 'read'):
                raise Exception("Need to supply the algorithm when passing a filelike object instead of a filename")
            m = re.search(r"manifest-(?P<alg>[^\.]+)\.txt", bagit_filename)
            if m is not None:
                algorithm = m.groupdict()['alg']
            else:
                # Previously this fell through with algorithm=None and failed much
                # later inside the scanner with an opaque TypeError.
                raise ValueError("Could not derive the hash algorithm from %s" % bagit_filename)
        parser = BagitParser(bagit_filename)
        scanner = CheckmScanner()
        results = {'pass': [], 'fail': {}}
        for row in parser:
            if not row:
                continue
            try:
                # row == [digest, path]; re-scan and compare the digest column.
                scan_row = scanner.scan_path(row[1], algorithm, 3)
                if row[0] != scan_row[2]:
                    logger.info("Failed original: %s" % row)
                    logger.info("Current scan: %s" % scan_row)
                    results['fail'][row[1]] = (row, scan_row)
                else:
                    results['pass'].append(row[1])
            except NotFound:
                logger.info("Failed original: %s" % row)
                logger.info("But file not found at this path.")
                results['fail'][row[1]] = (row, "File not found")
        return results

    def check_checkm_hashes(self, scan_directory, checkm_filename):
        """Re-scan every row of a checkm manifest and compare complete rows.

        scan_directory is kept for API compatibility; the paths actually checked
        come from the manifest rows themselves.
        Returns {'pass': [path, ...], 'fail': {path: (manifest_row, scan_row)}}.
        """
        logger.info("Checking files against %s checkm manifest" % checkm_filename)
        parser = CheckmParser(checkm_filename)
        scanner = CheckmScanner()
        results = {'pass': [], 'fail': {}}
        for row in parser:
            if not row:
                continue
            try:
                scan_row = scanner.scan_path(row[0], row[1], len(row))
                if row != scan_row:
                    logger.info("Failed original: %s" % row)
                    logger.info("Current scan: %s" % scan_row)
                    results['fail'][row[0]] = (row, scan_row)
                else:
                    results['pass'].append(row[0])
            except NotFound:
                logger.info("Failed original: %s" % row)
                logger.info("But file not found at this path.")
                results['fail'][row[0]] = (row, "File not found")
        return results
156
class BagitParser(object):
    """Parses bagit manifest files into rows of [digest, path]."""

    def __init__(self, bagit_file=None):
        # status is retained for API compatibility with older callers.
        self.status = False
        self.lines = []
        if bagit_file:
            self.parse(bagit_file)

    def __iter__(self):
        """Iterate over the parsed [digest, path] rows."""
        return iter(self.lines)

    def parse(self, fileobj):
        """Parse a bagit manifest from a filename or a file-like object.

        Returns the list of parsed rows (also stored on self.lines).
        """
        if not hasattr(fileobj, "read"):
            with codecs.open(fileobj, encoding='utf-8', mode="r") as check_fh:
                self._parse_lines(check_fh)
        else:
            self._parse_lines(fileobj)
        return self.lines

    def _parse_line(self, line):
        """Tokenize one manifest line into [digest, path] and store it."""
        # Lines starting with '#' are comments.
        if line.startswith('#'):
            return
        # 2 columns: split on the first whitespace run only.
        tokens = [t for t in re.split(r"\s+", line, maxsplit=1) if t]
        logging.getLogger('checkm').info(tokens)
        if not tokens:
            return
        # bagit allows "digest *path" (binary-mode marker) - strip the '*'.
        # Guarded by len(): a single-token line previously raised IndexError.
        if len(tokens) > 1 and tokens[1].startswith("*"):
            tokens[1] = tokens[1][1:].strip()
        self.lines.append(tokens)

    def _parse_lines(self, fh):
        """Read the whole stream in 4KB chunks and parse each complete line.

        The previous implementation iterated character-by-character over a
        single 4KB read, silently truncating larger files, and also dropped
        a final line that had no trailing newline.
        """
        self.lines = []  # clear the deck
        line_buffer = ""
        while True:
            chunk = fh.read(0x1000)
            if not chunk:
                break
            line_buffer += chunk
            while "\n" in line_buffer:
                line, line_buffer = line_buffer.split("\n", 1)
                self._parse_line(line)
        if line_buffer:
            self._parse_line(line_buffer)
class CheckmParser(object):
    """Parses checkm manifest files into token rows (6 columns maximum)."""

    def __init__(self, checkm_file=None):
        # status is retained for API compatibility with older callers.
        self.status = False
        self.lines = []
        if checkm_file:
            self.parse(checkm_file)

    def __iter__(self):
        """Iterate over the parsed token rows."""
        return iter(self.lines)

    def parse(self, checkm_file):
        """Parse a checkm manifest from a filename or a file-like object.

        Returns the list of parsed rows (also stored on self.lines).
        """
        if not hasattr(checkm_file, "readline"):
            with codecs.open(checkm_file, encoding='utf-8', mode="r") as check_fh:
                self._parse_lines(check_fh)
        else:
            self._parse_lines(checkm_file)
        return self.lines

    def _parse_line(self, line):
        """Tokenize one manifest line and store any non-empty token row."""
        # Lines starting with '#' are comments/headers.
        if line.startswith('#'):
            return
        # 6 column max defn == 5 splits
        tokens = [t for t in re.split(r"\s+", line, maxsplit=5) if t]
        logging.getLogger('checkm').info(tokens)
        if tokens:
            self.lines.append(tokens)

    def _parse_lines(self, fh):
        """Read the whole stream in 4KB chunks and parse each complete line.

        The previous implementation iterated character-by-character over a
        single 4KB read, silently truncating larger files, and also dropped
        a final line that had no trailing newline.
        """
        self.lines = []  # clear the deck
        line_buffer = ""
        while True:
            chunk = fh.read(0x1000)
            if not chunk:
                break
            line_buffer += chunk
            while "\n" in line_buffer:
                line, line_buffer = line_buffer.split("\n", 1)
                self._parse_line(line)
        if line_buffer:
            self._parse_line(line_buffer)
class CheckmScanner(object):
    """Walks directories and produces one checkm report row per filesystem item."""

    HASHTYPES = ['md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512']

    def scan_local(self, directory_path, algorithm, columns=3):
        """Scan only the immediate children of directory_path (non-recursive)."""
        report = []
        for item in os.listdir(directory_path):
            item_path = os.path.join(directory_path, item)
            report.append(self.scan_path(item_path, algorithm, columns))
        return report

    def scan_tree(self, directory_path, algorithm, columns):
        """Recursively scan directory_path.

        Raises NotFound when the directory does not exist.
        """
        if not os.path.exists(directory_path):
            raise NotFound(directory_path=directory_path, recursive=True)
        report = []
        for (dirpath, dirnames, filenames) in os.walk(directory_path):
            for name in dirnames + filenames:
                report.append(self.scan_path(os.path.join(dirpath, name), algorithm, columns))
        return report

    def scan_path(self, item_path, algorithm, columns):
        """Build one report row: [path, alg, digest-or-'d'[, length[, mtime]]].

        columns is clamped to at least 3 (and to 3 for non-int input).
        Raises NotFound when the path cannot be read, ValueError when the
        algorithm is not provided by hashlib.
        """
        # Type check must come first: comparing a non-int with < raised
        # TypeError before the intended clamp could happen.
        if not isinstance(columns, int) or columns < 3:
            columns = 3
        try:
            # u"%s" % x is equivalent to unicode(x) on py2 and str(x) on py3.
            line = [u"%s" % item_path, u"%s" % algorithm]
            if os.path.isdir(item_path):
                # Directories get a literal 'd' marker instead of a digest.
                line.append(u'd')
            else:
                try:
                    hash_gen = getattr(hashlib, algorithm)()
                except AttributeError:
                    # Narrowed: only an unknown algorithm maps to ValueError now.
                    raise ValueError("This tool cannot perform hashtype %s" % algorithm)
                with open(item_path, 'rb') as fh:
                    logging.getLogger('checkm').info("Checking %s with algorithm %s" % (item_path, algorithm))
                    chunk = fh.read(1024 * 8)
                    while chunk:
                        hash_gen.update(chunk)
                        chunk = fh.read(1024 * 8)
                line.append(u"%s" % hash_gen.hexdigest())
            if columns > 3:
                # Single stat() call serves both optional columns (was two calls).
                st = os.stat(item_path)
                # col 4 - Length
                line.append(u"%s" % st[ST_SIZE])
                if columns > 4:
                    # col 5 - ModTime (tuple indexing keeps the integer form)
                    line.append(u"%s" % st[ST_MTIME])
            return line
        except (OSError, IOError):
            raise NotFound(item_path=item_path)

    def scan_directory(self, directory_path, algorithm, recursive=False, columns=3):
        """Scan a directory, recursively when requested.

        Raises NotFound when the directory does not exist.
        """
        if not os.path.exists(directory_path):
            raise NotFound(directory_path=directory_path, recursive=recursive)
        if recursive:
            return self.scan_tree(directory_path, algorithm, columns)
        return self.scan_local(directory_path, algorithm, columns)
328