1
2
3
4 """Checkm class library docs TODO
5
6
7 [@]SourceFileOrURL Alg Digest Length ModTime TargetFileOrURL
8 TOKEN NUMBER: 1 2 3 4 5 6
9
10 """
11
12 from __future__ import with_statement
13
14 COLUMNS = { 0:"SourceFileOrURL",
15 1:"Alg",
16 2:"Digest",
17 3:"Length",
18 4:"ModTime",
19 5:"TargetFileOrURL",
20 }
21
22
23 import os, sys
24 from stat import *
25
26 import re
27
28 from collections import defaultdict
29
30 import hashlib
31
32 import codecs
33
34 import logging
35
36 logging.basicConfig(level=logging.INFO)
37
38 logger = logging.getLogger('checkm')
39
41 """The directory was not found, or is not accessible."""
43 self.context = (arg, kw)
45 return self.context.__str__()
46
48 COLUMN_NAMES = [u'# [@]SourceFileOrURL',u'Alg',u'Digest',u'Length',u'ModTime']
51
53 cols = defaultdict(lambda : 0)
54 for line in report:
55 for index in xrange(len(line)):
56 if len(line[index])>cols[index]:
57 cols[index] = len(line[index])
58 return cols
59
61 spaced_line = []
62 for index in xrange(len(line)):
63 spaced_line.append(line[index])
64 spaces = col_maxes[index]-len(line[index])+4
65 spaced_line.append(u" "*spaces)
66 return u"".join(spaced_line)
67
def create_bagit_manifest(self, scan_directory, algorithm, recursive=False, delimiter = " ", filename=None):
    """Scan *scan_directory* and write a bagit-style manifest.

    Parameters:
        scan_directory -- directory to scan
        algorithm -- hash algorithm name understood by hashlib (e.g. 'md5')
        recursive -- descend into subdirectories when True
        delimiter -- text placed between digest and path on each line
        filename -- output path or a file-like object with .write();
                    defaults to 'manifest-<algorithm>.txt'

    Returns *filename* (the path or the file-like object written to).
    """
    if not filename:
        filename = "manifest-%s.txt" % algorithm
    logger.info("Creating bagit manifest file(%s) for dir(%s) with Alg:%s" % (filename,
                                                                              scan_directory,
                                                                              algorithm))
    # columns=3 -> each report row is (path, algorithm, digest-or-'d')
    report = self.scanner.scan_directory(scan_directory, algorithm, recursive=recursive, columns=3)

    def _write_manifest(out):
        # Directories are marked 'd' in the digest column; bagit manifests
        # list files only, so those rows are skipped.
        for entry in report:
            if entry[2] != "d":
                out.write("%s%s%s\n" % (entry[2], delimiter, entry[0]))
        out.write("\n")

    if hasattr(filename, 'write'):
        _write_manifest(filename)
    else:
        with codecs.open(filename, encoding='utf-8', mode="w") as output:
            _write_manifest(output)
    return filename
87
def create_checkm_file(self, scan_directory, algorithm, checkm_filename, recursive=False, columns=3):
    """Scan *scan_directory* and write a column-aligned checkm manifest.

    Parameters:
        scan_directory -- directory to scan
        algorithm -- hash algorithm name understood by hashlib (e.g. 'md5')
        checkm_filename -- output path or a file-like object with .write()
        recursive -- descend into subdirectories when True
        columns -- number of checkm columns to emit (3 to 5)

    Returns *checkm_filename*.

    Fix: the original returned the handle only when given a file-like
    object and implicitly returned None when given a path; both branches
    now return *checkm_filename*.
    """
    logger.info("Creating checkm file(%s) for dir(%s) with Alg:%s and columns: %s" % (checkm_filename,
                                                                                      scan_directory,
                                                                                      algorithm, columns))
    report = self.scanner.scan_directory(scan_directory, algorithm, recursive=recursive, columns=columns)
    # Widest entry per column, used to pad every row into alignment.
    col_maxes = self._get_max_len(report)

    def _write(output):
        # Header row, truncated to the number of columns actually emitted.
        output.write("%s \n" % (self._space_line(CheckmReporter.COLUMN_NAMES[:columns], col_maxes)))
        for line in report:
            output.write("%s\n" % (self._space_line(line, col_maxes)))
        output.write("\n")

    if hasattr(checkm_filename, 'write'):
        _write(checkm_filename)
    else:
        with codecs.open(checkm_filename, encoding='utf-8', mode="w") as output:
            _write(output)
    return checkm_filename
106
108 logger.info("Checking files against '%s' bagit manifest" % bagit_filename)
109 if algorithm == None:
110 if hasattr(bagit_filename, 'read'):
111 raise Exception("Need to supply the algorithm when passing a filelike object instead of a filename")
112 m = re.search("manifest-(?P<alg>[^\.]+)\.txt", bagit_filename)
113 if m != None:
114 algorithm = m.groupdict()['alg']
115 parser = BagitParser(bagit_filename)
116 scanner = CheckmScanner()
117 results = {'pass':[], 'fail':{}}
118 for row in parser:
119 if row:
120 try:
121 scan_row = scanner.scan_path(row[1], algorithm, 3)
122 if row[0] != scan_row[2]:
123 logger.info("Failed original: %s" % row)
124 logger.info("Current scan: %s" % scan_row)
125 results['fail'][row[1]] = (row, scan_row)
126 else:
127 results['pass'].append(row[1])
128 except NotFound:
129 scan_row = "File not found"
130 logger.info("Failed original: %s" % row)
131 logger.info("But file not found at this path.")
132 results['fail'][row[1]] = (row, scan_row)
133 return results
134
136 logger.info("Checking files against %s checkm manifest" % checkm_filename)
137 parser = CheckmParser(checkm_filename)
138 scanner = CheckmScanner()
139 results = {'pass':[], 'fail':{}}
140 for row in parser:
141 if row:
142 try:
143 scan_row = scanner.scan_path(row[0], row[1], len(row))
144 if row != scan_row:
145 logger.info("Failed original: %s" % row)
146 logger.info("Current scan: %s" % scan_row)
147 results['fail'][row[0]] = (row, scan_row)
148 else:
149 results['pass'].append(row[0])
150 except NotFound:
151 scan_row = "File not found"
152 logger.info("Failed original: %s" % row)
153 logger.info("But file not found at this path.")
154 results['fail'][row[0]] = (row, scan_row)
155 return results
156
159 self.status = False
160 self.lines = []
161 if bagit_file:
162 self.parse(bagit_file)
163
165 class Bagit_iter:
166 def __init__(self, lines):
167 self.lines = lines
168 self.last = 0
169 def __iter__(self):
170 return self
171 def next(self):
172 if self.last >= len(self.lines):
173 raise StopIteration
174 elif len(self.lines) == 0:
175 raise StopIteration
176 else:
177 self.last += 1
178 return self.lines[self.last-1]
179 return Bagit_iter(self.lines)
180
def parse(self, fileobj):
    """Parse a bagit manifest given either a path or an open file object.

    Populates self.lines and returns it.
    """
    if hasattr(fileobj, "read"):
        # Already file-like: parse it directly.
        self._parse_lines(fileobj)
    else:
        # Treat as a filesystem path; bagit manifests are UTF-8 text.
        with codecs.open(fileobj, encoding='utf-8', mode="r") as check_fh:
            self._parse_lines(check_fh)
    return self.lines
188
190 self.lines = []
191 line_buffer = ""
192 def _parse_line(line):
193 if not line.startswith('#'):
194 tokens = filter(lambda x: x, re.split("\s+", line, 1))
195 logger.info(tokens)
196 if tokens:
197
198 if tokens[1].startswith("*"):
199 tokens[1] = tokens[1][1:].strip()
200 self.lines.append(tokens)
201 for chunk in fh.read(0x1000):
202 line_buffer = line_buffer + chunk
203 while True:
204 if not line_buffer:
205 break
206 fragments = line_buffer.split('\n',1)
207 if len(fragments) == 1:
208 break
209 _parse_line(fragments[0])
210 line_buffer = fragments[1]
211
214 self.status = False
215 self.lines = []
216 if checkm_file:
217 self.parse(checkm_file)
218
220 class Checkm_iter:
221 def __init__(self, lines):
222 self.lines = lines
223 self.last = 0
224 def __iter__(self):
225 return self
226 def next(self):
227 if self.last >= len(self.lines):
228 raise StopIteration
229 elif len(self.lines) == 0:
230 raise StopIteration
231 else:
232 self.last += 1
233 return self.lines[self.last-1]
234 return Checkm_iter(self.lines)
235
def parse(self, checkm_file):
    """Parse a checkm manifest given either a path or an open file object.

    Populates self.lines and returns it.
    """
    if hasattr(checkm_file, "readline"):
        # Already file-like: parse it directly.
        self._parse_lines(checkm_file)
    else:
        # Treat as a filesystem path; checkm manifests are UTF-8 text.
        with codecs.open(checkm_file, encoding='utf-8', mode="r") as check_fh:
            self._parse_lines(check_fh)
    return self.lines
243
245 self.lines = []
246 line_buffer = ""
247 def _parse_line(line):
248 if not line.startswith('#'):
249 tokens = filter(lambda x: x, re.split("\s+", line, 5))
250 logger.info(tokens)
251 if tokens:
252
253 self.lines.append(tokens)
254
255 for chunk in fh.read(0x1000):
256 line_buffer = line_buffer + chunk
257 while True:
258 if not line_buffer:
259 break
260 fragments = line_buffer.split('\n',1)
261 if len(fragments) == 1:
262 break
263 _parse_line(fragments[0])
264 line_buffer = fragments[1]
265
267 HASHTYPES = ['md5', 'sha1', 'sha224','sha256','sha384','sha512']
def scan_local(self, directory_path, algorithm, columns=3):
    """Scan the immediate entries of *directory_path* (non-recursive).

    Returns a list of per-entry report rows as produced by scan_path.
    """
    return [self.scan_path(os.path.join(directory_path, name), algorithm, columns)
            for name in os.listdir(directory_path)]
274
def scan_tree(self, directory_path, algorithm, columns):
    """Recursively scan *directory_path*, including subdirectories.

    Returns a list of report rows (one per directory and file found).
    Raises NotFound if the path does not exist.
    """
    if not os.path.exists(directory_path):
        raise NotFound(directory_path=directory_path, recursive=True)
    report = []
    for dirpath, dirnames, filenames in os.walk(directory_path):
        # Directories are scanned as well as files, so both lists are walked.
        for name in dirnames + filenames:
            report.append(self.scan_path(os.path.join(dirpath, name), algorithm, columns))
    return report
284
def scan_path(self, item_path, algorithm, columns):
    """Build one checkm report row for a single file or directory.

    Parameters:
        item_path -- path to the file or directory
        algorithm -- hashlib algorithm name (e.g. 'md5', 'sha1')
        columns -- row width: 3 = path/alg/digest, 4 adds size, 5 adds mtime;
                   silently clamped to 3 if invalid

    Returns a list of unicode values. Directories get u'd' in place of a
    digest. Raises NotFound if the path is unreadable, ValueError if
    *algorithm* is not provided by hashlib.

    Fix: os.stat was previously called once for the size column and again
    for the mtime column — a redundant syscall that could also report size
    and mtime from two different file states; a single stat result is now
    reused for both.
    """
    if columns < 3 or not isinstance(columns, int):
        columns = 3
    try:
        line = [unicode(item_path), unicode(algorithm)]
        if os.path.isdir(item_path):
            # Directories carry no digest; mark with 'd'.
            line.append(u'd')
        else:
            # AttributeError here (unknown algorithm) is mapped to ValueError below.
            hash_gen = getattr(hashlib, algorithm)()
            with open(item_path, 'rb') as fh:
                logger.info("Checking %s with algorithm %s" % (item_path, algorithm))
                # Hash in 8 KiB chunks to keep memory flat for large files.
                chunk = fh.read(1024*8)
                while chunk:
                    hash_gen.update(chunk)
                    chunk = fh.read(1024*8)
            line.append(unicode(hash_gen.hexdigest()))
        if columns > 3:
            stat_result = os.stat(item_path)  # single stat serves both extra columns
            line.append(unicode(stat_result[ST_SIZE]))
            if columns > 4:
                line.append(unicode(stat_result[ST_MTIME]))
        return line
    except (OSError, IOError):
        raise NotFound(item_path=item_path)
    except AttributeError:
        raise ValueError("This tool cannot perform hashtype %s" % algorithm)
320
def scan_directory(self, directory_path, algorithm, recursive=False, columns=3):
    """Scan a directory, dispatching to the recursive or flat scanner.

    Raises NotFound if *directory_path* does not exist.
    """
    if not os.path.exists(directory_path):
        raise NotFound(directory_path=directory_path, recursive=recursive)
    if recursive:
        return self.scan_tree(directory_path, algorithm, columns)
    return self.scan_local(directory_path, algorithm, columns)
328