1
2 """
3 Retrieve all CSS stylesheets including embedded for a given URL.
4 Retrieve as StyleSheetList or save to disk.
5
6 TODO:
7 @import
8 save all
9
10 maybe use DOM 3 load/save?
11
12 logger class which handles all cases when no log is given...
13
14 saveto:
15 why does urllib2 hang?
16 """
17 __all__ = ['CSSCapture']
18 __docformat__ = 'restructuredtext'
19 __author__ = '$LastChangedBy: cthedot $'
20 __date__ = '$LastChangedDate: 2007-10-27 22:48:36 +0200 (Sa, 27 Okt 2007) $'
21 __version__ = '$LastChangedRevision: 594 $'
22
23 import errno
24 import HTMLParser
25 import logging
26 import os
27 import sys
28 import urllib
29 import urllib2
30 import urlparse
31 import xml.dom
32
33 import cssutils
34 from cssutils import css, stylesheets
35
36 try:
37 import encutils
38 except ImportError:
39 try:
40 import cssutils.encutils as encutils
41 except ImportError:
42 sys.exit("You need encutils from http://cthedot.de/encutils/")
43
45 """ parses given data for link and style elements """
46 curtag = u''
47 links = []
48
49 styles = []
50
51
53 return dict([(a.lower(), v.lower()) for a, v in attrs])
54
56 if tag == u'link':
57 attrs = self._lowerattrs(attrs)
58 if attrs.get(u'type', u'') == u'text/css':
59 self.links.append(attrs)
60
61 elif tag == u'style':
62 attrs = self._lowerattrs(attrs)
63 if attrs.get(u'type', u'') == u'text/css':
64 self.styles.append((attrs, u''))
65 self.curtag = tag
66 else:
67
68 self.curtag = u''
69
73
77
81
83 """
84 Retrieve all CSS stylesheets including embedded for a given URL.
85 Optional setting of User-Agent used for retrieval possible
86 to handle browser sniffing servers.
87
88 raises urllib2.HTTPError
89 """
def __init__(self, ua=None, log=None, defaultloglevel=logging.INFO):
    """
    initialize a new Capture object

    ua
        init User-Agent to use for requests
    log
        supply a log object which is used instead of the default
        log which writes to sys.stderr
    defaultloglevel
        constant of logging package which defines the level of the
        default log if no explicit log given
    """
    self._ua = ua
    self._parser = CSSCaptureHTMLParser()

    if log:
        self._log = log
    else:
        # logging.getLogger returns the same logger object for the same
        # name, so the stderr handler must only be attached once;
        # otherwise every further instance created without an explicit
        # log would make each message appear one more time.
        self._log = logging.getLogger('CSSCapture')
        if not self._log.handlers:
            hdlr = logging.StreamHandler(sys.stderr)
            hdlr.setFormatter(logging.Formatter('%(message)s'))
            self._log.addHandler(hdlr)
        # the level is (re)applied for every instance using the default log
        self._log.setLevel(defaultloglevel)
        self._log.debug(u'(C) Using default log')
116
118 """
119 Does an HTTP request
120
121 Returns: (response, url)
122
123 url might have been changed by server due to redirects etc
124 """
125 self._log.debug(u' CSSCapture_doRequest URL: %s' % url)
126
127 req = urllib2.Request(url)
128 if self._ua:
129 req.add_header('User-agent', self._ua)
130 self._log.info(' Using User-Agent: %s', self._ua)
131 try:
132 res = urllib2.urlopen(req)
133 except urllib2.HTTPError, e:
134 self._log.critical(' %s\n%s %s\n%s' % (
135 e.geturl(), e.code, e.msg, e.headers))
136 return None, None
137
138
139 if url != res.geturl():
140 url = res.geturl()
141 self._log.info(' URL retrieved: %s', url)
142
143 return res, url
144
145 - def _doImports(self, parentStyleSheet, baseurl=None):
178
180 """
181 parse text for stylesheets
182 fills stylesheetlist with all found StyleSheets
183
184 docurl
185 to build a full url of found StyleSheets @href
186 doctext
187 to parse
188 """
189 self._parser.feed(doctext)
190
191
192
193 for link in self._parser.links:
194
195 href = urlparse.urljoin(docurl, link.get(u'href', u''))
196 media = stylesheets.MediaList(link.get(u'media', u''))
197 res, href = self._doRequest(href)
198 if not res:
199 continue
200 cssText = res.read()
201 sheet = css.CSSStyleSheet(
202 href=href,
203 media=media,
204 title=link.get(u'title', u''),
205 )
206 self.stylesheetlist.append(sheet)
207
208 self._log.info('\n--- FOUND <link>: %s ---', link)
209 self._log.info(' * full href : %s', href)
210 self._log.info(' * media : %s', media.mediaText)
211 self._log.info(' * stylesheet: %s\n' % sheet)
212 self._log.debug(' * cssText :\n%s\n', cssText)
213
214 try:
215 sheet.cssText = cssText
216 except xml.dom.DOMException, e:
217 self._log.warn('CSSParser message:\n%s\n' % e)
218 self._doImports(sheet, baseurl=docurl)
219
220
221
222
223 for style in self._parser.styles:
224
225 stylemeta, cssText = style
226 media = stylesheets.MediaList(stylemeta.get(u'media', u''))
227 sheet = css.CSSStyleSheet(
228 href=None,
229 media=media,
230 title=stylemeta.get(u'title', u''),
231 )
232 self.stylesheetlist.append(sheet)
233
234 self._log.info('\n--- FOUND <style>: %s ---', stylemeta)
235 self._log.info(' stylesheet : %s' % sheet)
236 self._log.info(' media : %s\n', media.mediaText)
237 self._log.debug(' cssText :\n%s\n', cssText)
238
239 try:
240 sheet.cssText = cssText
241 except xml.dom.DOMException, e:
242 self._log.warn('CSSParser message:\n%s\n' % e)
243 self._doImports(sheet, baseurl=docurl)
244
246 """
247 Capture stylesheets for the given url, any HTTPError is raised to
248 caller.
249
250 url
251 to capture CSS from
252 ua
253 User-Agent to use for requests
254
255 Returns StyleSheetList.
256 """
257 if ua is not None:
258 self._ua = ua
259
260
261 scheme, loc, path, query, fragment = urlparse.urlsplit(url)
262 self._filename = os.path.basename(path)
263
264 self.stylesheetlist = stylesheets.StyleSheetList()
265
266 self._log.info('\nCapturing CSS from URL: %s\n', url)
267
268
269 res, url = self._doRequest(url)
270 if not res:
271 sys.exit(1)
272 rawdoc = res.read()
273
274 encoding = encutils.getEncodingInfo(
275 res, rawdoc, log=self._log).encoding
276 self._log.info('\nUsing Encoding: %s\n', encoding)
277
278 doctext = unicode(rawdoc, encoding)
279
280
281 self._findStyleSheets(url, doctext)
282
283 return self.stylesheetlist
284
def saveto(self, dir, saveparsed=False):
    """
    saves css in "dir" in the same layout as on the server

    A sheet with an href is written to "dir/<host>/<path of href>".
    A sheet without href (embedded style) is written directly to "dir"
    as "<filename>_INLINE_<n>.css" where <filename> is self._filename
    and <n> counts the sheets without href starting at 0.

    dir
        directory to save files to, missing directories are created
    saveparsed
        kept for compatibility of the signature; it is currently not
        evaluated, the text written is always ``sheet.cssText``

    Uses self.stylesheetlist and self._filename, so stylesheets need to
    have been captured before.

    raises OSError if a target directory cannot be created for any
    reason other than that it already exists
    """
    inlines = 0
    for sheet in self.stylesheetlist:
        url = sheet.href
        if not url:
            url = '%s_INLINE_%s.css' % (self._filename, inlines)
            inlines += 1

        # serializer preference of cssutils, set before cssText is read
        cssutils.ser.prefs.keepAllProperties = True
        cssText = sheet.cssText

        # only host and path of the URL take part in the file name
        loc, path = urlparse.urlsplit(url)[1:3]
        if path and path.startswith('/'):
            path = path[1:]
        # NOTE(review): normpath keeps leading ".." segments, so such an
        # href would end up outside of "dir" - confirm if hrefs need
        # to be sanitized here
        path = os.path.normpath(path)
        path, fn = os.path.split(path)

        savepath = os.path.join(dir, loc, path)
        savefn = os.path.join(savepath, fn)

        try:
            os.makedirs(savepath)
        except OSError as e:
            if e.errno != errno.EEXIST:
                # bare raise keeps the original traceback
                raise
            self._log.debug('Path "%s" already exists.', savepath)

        # close the file deterministically instead of relying on
        # garbage collection of the file object
        with open(savefn, 'w') as f:
            f.write(cssText)
        self._log.info('Saving "%s"', savefn)
333
def main(args=None):
    """
    Command line interface: capture the stylesheets of a single URL.

    args
        list of command line arguments (without the program name),
        if None the arguments are taken from sys.argv by optparse

    Unless option -n/--notsave is given the captured stylesheets are
    saved (to option -s/--saveto or "_CSSCapture_SAVED"), otherwise
    title and href of each stylesheet are printed.

    Exits via ``parser.error`` if no URL is given.
    """
    import optparse

    usage = "usage: %prog [options] URL"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-u', '--useragent', action='store', dest='ua',
        help='useragent to use for request of URL, default is urllib2s default')
    parser.add_option('-s', '--saveto', action='store', dest='saveto',
        help='saving retrieved files to "saveto", default to "_CSSCapture_SAVED"')
    parser.add_option('-p', '--saveparsed', action='store_true', dest='saveparsed',
        help='if given saves cssutils\' parsed files, otherwise original retrieved files')
    parser.add_option('-n', '--notsave', action='store_true', dest='notsave',
        help='if given files are NOT saved, only log is written')
    parser.add_option('-d', '--debug', action='store_true', dest='debug',
        help='show debug messages during capturing')
    # hand the given args on, so main() is usable programmatically too
    options, urls = parser.parse_args(args)

    if not urls:
        # prints usage and exits
        parser.error('no URL given')
    url = urls[0]

    if options.debug:
        dll = logging.DEBUG
    else:
        dll = logging.INFO

    c = CSSCapture(defaultloglevel=dll)

    stylesheetlist = c.capture(url, ua=options.ua)

    # store_true options are None if not given, so truthiness suffices
    if not options.notsave:
        saveto = options.saveto or '_CSSCapture_SAVED'
        c.saveto(saveto, saveparsed=options.saveparsed)
    else:
        for i, s in enumerate(stylesheetlist):
            # a single parenthesized argument keeps the output identical
            # and is valid both as print statement and print function
            print('%s \tTitle: "%s", \n\thref: "%s"\n' % (
                i + 1, s.title, s.href))
375
376
# Run the command line interface when executed as a script. main() has no
# return statement, so sys.exit() is called with None unless main() has
# already exited itself.
if __name__ == "__main__":
    sys.exit(main())
379