
Source Code for Module cssutils.scripts.csscapture

#!/usr/bin/env python
"""
Retrieve all CSS stylesheets including embedded for a given URL.
Retrieve as StyleSheetList or save to disk.

TODO:
    @import
    save all

    maybe use DOM 3 load/save?

    logger class which handles all cases when no log is given...

    saveto:
        why does urllib2 hang?
"""
__all__ = ['CSSCapture']
__docformat__ = 'restructuredtext'
__author__ = '$LastChangedBy: cthedot $'
__date__ = '$LastChangedDate: 2007-10-27 22:48:36 +0200 (Sa, 27 Okt 2007) $'
__version__ = '$LastChangedRevision: 594 $'

import errno
import HTMLParser
import logging
import os
import sys
import urllib
import urllib2
import urlparse
import xml.dom

import cssutils
from cssutils import css, stylesheets

try:
    import encutils
except ImportError:
    try:
        import cssutils.encutils as encutils
    except ImportError:
        sys.exit("You need encutils from http://cthedot.de/encutils/")

class CSSCaptureHTMLParser(HTMLParser.HTMLParser):
    """ parses given data for link and style elements """
    curtag = u''
    links = []   # list of attrsdict
    styles = []  # list of (attrsdict, data)

    def _lowerattrs(self, attrs):
        return dict([(a.lower(), v.lower()) for a, v in attrs])

    def handle_starttag(self, tag, attrs):
        if tag == u'link':
            attrs = self._lowerattrs(attrs)
            if attrs.get(u'type', u'') == u'text/css':
                self.links.append(attrs)
        # also get content of tag
        elif tag == u'style':
            attrs = self._lowerattrs(attrs)
            if attrs.get(u'type', u'') == u'text/css':
                self.styles.append((attrs, u''))
                self.curtag = tag
        else:
            # close as style cannot contain any elements
            self.curtag = u''

    def handle_data(self, data):
        if self.curtag == u'style':
            self.styles[-1] = (self.styles[-1][0], data)

    def handle_comment(self, data):
        # style might have comment content, treat same as data
        self.handle_data(data)

    def handle_endtag(self, tag):
        # close as style cannot contain any elements
        self.curtag = u''

class CSSCapture(object):
    """
    Retrieve all CSS stylesheets including embedded for a given URL.
    The User-Agent used for retrieval may optionally be set to handle
    browser-sniffing servers.

    raises urllib2.HTTPError
    """
    def __init__(self, ua=None, log=None, defaultloglevel=logging.INFO):
        """
        initialize a new Capture object

        ua
            User-Agent to use for requests
        log
            supply a log object which is used instead of the default
            log which writes to sys.stderr
        defaultloglevel
            constant of the logging package which defines the level of
            the default log if no explicit log is given
        """
        self._ua = ua
        self._parser = CSSCaptureHTMLParser()

        if log:
            self._log = log
        else:
            self._log = logging.getLogger('CSSCapture')
            hdlr = logging.StreamHandler(sys.stderr)
            formatter = logging.Formatter('%(message)s')
            hdlr.setFormatter(formatter)
            self._log.addHandler(hdlr)
            self._log.setLevel(defaultloglevel)
            self._log.debug(u'(C) Using default log')

    def _doRequest(self, url):
        """
        Does an HTTP request

        Returns: (response, url)

        url might have been changed by server due to redirects etc
        """
        self._log.debug(u'    CSSCapture._doRequest URL: %s' % url)

        req = urllib2.Request(url)
        if self._ua:
            req.add_header('User-agent', self._ua)
            self._log.info('    Using User-Agent: %s', self._ua)
        try:
            res = urllib2.urlopen(req)
        except urllib2.HTTPError, e:
            self._log.critical('    %s\n%s %s\n%s' % (
                e.geturl(), e.code, e.msg, e.headers))
            return None, None

        # get real url
        if url != res.geturl():
            url = res.geturl()
            self._log.info('    URL retrieved: %s', url)

        return res, url

    def _doImports(self, parentStyleSheet, baseurl=None):
        """
        handle all @import CSS stylesheets recursively
        found CSS stylesheets are appended to stylesheetlist
        """
        for rule in parentStyleSheet.cssRules:
            if rule.type == css.CSSRule.IMPORT_RULE:

                href = urlparse.urljoin(baseurl, rule.href)
                media = rule.media
                res, href = self._doRequest(href)
                if not res:
                    continue
                cssText = res.read()
                sheet = css.CSSStyleSheet(
                    href=href,
                    media=media,
                    parentStyleSheet=parentStyleSheet
                    )
                self.stylesheetlist.append(sheet)

                self._log.info(
                    '\n--- FOUND @import in: %s ---' % parentStyleSheet)
                self._log.info('    * full href : %s', href)
                self._log.info('    * media     : %s', media.mediaText)
                self._log.info('    * stylesheet: %s\n' % sheet)
                self._log.debug('    * cssText   :\n%s\n', cssText)

                try:
                    sheet.cssText = cssText
                except xml.dom.DOMException, e:
                    self._log.warn('CSSParser message:\n%s\n' % e)
                self._doImports(sheet, baseurl=href)

    def _findStyleSheets(self, docurl, doctext):
        """
        parse text for stylesheets
        fills stylesheetlist with all found StyleSheets

        docurl
            to build a full url of found StyleSheets @href
        doctext
            to parse
        """
        self._parser.feed(doctext)

        # <link>ed stylesheets
        # ownerNode should be set to the <link> node
        for link in self._parser.links:

            href = urlparse.urljoin(docurl, link.get(u'href', u''))
            media = stylesheets.MediaList(link.get(u'media', u''))
            res, href = self._doRequest(href)
            if not res:
                continue
            cssText = res.read()
            sheet = css.CSSStyleSheet(
                href=href,
                media=media,
                title=link.get(u'title', u''),
                )
            self.stylesheetlist.append(sheet)

            self._log.info('\n--- FOUND <link>: %s ---', link)
            self._log.info('    * full href : %s', href)
            self._log.info('    * media     : %s', media.mediaText)
            self._log.info('    * stylesheet: %s\n' % sheet)
            self._log.debug('    * cssText   :\n%s\n', cssText)

            try:
                sheet.cssText = cssText
            except xml.dom.DOMException, e:
                self._log.warn('CSSParser message:\n%s\n' % e)
            self._doImports(sheet, baseurl=docurl)

        # internal <style>sheets
        # href is None for internal stylesheets
        # ownerNode should be set to the <style> node
        for style in self._parser.styles:

            stylemeta, cssText = style
            media = stylesheets.MediaList(stylemeta.get(u'media', u''))
            sheet = css.CSSStyleSheet(
                href=None,
                media=media,
                title=stylemeta.get(u'title', u''),
                )
            self.stylesheetlist.append(sheet)

            self._log.info('\n--- FOUND <style>: %s ---', stylemeta)
            self._log.info('    stylesheet : %s' % sheet)
            self._log.info('    media      : %s\n', media.mediaText)
            self._log.debug('    cssText    :\n%s\n', cssText)

            try:
                sheet.cssText = cssText
            except xml.dom.DOMException, e:
                self._log.warn('CSSParser message:\n%s\n' % e)
            self._doImports(sheet, baseurl=docurl)

    def capture(self, url, ua=None):
        """
        Capture stylesheets for the given url, any HTTPError is raised to
        caller.

        url
            to capture CSS from
        ua
            User-Agent to use for requests

        Returns StyleSheetList.
        """
        if ua is not None:
            self._ua = ua

        # used to save inline styles
        scheme, loc, path, query, fragment = urlparse.urlsplit(url)
        self._filename = os.path.basename(path)

        self.stylesheetlist = stylesheets.StyleSheetList()

        self._log.info('\nCapturing CSS from URL: %s\n', url)

        # get url content
        res, url = self._doRequest(url)
        if not res:
            sys.exit(1)
        rawdoc = res.read()

        encoding = encutils.getEncodingInfo(
            res, rawdoc, log=self._log).encoding
        self._log.info('\nUsing Encoding: %s\n', encoding)

        doctext = unicode(rawdoc, encoding)

        # fill list of stylesheets
        self._findStyleSheets(url, doctext)

        return self.stylesheetlist

    def saveto(self, dir, saveparsed=False):
        """
        saves css in "dir" in the same layout as on the server
        internal stylesheets are saved as "dir/__INLINE_STYLE__.html.css"

        dir
            directory to save files to
        saveparsed
            use literal CSS from server or use the parsed version

            you may want to use the server version until CSSParser is more
            stable or if you want to keep the stylesheet exactly as is
        """
        inlines = 0
        for sheet in self.stylesheetlist:

            url = sheet.href
            if not url:
                url = '%s_INLINE_%s.css' % (
                    self._filename, inlines)
                inlines += 1

            #if saveparsed:
            cssutils.ser.prefs.keepAllProperties = True
            cssText = sheet.cssText
            #else:
            #    cssText = sheet.literalCssText

            # build savepath
            scheme, loc, path, query, fragment = urlparse.urlsplit(url)
            # no absolute path
            if path and path.startswith('/'):
                path = path[1:]
            path = os.path.normpath(path)
            path, fn = os.path.split(path)

            savepath = os.path.join(dir, loc, path)
            savefn = os.path.join(savepath, fn)

            try:
                os.makedirs(savepath)
            except OSError, e:
                if e.errno != errno.EEXIST:
                    raise e
                self._log.debug('Path "%s" already exists.', savepath)

            open(savefn, 'w').write(cssText)
            self._log.info('Saving "%s"', savefn)

def main(args=None):
    import optparse

    usage = "usage: %prog [options] URL"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-u', '--useragent', action='store', dest='ua',
        help='useragent to use for request of URL, default is urllib2s default')
    parser.add_option('-s', '--saveto', action='store', dest='saveto',
        help='saving retrieved files to "saveto", default to "_CSSCapture_SAVED"')
    parser.add_option('-p', '--saveparsed', action='store_true', dest='saveparsed',
        help='if given saves cssutils\' parsed files, otherwise original retrieved files')
    parser.add_option('-n', '--notsave', action='store_true', dest='notsave',
        help='if given files are NOT saved, only log is written')
    parser.add_option('-d', '--debug', action='store_true', dest='debug',
        help='show debug messages during capturing')
    options, url = parser.parse_args()

    if not url:
        parser.error('no URL given')
    else:
        url = url[0]

    if options.debug:
        dll = logging.DEBUG
    else:
        dll = logging.INFO

    # START
    c = CSSCapture(defaultloglevel=dll)

    stylesheetlist = c.capture(url, ua=options.ua)

    if options.notsave is None or not options.notsave:
        if options.saveto:
            saveto = options.saveto
        else:
            saveto = '_CSSCapture_SAVED'
        c.saveto(saveto, saveparsed=options.saveparsed)
    else:
        for i, s in enumerate(stylesheetlist):
            print i + 1, '\tTitle: "%s", \n\thref: "%s"\n' % (s.title, s.href)


if __name__ == "__main__":
    sys.exit(main())
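
Usage note: beyond the command-line entry point above, CSSCapture can be used directly from Python. The following is a minimal sketch, not part of the module source; the URL, User-Agent string, and output directory are placeholder values, and it assumes cssutils (with encutils) is importable and the page is reachable.

# Minimal usage sketch (Python 2); URL, User-Agent and output directory
# are placeholders, not defaults defined by the module.
import logging
from cssutils.scripts.csscapture import CSSCapture

c = CSSCapture(ua='Mozilla/5.0 (compatible; csscapture)',
               defaultloglevel=logging.INFO)
stylesheetlist = c.capture('http://example.com/')

# list what was found, mirroring the --notsave branch of main()
for i, s in enumerate(stylesheetlist):
    print i + 1, '\tTitle: "%s", \n\thref: "%s"\n' % (s.title, s.href)

# save the captured stylesheets in the server's layout under a local directory
c.saveto('_CSSCapture_SAVED', saveparsed=False)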