Package cssutils :: Module tokenize2
[hide private]
[frames] | [no frames]

Source Code for Module cssutils.tokenize2

  1  #!/usr/bin/env python 
  2  """New CSS Tokenizer (a generator) 
  3   
  4  TODO: check selectors module tokenizer 
  5   
  6   
  7  test: 
  8      - r'\" \(' 
  9      - r'\1 \22 \333 \4444 \55555 \666666 \777777 7 \7777777' 
 10      - r'#abc #123' 
 11   
 12      - longer tokens before shorter 
 13          1px -> 1 
 14   
 15      - escapes 
 16          c\olor is one token? 
 17          1p\\x = 1PX = 1px? 
 18   
 19      - num: -0 are two tokens? 
 20   
 21  """ 
# Public names exported on ``from cssutils.tokenize2 import *``.
__all__ = ['Tokenizer', 'CSSProductions']
# Docstrings in this module use reStructuredText markup.
__docformat__ = 'restructuredtext'
# Subversion keyword-expanded revision metadata.
__author__ = '$LastChangedBy: cthedot $'
__date__ = '$LastChangedDate: 2007-09-01 15:55:42 +0200 (Sa, 01 Sep 2007) $'
__version__ = '$LastChangedRevision: 300 $'
 27   
 28  import os 
 29  import re 
 30  import string 
 31  import xml.dom 
 32  import cssutils 
 33  import util 
 34  from cssproductions import * 
 35   
class Tokenizer(object):
    """CSS tokenizer.

    ``tokenize()`` is a generator yielding Token tuples::

        (Tokenname, value, startline, startcolumn)
    """
    # default line separator, used when ``tokenize()`` gets no ``linesep``
    nl = os.linesep

    def __init__(self, macros=None, productions=None):
        """Init tokenizer with given macros and productions.

        Both default to cssutils' own ``MACROS`` and ``PRODUCTIONS``
        (imported from ``cssproductions``).
        """
        self.log = cssutils.log
        if not macros:
            macros = MACROS
        if not productions:
            productions = PRODUCTIONS
        self.tokenmatches = self._compile_productions(
            self._expand_macros(macros, productions))
        # kept separately: needed to complete partial COMMENT/URI tokens
        # when tokenizing a complete sheet (``tokenize(fullsheet=True)``)
        self.commentmatcher = [x[1] for x in self.tokenmatches
                               if x[0] == 'COMMENT'][0]
        self.urimatcher = [x[1] for x in self.tokenmatches
                           if x[0] == 'URI'][0]

    def _expand_macros(self, macros, productions):
        """Return macro-expanded productions, order of productions is kept.

        Each ``{name}`` placeholder in a production value is replaced by
        the corresponding macro wrapped in a non-capturing group; the
        replacement loops so macros may themselves contain macros.
        """
        def macro_value(m):
            return '(?:%s)' % macros[m.groupdict()['macro']]

        expanded = []
        for key, value in productions:
            while re.search(r'{[a-zA-Z][a-zA-Z0-9-]*}', value):
                value = re.sub(r'{(?P<macro>[a-zA-Z][a-zA-Z0-9-]*)}',
                               macro_value, value)
            expanded.append((key, value))
        return expanded

    def _compile_productions(self, expanded_productions):
        """Compile productions into callable match objects, order is kept."""
        compiled = []
        for key, value in expanded_productions:
            # anchored at the start so each matcher only matches a prefix
            compiled.append((key, re.compile('^(?:%s)' % value, re.U).match))
        return compiled

    def tokenize(self, text, linesep=None, fullsheet=False):
        """Generator: tokenize text and yield tokens.

        Each token is a tuple of ``(tokenname, tokenvalue, line, col)``.

        text
            to be tokenized
        linesep
            used to detect the linenumber, defaults to os.linesep
        fullsheet
            if ``True`` appends an EOF token as last one and completes
            incomplete COMMENT tokens (and INVALID -> STRING)
        """
        if not linesep:
            linesep = os.linesep
        line = col = 1

        while text:
            for name, matcher in self.tokenmatches:

                if fullsheet and name == 'CHAR' and text.startswith(u'/*'):
                    # reached only after all tokens except CHAR failed:
                    # try to close an incomplete comment
                    possiblecomment = u'%s*/' % text
                    match = self.commentmatcher(possiblecomment)
                    if match:
                        yield ('COMMENT', possiblecomment, line, col)
                        text = None  # input exhausted, ends the while loop
                        break

                # default
                match = matcher(text)
                if match:
                    found = match.group(0)

                    if fullsheet:
                        # check if tokens may be completed
                        if 'INVALID' == name and text == found:
                            # complete INVALID to STRING by repeating
                            # the opening quote at the end
                            name = 'STRING'
                            found = '%s%s' % (found, found[0])

                        elif 'FUNCTION' == name and\
                             u'url(' == util.Base._normalize(found):
                            # FUNCTION url( is fixed to URI if fullsheet
                            # FUNCTION production MUST BE after URI production!
                            for end in (u"')", u'")', u')'):
                                possibleuri = '%s%s' % (text, end)
                                match = self.urimatcher(possibleuri)
                                if match:
                                    name = 'URI'
                                    found = match.group(0)
                                    break

                    yield (name, found, line, col)
                    text = text[len(found):]
                    nls = found.count(linesep)
                    line += nls
                    if nls:
                        # restart the column count after the last newline;
                        # using len(linesep) keeps the count correct for
                        # multi-char separators such as '\r\n' (the old
                        # form over-counted by len(linesep) - 1 there)
                        col = len(found) - found.rfind(linesep) \
                              - len(linesep) + 1
                    else:
                        col += len(found)
                    break

            else:
                # no production matched at all: should not happen
                raise xml.dom.SyntaxErr('no token match "%s(...)"' % text[:10])

        if fullsheet:
            yield ('EOF', u'', line, col)