"""New CSS Tokenizer (a generator)

TODO: check selectors module tokenizer


test:
    - r'\" \('
    - r'\1 \22 \333 \4444 \55555 \666666 \777777 7 \7777777'
    - r'#abc #123'

    - longer tokens before shorter
        1px -> 1

    - escapes
        c\olor is one token?
        1p\\x = 1PX = 1px?

    - num: -0 are two tokens?

"""
__all__ = ['Tokenizer', 'CSSProductions']
__docformat__ = 'restructuredtext'
__author__ = '$LastChangedBy: cthedot $'
__date__ = '$LastChangedDate: 2007-09-01 15:55:42 +0200 (Sa, 01 Sep 2007) $'
__version__ = '$LastChangedRevision: 300 $'

import os
import re
import string
import xml.dom
import cssutils
import util
from cssproductions import *


class Tokenizer(object):
    """
    generates a list of Token tuples:
        (Tokenname, value, startline, startcolumn)
    """
    nl = os.linesep

    def __init__(self, macros=None, productions=None):
        """
        inits the tokenizer with the given macros and productions which
        default to cssutils' own MACROS and PRODUCTIONS
        """
        self.log = cssutils.log
        if not macros:
            macros = MACROS
        if not productions:
            productions = PRODUCTIONS
        self.tokenmatches = self._compile_productions(
            self._expand_macros(macros, productions))
        # kept separately to complete unclosed COMMENT and URI tokens
        # in fullsheet mode (see tokenize())
        self.commentmatcher = [x[1] for x in self.tokenmatches if x[0] == 'COMMENT'][0]
        self.urimatcher = [x[1] for x in self.tokenmatches if x[0] == 'URI'][0]

    def _expand_macros(self, macros, productions):
        """returns macro expanded productions, order of productions is kept"""
        def macro_value(m):
            return '(?:%s)' % macros[m.groupdict()['macro']]

        expanded = []
        for key, value in productions:
            # expand until no macro reference ({name}) is left; macros
            # may refer to other macros themselves
            while re.search(r'{[a-zA-Z][a-zA-Z0-9-]*}', value):
                value = re.sub(r'{(?P<macro>[a-zA-Z][a-zA-Z0-9-]*)}',
                               macro_value, value)
            expanded.append((key, value))
        return expanded
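
    # example of the expansion (hypothetical macros, a sketch only):
    #   macros = {'h': '[0-9a-f]', 'unicode': r'\\{h}{1,6}'}
    #   productions = [('ESCAPE', '{unicode}')]
    # expands to [('ESCAPE', r'(?:\\(?:[0-9a-f]){1,6})')]; quantifiers
    # like {1,6} are left alone as macro names must start with a letter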

    def _compile_productions(self, expanded_productions):
        """compile productions into callable match objects, order is kept"""
        compiled = []
        for key, value in expanded_productions:
            compiled.append((key, re.compile('^(?:%s)' % value, re.U).match))
        return compiled
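
    # each matcher can only ever match at the very start of the given
    # text due to the '^(?:...)' anchor, e.g. (sketch with a made-up
    # production):
    #   num = re.compile('^(?:[0-9]+)', re.U).match
    #   num('12px').group(0) == '12'; num('a12') is None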

    def tokenize(self, text, linesep=None, fullsheet=False):
        """
        generator: tokenizes text and yields tokens, each token is a tuple
        of (tokenname, tokenvalue, line, col)

        text
            to be tokenized
        linesep
            used to detect the line number, defaults to os.linesep
        fullsheet
            if ``True`` appends an EOF token as the last one and completes
            incomplete STRING, COMMENT and URI tokens
        """
        if not linesep:
            linesep = os.linesep
        line = col = 1

        while text:
            for name, matcher in self.tokenmatches:

                # fullsheet: an unclosed COMMENT is completed with u'*/'
                # before the shorter CHAR u'/' would match
                if fullsheet and name == 'CHAR' and text.startswith(u'/*'):
                    possiblecomment = u'%s*/' % text
                    match = self.commentmatcher(possiblecomment)
                    if match:
                        yield ('COMMENT', possiblecomment, line, col)
                        text = None
                        break

                # default: match this production at the start of text
                match = matcher(text)
                if match:
                    found = match.group(0)

                    if fullsheet:
                        # complete incomplete tokens at the end of the sheet
                        if 'INVALID' == name and text == found:
                            # complete the unclosed STRING with its own
                            # first (quote) character
                            name = 'STRING'
                            found = '%s%s' % (found, found[0])

                        elif ('FUNCTION' == name and
                              u'url(' == util.Base._normalize(found)):
                            # url( without closing ): try each possible
                            # ending to complete a full URI token
                            for end in (u"')", u'")', u')'):
                                possibleuri = '%s%s' % (text, end)
                                match = self.urimatcher(possibleuri)
                                if match:
                                    name = 'URI'
                                    found = match.group(0)
                                    break

                    yield (name, found, line, col)
                    text = text[len(found):]
                    nls = found.count(linesep)
                    line += nls
                    if nls:
                        col = len(found[found.rfind(linesep):])
                    else:
                        col += len(found)
                    break

            else:
                # no production matched at all which should never happen
                # with a complete set of productions
                raise xml.dom.SyntaxErr('no token match "%s(...)"' % text[:10])

        if fullsheet:
            yield ('EOF', u'', line, col)
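

if __name__ == '__main__':
    # minimal usage sketch (assumes cssproductions provides MACROS and
    # PRODUCTIONS as imported above); prints one tuple per token
    tokenizer = Tokenizer()
    for token in tokenizer.tokenize(u'a { color: red }', fullsheet=True):
        print token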