1
2
"""Python codec for CSS."""
# Module metadata; ``__docformat__`` is assigned exactly once.
__docformat__ = 'restructuredtext'
__author__ = 'Walter Doerwald'
__version__ = '$Id: util.py 1114 2008-03-05 13:22:59Z cthedot $'
8
9 import codecs, marshal
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
def _detectencoding_str(input, final=False):
    """
    Detect the encoding of the byte string ``input``, which contains the
    beginning of a CSS file. To detect the encoding the first few bytes are
    used (or if ``input`` is ASCII compatible and starts with a charset rule
    the encoding name from the rule).

    If the encoding can't be detected yet, ``None`` is returned. ``final``
    specifies whether more data is available in later calls or not. If ``final``
    is true, ``_detectencoding_str()`` will never return ``None``.

    Input that begins like a charset rule (``@cha...``) but then deviates
    from ``@charset "`` is reported as ``"utf-8"`` immediately instead of
    waiting for ``final``; this matches ``_detectencoding_unicode()``.
    """
    # One bit per hypothesis that has not been ruled out yet. The "AS"
    # variants are the byte order marks (FF FE, FE FF, FF FE 00 00,
    # 00 00 FE FF); the plain LE/BE variants are a BOM-less "@c..." start
    # in the respective 16/32 bit encoding.
    CANDIDATE_UTF_8_SIG = 1
    CANDIDATE_UTF_16_AS_LE = 2
    CANDIDATE_UTF_16_AS_BE = 4
    CANDIDATE_UTF_16_LE = 8
    CANDIDATE_UTF_16_BE = 16
    CANDIDATE_UTF_32_AS_LE = 32
    CANDIDATE_UTF_32_AS_BE = 64
    CANDIDATE_UTF_32_LE = 128
    CANDIDATE_UTF_32_BE = 256
    CANDIDATE_CHARSET = 512

    candidates = 1023  # all ten bits set: nothing ruled out yet

    # For each of the first four byte positions: pairs of (expected byte,
    # candidates that are ruled out if the byte is anything else).
    rules = (
        (
            ("\xef", CANDIDATE_UTF_8_SIG),
            ("\xff", CANDIDATE_UTF_32_AS_LE | CANDIDATE_UTF_16_AS_LE),
            ("\xfe", CANDIDATE_UTF_16_AS_BE),
            ("@", CANDIDATE_UTF_32_LE | CANDIDATE_UTF_16_LE | CANDIDATE_CHARSET),
            ("\x00", CANDIDATE_UTF_32_AS_BE | CANDIDATE_UTF_32_BE | CANDIDATE_UTF_16_BE),
        ),
        (
            ("\xbb", CANDIDATE_UTF_8_SIG),
            ("\xfe", CANDIDATE_UTF_16_AS_LE | CANDIDATE_UTF_32_AS_LE),
            ("\xff", CANDIDATE_UTF_16_AS_BE),
            ("\x00", CANDIDATE_UTF_16_LE | CANDIDATE_UTF_32_AS_BE
                     | CANDIDATE_UTF_32_LE | CANDIDATE_UTF_32_BE),
            ("@", CANDIDATE_UTF_16_BE),
            ("c", CANDIDATE_CHARSET),
        ),
        (
            ("\xbf", CANDIDATE_UTF_8_SIG),
            ("c", CANDIDATE_UTF_16_LE),
            ("\x00", CANDIDATE_UTF_32_AS_LE | CANDIDATE_UTF_32_LE | CANDIDATE_UTF_32_BE),
            ("\xfe", CANDIDATE_UTF_32_AS_BE),
            ("h", CANDIDATE_CHARSET),
        ),
        (
            ("\x00", CANDIDATE_UTF_16_LE | CANDIDATE_UTF_32_AS_LE | CANDIDATE_UTF_32_LE),
            ("\xff", CANDIDATE_UTF_32_AS_BE),
            ("@", CANDIDATE_UTF_32_BE),
            ("a", CANDIDATE_CHARSET),
        ),
    )

    li = len(input)
    for (pos, checks) in enumerate(rules[:li]):
        c = input[pos]
        if pos == 3 and input[2:4] == "\x00\x00":
            # FF FE 00 00 is the UTF-32 BOM, not a UTF-16 BOM
            candidates &= ~CANDIDATE_UTF_16_AS_LE
        for (expected, excluded) in checks:
            if c != expected:
                candidates &= ~excluded

    if candidates == 0:
        # neither a BOM nor the start of a charset rule
        return "utf-8"

    # Exactly one hypothesis left => (bytes required to be sure, result).
    # A lookup with several bits still set simply misses the table.
    resolved = {
        CANDIDATE_UTF_8_SIG: (3, "utf-8-sig"),
        CANDIDATE_UTF_16_AS_LE: (2, "utf-16"),
        CANDIDATE_UTF_16_AS_BE: (2, "utf-16"),
        CANDIDATE_UTF_16_LE: (4, "utf-16-le"),
        CANDIDATE_UTF_16_BE: (2, "utf-16-be"),
        CANDIDATE_UTF_32_AS_LE: (4, "utf-32"),
        CANDIDATE_UTF_32_AS_BE: (4, "utf-32"),
        CANDIDATE_UTF_32_LE: (4, "utf-32-le"),
        CANDIDATE_UTF_32_BE: (4, "utf-32-be"),
    }
    hit = resolved.get(candidates)
    if hit is not None and li >= hit[0]:
        return hit[1]

    if candidates == CANDIDATE_CHARSET and li >= 4:
        prefix = '@charset "'
        if input[:len(prefix)] == prefix:
            pos = input.find('"', len(prefix))
            if pos >= 0:
                return input[len(prefix):pos]
            # closing quote not seen yet => fall through and wait
        elif not prefix.startswith(input):
            # starts with "@cha" but can no longer become a charset rule
            return "utf-8"

    # if this is the last call and nothing was determined, default to UTF-8
    if final:
        return "utf-8"
    return None  # don't know yet
143
144
def _detectencoding_unicode(input, final=False):
    """
    Detect the encoding of the unicode string ``input``, which contains the
    beginning of a CSS file. The encoding is detected from the charset rule
    at the beginning of ``input``. If there is no charset rule, ``"utf-8"``
    will be returned.

    If the encoding can't be detected yet, ``None`` is returned. ``final``
    specifies whether more data will be available in later calls or not. If
    ``final`` is true, ``_detectencoding_unicode()`` will never return ``None``
    (an unterminated charset rule then yields ``"utf-8"``).
    """
    prefix = u'@charset "'
    if input.startswith(prefix):
        pos = input.find(u'"', len(prefix))
        if pos >= 0:
            return input[len(prefix):pos]
        # The rule has started but the closing quote is missing. With more
        # data to come we wait; at the very end we fall back to the default
        # so that the documented "never None when final" contract holds.
        if final:
            return "utf-8"
        return None
    if final or not prefix.startswith(input):
        # either this is the last call, or ``input`` can no longer turn
        # into a charset rule => default to UTF-8
        return "utf-8"
    return None  # don't know yet
166
167
def _fixencoding(input, encoding, final=False):
    """
    Replace the name of the encoding in the charset rule at the beginning of
    ``input`` with ``encoding``. If ``input`` doesn't start with a charset
    rule, ``input`` will be returned unmodified.

    If the encoding can't be found yet, ``None`` is returned. ``final``
    specifies whether more data will be available in later calls or not.
    If ``final`` is true, ``_fixencoding()`` will never return ``None``.
    """
    prefix = u'@charset "'
    if input.startswith(prefix):
        closing = input.find(u'"', len(prefix))
        if closing >= 0:
            # the rule must never announce the BOM variant of UTF-8
            if encoding.replace("_", "-").lower() == "utf-8-sig":
                encoding = u"utf-8"
            return prefix + encoding + input[closing:]
        # end of the encoding name not seen yet
    elif not prefix.startswith(input):
        # can't turn out to be a charset rule later => nothing to fix
        return input
    # ``input`` is (so far) an incomplete charset rule
    if final:
        return input
    return None
195
196
197 -def decode(input, errors="strict", encoding=None):
204
205
206 -def encode(input, errors="strict", encoding=None):
218
219
def _bytes2int(bytes):
    """
    Interpret the characters of the byte string ``bytes`` as the base-256
    digits of a non-negative integer (most significant digit first) and
    return that integer. An empty string gives ``0``.
    """
    # the last character is the least significant digit, so walk backwards
    # and shift each digit to its place value
    return sum(
        ord(digit) << (8 * place)
        for (place, digit) in enumerate(reversed(bytes))
    )
226
227
def _int2bytes(i):
    """
    Return the non-negative integer ``i`` as a byte string holding its
    base-256 digits, most significant digit first. ``0`` gives ``""``.

    Leading zero digits cannot be represented, so this only reverses
    ``_bytes2int()`` for strings that do not start with a NUL byte.

    Raises ``ValueError`` for a negative ``i`` (shifting a negative number
    right never reaches zero, so the loop below would not terminate).
    """
    if i < 0:
        raise ValueError("can't convert negative integer %r to bytes" % (i,))
    digits = []
    while i:
        # collect least significant digit first; appending is O(1), whereas
        # inserting at the front would make the loop quadratic
        digits.append(chr(i & 0xff))
        i >>= 8
    digits.reverse()
    return "".join(digits)
235
236
237 if hasattr(codecs, "IncrementalDecoder"):
239 - def __init__(self, errors="strict", encoding=None):
248
250 for part in input:
251 result = self.decode(part, False)
252 if result:
253 yield result
254 result = self.decode("", True)
255 if result:
256 yield result
257
def decode(self, input, final=False):
    """
    Decode the next chunk ``input`` of a CSS byte stream and return the
    resulting text.

    While no decoder exists yet, input is collected in ``self.buffer`` and
    passed to ``_detectencoding_str()``. Once an encoding is found, an
    incremental decoder for it is created. Decoded text is then held back
    in ``self.buffer`` until ``_fixencoding()`` has handled the charset
    rule; from then on (``self.headerfixed``) chunks are passed straight
    through. ``u""`` is returned whenever output has to be held back.

    ``final`` must be true for the last chunk.

    Raises ``ValueError`` if the detected encoding is this codec itself.
    """
    # Buffering is implemented here instead of in a buffered base class
    # because the buffer only matters until the encoding has been detected.
    if self.decoder is None:
        input = self.buffer + input
        self.encoding = _detectencoding_str(input, final)
        if self.encoding is None:
            self.buffer = input  # retry the complete input on the next call
            return u""  # no encoding determined yet, so no output
        # Codec names are looked up case-insensitively, so "CSS" would find
        # this codec again and recurse; compare case-insensitively too.
        if self.encoding.lower() == "css":
            raise ValueError("css not allowed as encoding name")
        self.buffer = ""  # drop buffer, the real decoder keeps its own state
        decoder = codecs.getincrementaldecoder(self.encoding)
        self.decoder = decoder(self._errors)
    if self.headerfixed:
        return self.decoder.decode(input, final)
    # The header isn't fixed yet; from here on ``self.buffer`` holds
    # already decoded text.
    output = self.buffer + self.decoder.decode(input, final)
    encoding = self.encoding
    if encoding.replace("_", "-").lower() == "utf-8-sig":
        encoding = "utf-8"
    newoutput = _fixencoding(output, unicode(encoding), final)
    if newoutput is None:
        # retry fixing the charset rule later (but keep the decoded text)
        self.buffer = output
        return u""
    self.headerfixed = True
    return newoutput
290
292 codecs.IncrementalDecoder.reset(self)
293 self.decoder = None
294 self.buffer = ""
295 self.headerfixed = False
296
299
301
302 if self.decoder is not None:
303 self.decoder.errors = errors
304 self._errors = errors
305 errors = property(_geterrors, _seterrors)
306
def getstate(self):
    """
    Return the decoder state as the pair ``("", number)``.

    ``number`` is the marshalled tuple ``(encoding, buffer, headerfixed,
    has_decoder, decoder_state)`` converted to an integer by
    ``_bytes2int()``; ``decoder_state`` is ``None`` when no underlying
    decoder has been created yet.
    """
    has_decoder = self.decoder is not None
    inner = self.decoder.getstate() if has_decoder else None
    snapshot = (self.encoding, self.buffer, self.headerfixed, has_decoder, inner)
    return ("", _bytes2int(marshal.dumps(snapshot)))
313
def setstate(self, state):
    """
    Restore the decoder from ``state``, a pair whose second item is the
    integer produced by ``getstate()`` (a marshalled tuple ``(encoding,
    buffer, headerfixed, has_decoder, decoder_state)`` run through
    ``_bytes2int()``).
    """
    # Undo getstate() in reverse order: integer -> marshal bytes -> tuple.
    # NOTE(review): relies on the marshal data not starting with a NUL
    # byte, since ``_int2bytes()`` cannot restore leading zero bytes.
    state = marshal.loads(_int2bytes(state[1]))
    self.encoding = state[0]
    self.buffer = state[1]
    self.headerfixed = state[2]
    # state[3] is a boolean flag, so test its truth value (an
    # ``is not None`` test would always succeed)
    if state[3]:
        self.decoder = codecs.getincrementaldecoder(self.encoding)(self._errors)
        self.decoder.setstate(state[4])
    else:
        self.decoder = None
324
325
326 if hasattr(codecs, "IncrementalEncoder"):
328 - def __init__(self, errors="strict", encoding=None):
336
338 for part in input:
339 result = self.encode(part, False)
340 if result:
341 yield result
342 result = self.encode(u"", True)
343 if result:
344 yield result
345
def encode(self, input, final=False):
    """
    Encode the next chunk ``input`` of CSS text and return the bytes.

    Until an encoder exists, text is collected in ``self.buffer``. If an
    encoding was given (``self.encoding``), the charset rule at the start
    of the text is rewritten to it via ``_fixencoding()``; otherwise the
    encoding is taken from the charset rule via
    ``_detectencoding_unicode()``. ``""`` is returned while the decision
    has to be postponed.

    ``final`` must be true for the last chunk.

    Raises ``ValueError`` if the encoding is this codec itself.
    """
    if self.encoder is None:
        input = self.buffer + input
        if self.encoding is not None:
            # Replace the encoding in the charset rule with the given one
            encoding = self.encoding
            if encoding.replace("_", "-").lower() == "utf-8-sig":
                encoding = "utf-8"
            newinput = _fixencoding(input, unicode(encoding), final)
            if newinput is None:  # rule incomplete => retry next time
                self.buffer = input
                return ""
            input = newinput
        else:
            # Use the encoding from the charset rule
            self.encoding = _detectencoding_unicode(input, final)
        if self.encoding is not None:
            # Codec names are looked up case-insensitively, so "CSS" would
            # find this codec again and recurse.
            if self.encoding.lower() == "css":
                raise ValueError("css not allowed as encoding name")
            info = codecs.lookup(self.encoding)
            if self.encoding.replace("_", "-").lower() == "utf-8-sig":
                # the rule itself must name plain UTF-8
                input = _fixencoding(input, u"utf-8", True)
            self.encoder = info.incrementalencoder(self._errors)
            self.buffer = u""
        else:
            self.buffer = input
            return ""
    return self.encoder.encode(input, final)
375
380
383
385
386 if self.encoder is not None:
387 self.encoder.errors = errors
388 self._errors = errors
389 errors = property(_geterrors, _seterrors)
390
def getstate(self):
    """
    Return the encoder state as an integer: the marshalled tuple
    ``(encoding, buffer, has_encoder, encoder_state)`` converted by
    ``_bytes2int()``. ``encoder_state`` is ``None`` when no underlying
    encoder has been created yet.
    """
    has_encoder = self.encoder is not None
    inner = self.encoder.getstate() if has_encoder else None
    snapshot = (self.encoding, self.buffer, has_encoder, inner)
    return _bytes2int(marshal.dumps(snapshot))
397
def setstate(self, state):
    """
    Restore the encoder from ``state``, the integer produced by
    ``getstate()`` (a marshalled tuple ``(encoding, buffer, has_encoder,
    encoder_state)`` run through ``_bytes2int()``).
    """
    # Undo getstate() in reverse order: integer -> marshal bytes -> tuple.
    # NOTE(review): relies on the marshal data not starting with a NUL
    # byte, since ``_int2bytes()`` cannot restore leading zero bytes.
    state = marshal.loads(_int2bytes(state))
    self.encoding = state[0]
    self.buffer = state[1]
    # state[2] is a boolean flag; the underlying encoder's state is the
    # fourth and last item of the tuple
    if state[2]:
        self.encoder = codecs.getincrementalencoder(self.encoding)(self._errors)
        self.encoder.setstate(state[3])
    else:
        self.encoder = None
407
408
410 - def __init__(self, stream, errors="strict", encoding=None, header=False):
416
def encode(self, input, errors='strict'):
    """
    Encode the CSS text ``input`` and return ``(bytes, length)``.

    Until a stream writer for the real encoding exists, text is collected
    in ``self.buffer``. If an encoding was given (``self.encoding``), the
    charset rule at the start of the text is rewritten to it via
    ``_fixencoding()``; otherwise the encoding is taken from the charset
    rule via ``_detectencoding_unicode()``. ``("", 0)`` is returned while
    the decision has to be postponed.

    Raises ``ValueError`` if the encoding is this codec itself.
    """
    li = len(input)
    if self.streamwriter is None:
        input = self.buffer + input
        li = len(input)  # the reported length includes the buffered text
        if self.encoding is not None:
            # Replace the encoding in the charset rule with the given one
            encoding = self.encoding
            if encoding.replace("_", "-").lower() == "utf-8-sig":
                encoding = "utf-8"
            newinput = _fixencoding(input, unicode(encoding), False)
            if newinput is None:  # rule incomplete => retry next time
                self.buffer = input
                return ("", 0)
            input = newinput
        else:
            # Use the encoding from the charset rule
            self.encoding = _detectencoding_unicode(input, False)
        if self.encoding is not None:
            # Codec names are looked up case-insensitively, so "CSS" would
            # find this codec again and recurse.
            if self.encoding.lower() == "css":
                raise ValueError("css not allowed as encoding name")
            self.streamwriter = codecs.getwriter(self.encoding)(self.stream, self._errors)
            if self.encoding.replace("_", "-").lower() == "utf-8-sig":
                # the rule itself must name plain UTF-8
                input = _fixencoding(input, u"utf-8", True)
            self.buffer = u""
        else:
            self.buffer = input
            return ("", 0)
    return (self.streamwriter.encode(input, errors)[0], li)
447
450
452
453 if self.streamwriter is not None:
454 self.streamwriter.errors = errors
455 self._errors = errors
456 errors = property(_geterrors, _seterrors)
457
458
460 - def __init__(self, stream, errors="strict", encoding=None):
465
def decode(self, input, errors='strict'):
    """
    Decode the CSS bytes ``input`` and return ``(text, consumed)``.

    Until a stream reader for the real encoding has been adopted, the
    encoding is detected with ``_detectencoding_str()`` and the charset
    rule in the decoded text is rewritten with ``_fixencoding()``.
    ``(u"", 0)`` is returned (nothing consumed) while either step needs
    more data; the caller is then expected to retry with more input.

    Raises ``ValueError`` if the detected encoding is this codec itself.
    """
    if self.streamreader is None:
        self.encoding = _detectencoding_str(input, False)
        if self.encoding is None:
            return (u"", 0)  # no encoding determined yet, so no output
        # Codec names are looked up case-insensitively, so "CSS" would find
        # this codec again and recurse.
        if self.encoding.lower() == "css":
            raise ValueError("css not allowed as encoding name")
        streamreader = codecs.getreader(self.encoding)
        streamreader = streamreader(self.stream, self._errors)
        (output, consumed) = streamreader.decode(input, errors)
        encoding = self.encoding
        if encoding.replace("_", "-").lower() == "utf-8-sig":
            encoding = "utf-8"
        newoutput = _fixencoding(output, unicode(encoding), False)
        if newoutput is not None:
            # only keep the reader once the header has been handled
            self.streamreader = streamreader
            return (newoutput, consumed)
        return (u"", 0)  # a new stream reader is created on the next call
    return self.streamreader.decode(input, errors)
485
488
490
491 if self.streamreader is not None:
492 self.streamreader.errors = errors
493 self._errors = errors
494 errors = property(_geterrors, _seterrors)
495
496
497 if hasattr(codecs, "CodecInfo"):
498
510 else:
511
513 return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input))
514
def utf8sig_decode(input, errors='strict'):
    """
    Decode the UTF-8 byte string ``input``, skipping a leading UTF-8 byte
    order mark if there is one.

    Returns ``(text, consumed)`` where ``consumed`` counts the bytes of
    ``input`` that were used, including the three bytes of a skipped BOM.
    """
    bom_length = 0
    if input[:3] == codecs.BOM_UTF8:
        bom_length = 3
        input = input[3:]
    decoded, used = codecs.utf_8_decode(input, errors, True)
    return (decoded, used + bom_length)
522
530
531 - def encode(self, input, errors='strict'):
534
542
def decode(self, input, errors='strict'):
    """
    Decode ``input`` as UTF-8 with an optional leading byte order mark and
    return ``(text, consumed)``.

    As long as ``input`` is shorter than three bytes and could still be the
    beginning of a BOM, ``(u"", 0)`` is returned so the caller retries with
    more data. After the first real decode, ``self.decode`` is replaced by
    ``codecs.utf_8_decode`` so later calls no longer look for a BOM.
    """
    maybe_partial_bom = len(input) < 3 and codecs.BOM_UTF8.startswith(input)
    if not maybe_partial_bom:
        self.decode = codecs.utf_8_decode
        return utf8sig_decode(input, errors)
    # not enough data to decide whether this is a BOM => try again later
    return (u"", 0)
550
558
559
560 codecs.register(search_function)
561
562
563
564
def cssescape(exc):
    """
    Error handler that replaces unencodable characters with CSS escapes.

    ``exc`` must be a ``UnicodeEncodeError``. Each character in the failing
    range ``exc.object[exc.start:exc.end]`` is replaced by a backslash
    followed by its code point as six hexadecimal digits. Returns the pair
    ``(replacement, exc.end)``.

    Raises ``TypeError`` for any other kind of ``exc``.
    """
    if not isinstance(exc, UnicodeEncodeError):
        # wrap in a 1-tuple so that a tuple argument is formatted as a
        # single value instead of being unpacked by the % operator
        raise TypeError("don't know how to handle %r" % (exc,))
    failing = exc.object[exc.start:exc.end]
    return (u"".join(u"\\%06x" % ord(c) for c in failing), exc.end)
569
570 codecs.register_error("cssescape", cssescape)
571