1
2
3 """Python codec for CSS."""
4 __docformat__ = 'restructuredtext'
5 __author__ = 'Walter Doerwald'
6 __version__ = '$Id: util.py 1114 2008-03-05 13:22:59Z cthedot $'
7
8 import codecs, marshal
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
30 """
31 Detect the encoding of the byte string ``input``, which contains the
32 beginning of a CSS file. To detect the encoding the first few bytes are
33 used (or if ``input`` is ASCII compatible and starts with a charset rule
34 the encoding name from the rule).
35
36 If the encoding can't be detected yet, ``None`` is returned. ``final``
37 specifies whether more data is available in later calls or not. If ``final``
38 is true, ``_detectencoding_str()`` will never return ``None``.
39 """
40
41
42 CANDIDATE_UTF_8_SIG = 1
43 CANDIDATE_UTF_16_AS_LE = 2
44 CANDIDATE_UTF_16_AS_BE = 4
45 CANDIDATE_UTF_16_LE = 8
46 CANDIDATE_UTF_16_BE = 16
47 CANDIDATE_UTF_32_AS_LE = 32
48 CANDIDATE_UTF_32_AS_BE = 64
49 CANDIDATE_UTF_32_LE = 128
50 CANDIDATE_UTF_32_BE = 256
51 CANDIDATE_CHARSET = 512
52
53 candidates = 1023
54
55 li = len(input)
56 if li>=1:
57
58 c = input[0]
59 if c != "\xef":
60 candidates &= ~CANDIDATE_UTF_8_SIG
61 if c != "\xff":
62 candidates &= ~(CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_16_AS_LE)
63 if c != "\xfe":
64 candidates &= ~CANDIDATE_UTF_16_AS_BE
65 if c != "@":
66 candidates &= ~(CANDIDATE_UTF_32_LE|CANDIDATE_UTF_16_LE|CANDIDATE_CHARSET)
67 if c != "\x00":
68 candidates &= ~(CANDIDATE_UTF_32_AS_BE|CANDIDATE_UTF_32_BE|CANDIDATE_UTF_16_BE)
69 if li>=2:
70
71 c = input[1]
72 if c != "\xbb":
73 candidates &= ~CANDIDATE_UTF_8_SIG
74 if c != "\xfe":
75 candidates &= ~(CANDIDATE_UTF_16_AS_LE|CANDIDATE_UTF_32_AS_LE)
76 if c != "\xff":
77 candidates &= ~CANDIDATE_UTF_16_AS_BE
78 if c != "\x00":
79 candidates &= ~(CANDIDATE_UTF_16_LE|CANDIDATE_UTF_32_AS_BE|CANDIDATE_UTF_32_LE|CANDIDATE_UTF_32_BE)
80 if c != "@":
81 candidates &= ~CANDIDATE_UTF_16_BE
82 if c != "c":
83 candidates &= ~CANDIDATE_CHARSET
84 if li>=3:
85
86 c = input[2]
87 if c != "\xbf":
88 candidates &= ~CANDIDATE_UTF_8_SIG
89 if c != "c":
90 candidates &= ~CANDIDATE_UTF_16_LE
91 if c != "\x00":
92 candidates &= ~(CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_32_LE|CANDIDATE_UTF_32_BE)
93 if c != "\xfe":
94 candidates &= ~CANDIDATE_UTF_32_AS_BE
95 if c != "h":
96 candidates &= ~CANDIDATE_CHARSET
97 if li>=4:
98
99 c = input[3]
100 if input[2:4] == "\x00\x00":
101 candidates &= ~CANDIDATE_UTF_16_AS_LE
102 if c != "\x00":
103 candidates &= ~(CANDIDATE_UTF_16_LE|CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_32_LE)
104 if c != "\xff":
105 candidates &= ~CANDIDATE_UTF_32_AS_BE
106 if c != "@":
107 candidates &= ~CANDIDATE_UTF_32_BE
108 if c != "a":
109 candidates &= ~CANDIDATE_CHARSET
110 if candidates == 0:
111 return "utf-8"
112 if not (candidates & (candidates-1)):
113 if candidates == CANDIDATE_UTF_8_SIG and li >= 3:
114 return "utf-8-sig"
115 elif candidates == CANDIDATE_UTF_16_AS_LE and li >= 2:
116 return "utf-16"
117 elif candidates == CANDIDATE_UTF_16_AS_BE and li >= 2:
118 return "utf-16"
119 elif candidates == CANDIDATE_UTF_16_LE and li >= 4:
120 return "utf-16-le"
121 elif candidates == CANDIDATE_UTF_16_BE and li >= 2:
122 return "utf-16-be"
123 elif candidates == CANDIDATE_UTF_32_AS_LE and li >= 4:
124 return "utf-32"
125 elif candidates == CANDIDATE_UTF_32_AS_BE and li >= 4:
126 return "utf-32"
127 elif candidates == CANDIDATE_UTF_32_LE and li >= 4:
128 return "utf-32-le"
129 elif candidates == CANDIDATE_UTF_32_BE and li >= 4:
130 return "utf-32-be"
131 elif candidates == CANDIDATE_CHARSET and li >= 4:
132 prefix = '@charset "'
133 if input[:len(prefix)] == prefix:
134 pos = input.find('"', len(prefix))
135 if pos >= 0:
136 return input[len(prefix):pos]
137
138
139 if final:
140 return "utf-8"
141 return None
142
143
145 """
146 Detect the encoding of the unicode string ``input``, which contains the
147 beginning of a CSS file. The encoding is detected from the charset rule
148 at the beginning of ``input``. If there is no charset rule, ``"utf-8"``
149 will be returned.
150
151 If the encoding can't be detected yet, ``None`` is returned. ``final``
152 specifies whether more data will be available in later calls or not. If
153 ``final`` is true, ``_detectencoding_unicode()`` will never return ``None``.
154 """
155 prefix = u'@charset "'
156 if input.startswith(prefix):
157 pos = input.find(u'"', len(prefix))
158 if pos >= 0:
159 return input[len(prefix):pos]
160 elif final or not prefix.startswith(input):
161
162
163 return "utf-8"
164 return None
165
166
168 """
169 Replace the name of the encoding in the charset rule at the beginning of
170 ``input`` with ``encoding``. If ``input`` doesn't starts with a charset
171 rule, ``input`` will be returned unmodified.
172
173 If the encoding can't be found yet, ``None`` is returned. ``final``
174 specifies whether more data will be available in later calls or not.
175 If ``final`` is true, ``_fixencoding()`` will never return ``None``.
176 """
177 prefix = u'@charset "'
178 if len(input) > len(prefix):
179 if input.startswith(prefix):
180 pos = input.find(u'"', len(prefix))
181 if pos >= 0:
182 if encoding.replace("_", "-").lower() == "utf-8-sig":
183 encoding = u"utf-8"
184 return prefix + encoding + input[pos:]
185
186 else:
187 return input
188 elif not prefix.startswith(input) or final:
189
190 return input
191 if final:
192 return input
193 return None
194
195
196 -def decode(input, errors="strict", encoding=None):
203
204
205 -def encode(input, errors="strict", encoding=None):
217
218
220
221 i = 0
222 for byte in bytes:
223 i = (i<<8) + ord(byte)
224 return i
225
226
228
229 v = []
230 while i:
231 v.insert(0, chr(i&0xff))
232 i >>= 8
233 return "".join(v)
234
235
236 if hasattr(codecs, "IncrementalDecoder"):
238 - def __init__(self, errors="strict", encoding=None):
247
249 for part in input:
250 result = self.decode(part, False)
251 if result:
252 yield result
253 result = self.decode("", True)
254 if result:
255 yield result
256
def decode(self, input, final=False):
    """
    Decode the next chunk ``input`` and return the decoded text.

    Until the encoding has been detected the raw input is collected in
    ``self.buffer`` and ``u""`` is returned. Once a decoder exists, decoded
    output is held back (again in ``self.buffer``) until ``_fixencoding()``
    can rewrite the charset rule; after that, chunks are passed straight to
    the underlying decoder. ``final`` must be true for the last chunk.

    Raises ``ValueError`` if the detected encoding is this codec itself.
    """
    if self.decoder is None:
        # Before detection ``self.buffer`` holds undecoded input
        input = self.buffer + input
        self.encoding = _detectencoding_str(input, final)
        if self.encoding is None:
            self.buffer = input  # retry with the complete input next time
            return u""  # no encoding determined yet, so no output
        # Compare case-insensitively: codec lookup normalises the case of
        # names, so e.g. "CSS" would otherwise select this codec again.
        if self.encoding.lower() == "css":
            raise ValueError("css not allowed as encoding name")
        self.buffer = ""  # the underlying decoder keeps its own buffer
        decoder = codecs.getincrementaldecoder(self.encoding)
        self.decoder = decoder(self._errors)
    if self.headerfixed:
        return self.decoder.decode(input, final)
    # The header isn't fixed yet, so ``self.buffer`` now holds decoded text
    output = self.buffer + self.decoder.decode(input, final)
    encoding = self.encoding
    if encoding.replace("_", "-").lower() == "utf-8-sig":
        encoding = "utf-8"
    newoutput = _fixencoding(output, unicode(encoding), final)
    if newoutput is None:
        # Decoding worked, but the charset rule isn't complete yet
        self.buffer = output
        return u""
    self.headerfixed = True
    # The held back text is returned now and never read again; drop it so
    # it is neither kept alive nor included in later state snapshots.
    self.buffer = u""
    return newoutput
289
291 codecs.IncrementalDecoder.reset(self)
292 self.decoder = None
293 self.buffer = ""
294 self.headerfixed = False
295
298
300
301 if self.decoder is not None:
302 self.decoder.errors = errors
303 self._errors = errors
304 errors = property(_geterrors, _seterrors)
305
307 if self.decoder is not None:
308 state = (self.encoding, self.buffer, self.headerfixed, True, self.decoder.getstate())
309 else:
310 state = (self.encoding, self.buffer, self.headerfixed, False, None)
311 return ("", _bytes2int(marshal.dumps(state)))
312
def setstate(self, state):
    """
    Restore the decoder from ``state``, a ``(buffered_input, number)`` pair
    in which ``number`` is a marshalled tuple ``(encoding, buffer,
    headerfixed, has_decoder, decoder_state)`` packed into an integer with
    ``_bytes2int()``. The first item of the pair is ignored.
    """
    # Undo the packing in reverse order: integer -> bytes -> tuple.
    # NOTE: ``marshal.loads()`` must only be given trusted data, i.e.
    # a state that was produced by this class.
    state = marshal.loads(_int2bytes(state[1]))
    self.encoding = state[0]
    self.buffer = state[1]
    self.headerfixed = state[2]
    # state[3] is a boolean flag, so test its truth value; comparing it
    # with ``None`` would always take the first branch.
    if state[3]:
        self.decoder = codecs.getincrementaldecoder(self.encoding)(self._errors)
        self.decoder.setstate(state[4])
    else:
        self.decoder = None
323
324
325 if hasattr(codecs, "IncrementalEncoder"):
327 - def __init__(self, errors="strict", encoding=None):
335
337 for part in input:
338 result = self.encode(part, False)
339 if result:
340 yield result
341 result = self.encode(u"", True)
342 if result:
343 yield result
344
def encode(self, input, final=False):
    """
    Encode the next chunk ``input`` and return the encoded bytes.

    If ``self.encoding`` is set, the charset rule at the start of the text
    is rewritten to name it; otherwise the encoding is taken from the
    charset rule. While neither is possible yet, the text is collected in
    ``self.buffer`` and ``""`` is returned. ``final`` must be true for the
    last chunk.

    Raises ``ValueError`` if the encoding is this codec itself.
    """
    if self.encoder is None:
        input = self.buffer + input
        if self.encoding is not None:
            # Replace the encoding in the charset rule with the given one
            encoding = self.encoding
            if encoding.replace("_", "-").lower() == "utf-8-sig":
                encoding = "utf-8"
            newinput = _fixencoding(input, unicode(encoding), final)
            if newinput is None:
                # The charset rule is incomplete => retry next time
                self.buffer = input
                return ""
            input = newinput
        else:
            # Use the encoding from the charset rule
            self.encoding = _detectencoding_unicode(input, final)
        if self.encoding is not None:
            # Compare case-insensitively: codec lookup normalises the case
            # of names, so e.g. "CSS" would otherwise select this codec again.
            if self.encoding.lower() == "css":
                raise ValueError("css not allowed as encoding name")
            info = codecs.lookup(self.encoding)
            if self.encoding.replace("_", "-").lower() == "utf-8-sig":
                input = _fixencoding(input, u"utf-8", True)
            self.encoder = info.incrementalencoder(self._errors)
            self.buffer = u""
        else:
            self.buffer = input
            return ""
    return self.encoder.encode(input, final)
374
379
382
384
385 if self.encoder is not None:
386 self.encoder.errors = errors
387 self._errors = errors
388 errors = property(_geterrors, _seterrors)
389
391 if self.encoder is not None:
392 state = (self.encoding, self.buffer, True, self.encoder.getstate())
393 else:
394 state = (self.encoding, self.buffer, False, None)
395 return _bytes2int(marshal.dumps(state))
396
def setstate(self, state):
    """
    Restore the encoder from ``state``, an integer holding a marshalled
    tuple ``(encoding, buffer, has_encoder, encoder_state)`` packed with
    ``_bytes2int()``.
    """
    # Undo the packing in reverse order: integer -> bytes -> tuple.
    # NOTE: ``marshal.loads()`` must only be given trusted data, i.e.
    # a state that was produced by this class.
    state = marshal.loads(_int2bytes(state))
    self.encoding = state[0]
    self.buffer = state[1]
    # state[2] is a boolean flag, so test its truth value; the state of
    # the underlying encoder is the fourth (last) item of the tuple.
    if state[2]:
        self.encoder = codecs.getincrementalencoder(self.encoding)(self._errors)
        self.encoder.setstate(state[3])
    else:
        self.encoder = None
406
407
409 - def __init__(self, stream, errors="strict", encoding=None, header=False):
415
def encode(self, input, errors='strict'):
    """
    Encode ``input`` and return ``(output, length)``.

    Until a stream writer for the final encoding exists, the text is
    collected in ``self.buffer`` and ``("", 0)`` is returned. On the call
    that creates the writer the reported length is that of the buffered
    text plus ``input``.

    Raises ``ValueError`` if the encoding is this codec itself.
    """
    if self.streamwriter is None:
        input = self.buffer + input
        li = len(input)
        if self.encoding is not None:
            # Replace the encoding in the charset rule with the given one
            encoding = self.encoding
            if encoding.replace("_", "-").lower() == "utf-8-sig":
                encoding = "utf-8"
            newinput = _fixencoding(input, unicode(encoding), False)
            if newinput is None:
                # The charset rule is incomplete => retry next time
                self.buffer = input
                return ("", 0)
            input = newinput
        else:
            # Use the encoding from the charset rule
            self.encoding = _detectencoding_unicode(input, False)
        if self.encoding is not None:
            # Compare case-insensitively: codec lookup normalises the case
            # of names, so e.g. "CSS" would otherwise select this codec again.
            if self.encoding.lower() == "css":
                raise ValueError("css not allowed as encoding name")
            self.streamwriter = codecs.getwriter(self.encoding)(self.stream, self._errors)
            if self.encoding.replace("_", "-").lower() == "utf-8-sig":
                input = _fixencoding(input, u"utf-8", True)
            self.buffer = u""
        else:
            self.buffer = input
            return ("", 0)
    else:
        li = len(input)
    return (self.streamwriter.encode(input, errors)[0], li)
446
449
451
452 if self.streamwriter is not None:
453 self.streamwriter.errors = errors
454 self._errors = errors
455 errors = property(_geterrors, _seterrors)
456
457
459 - def __init__(self, stream, errors="strict", encoding=None):
464
def decode(self, input, errors='strict'):
    """
    Decode ``input`` and return ``(output, consumed)``.

    As long as the encoding can't be detected, or the charset rule in the
    decoded text is still incomplete, ``(u"", 0)`` is returned, i.e.
    nothing is consumed. Once the rule could be rewritten, the stream
    reader that was used is kept and handles all further calls.

    Raises ``ValueError`` if the encoding is this codec itself.
    """
    if self.streamreader is None:
        if self.encoding is None:
            self.encoding = _detectencoding_str(input, False)
        if self.encoding is None:
            return (u"", 0)  # no encoding determined yet, so no output
        # Compare case-insensitively: codec lookup normalises the case of
        # names, so e.g. "CSS" would otherwise select this codec again.
        if self.encoding.lower() == "css":
            raise ValueError("css not allowed as encoding name")
        streamreader = codecs.getreader(self.encoding)
        streamreader = streamreader(self.stream, self._errors)
        (output, consumed) = streamreader.decode(input, errors)
        encoding = self.encoding
        if encoding.replace("_", "-").lower() == "utf-8-sig":
            encoding = "utf-8"
        newoutput = _fixencoding(output, unicode(encoding), False)
        if newoutput is not None:
            self.streamreader = streamreader
            return (newoutput, consumed)
        # The reader isn't stored; a new one is created on the next call
        return (u"", 0)
    return self.streamreader.decode(input, errors)
485
488
490
491 if self.streamreader is not None:
492 self.streamreader.errors = errors
493 self._errors = errors
494 errors = property(_geterrors, _seterrors)
495
496
497 if hasattr(codecs, "CodecInfo"):
498
510 else:
511
513 return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input))
514
def utf8sig_decode(input, errors='strict'):
    """
    Decode the UTF-8 byte string ``input``, skipping a leading UTF-8 BOM if
    there is one. Returns ``(output, consumed)``, where ``consumed``
    includes the three bytes of a skipped BOM.
    """
    prefix = 0
    if input[:3] == codecs.BOM_UTF8:
        input = input[3:]
        prefix = 3
    (output, consumed) = codecs.utf_8_decode(input, errors, True)
    return (output, consumed + prefix)
522
530
531 - def encode(self, input, errors='strict'):
534
542
def decode(self, input, errors='strict'):
    """
    Decode ``input``, dropping a leading UTF-8 BOM.

    While fewer than three bytes are available and they could still be the
    start of a BOM, ``(u"", 0)`` is returned so that the caller retries with
    more data. Otherwise this method is replaced on the instance by
    ``codecs.utf_8_decode`` and the current input is decoded with
    ``utf8sig_decode()``.
    """
    undecided = len(input) < 3 and codecs.BOM_UTF8.startswith(input)
    if not undecided:
        # The BOM question is settled with this call, so all later calls
        # can go to the plain UTF-8 decoder.
        self.decode = codecs.utf_8_decode
        return utf8sig_decode(input, errors)
    # Not enough data to decide whether this is a BOM => try again later
    return (u"", 0)
550
558
559
560 codecs.register(search_function)
561
562
563
564
def cssescape(exc):
    """
    Error handler that replaces each unencodable character with a CSS
    escape: a backslash followed by the code point as six hexadecimal
    digits. Returns ``(replacement, position to continue at)``.

    Raises ``TypeError`` if ``exc`` is not a ``UnicodeEncodeError``.
    """
    if not isinstance(exc, UnicodeEncodeError):
        # Wrap ``exc`` in a tuple so that a tuple argument is formatted as a
        # single value instead of being unpacked by ``%``.
        raise TypeError("don't know how to handle %r" % (exc,))
    unencodable = exc.object[exc.start:exc.end]
    return (u"".join(u"\\%06x" % ord(c) for c in unencodable), exc.end)
569
570 codecs.register_error("cssescape", cssescape)
571