Package cssutils :: Module codec
[hide private]
[frames] | no frames]

Source Code for Module cssutils.codec

  1  #!/usr/bin/env python 
  2   
  3  """Python codec for CSS.""" 
  4  __docformat__ = 'restructuredtext' 
  5  __author__ = 'Walter Doerwald' 
  6  __version__ = '$Id: util.py 1114 2008-03-05 13:22:59Z cthedot $' 
  7   
  8  import codecs, marshal 
  9   
 10   
 11  # We're using bits to store all possible candidate encodings (or variants, i.e. 
 12  # we have two bits for the variants of UTF-16 and two for the 
 13  # variants of UTF-32). 
 14  # 
 15  # Prefixes for various CSS encodings 
 16  # UTF-8-SIG   xEF  xBB  xBF 
 17  # UTF-16 (LE) xFF  xFE ~x00|~x00 
 18  # UTF-16 (BE) xFE  xFF 
 19  # UTF-16-LE    @   x00   @   x00 
 20  # UTF-16-BE   x00   @ 
 21  # UTF-32 (LE) xFF  xFE  x00  x00 
 22  # UTF-32 (BE) x00  x00  xFE  xFF 
 23  # UTF-32-LE    @   x00  x00  x00 
 24  # UTF-32-BE   x00  x00  x00   @ 
 25  # CHARSET      @    c    h    a  ... 
 26   
 27   
 28   
29 -def _detectencoding_str(input, final=False):
30 """ 31 Detect the encoding of the byte string ``input``, which contains the 32 beginning of a CSS file. To detect the encoding the first few bytes are 33 used (or if ``input`` is ASCII compatible and starts with a charset rule 34 the encoding name from the rule). 35 36 If the encoding can't be detected yet, ``None`` is returned. ``final`` 37 specifies whether more data is available in later calls or not. If ``final`` 38 is true, ``_detectencoding_str()`` will never return ``None``. 39 """ 40 41 # A bit for every candidate 42 CANDIDATE_UTF_8_SIG = 1 43 CANDIDATE_UTF_16_AS_LE = 2 44 CANDIDATE_UTF_16_AS_BE = 4 45 CANDIDATE_UTF_16_LE = 8 46 CANDIDATE_UTF_16_BE = 16 47 CANDIDATE_UTF_32_AS_LE = 32 48 CANDIDATE_UTF_32_AS_BE = 64 49 CANDIDATE_UTF_32_LE = 128 50 CANDIDATE_UTF_32_BE = 256 51 CANDIDATE_CHARSET = 512 52 53 candidates = 1023 # all candidates 54 55 li = len(input) 56 if li>=1: 57 # Check first byte 58 c = input[0] 59 if c != "\xef": 60 candidates &= ~CANDIDATE_UTF_8_SIG 61 if c != "\xff": 62 candidates &= ~(CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_16_AS_LE) 63 if c != "\xfe": 64 candidates &= ~CANDIDATE_UTF_16_AS_BE 65 if c != "@": 66 candidates &= ~(CANDIDATE_UTF_32_LE|CANDIDATE_UTF_16_LE|CANDIDATE_CHARSET) 67 if c != "\x00": 68 candidates &= ~(CANDIDATE_UTF_32_AS_BE|CANDIDATE_UTF_32_BE|CANDIDATE_UTF_16_BE) 69 if li>=2: 70 # Check second byte 71 c = input[1] 72 if c != "\xbb": 73 candidates &= ~CANDIDATE_UTF_8_SIG 74 if c != "\xfe": 75 candidates &= ~(CANDIDATE_UTF_16_AS_LE|CANDIDATE_UTF_32_AS_LE) 76 if c != "\xff": 77 candidates &= ~CANDIDATE_UTF_16_AS_BE 78 if c != "\x00": 79 candidates &= ~(CANDIDATE_UTF_16_LE|CANDIDATE_UTF_32_AS_BE|CANDIDATE_UTF_32_LE|CANDIDATE_UTF_32_BE) 80 if c != "@": 81 candidates &= ~CANDIDATE_UTF_16_BE 82 if c != "c": 83 candidates &= ~CANDIDATE_CHARSET 84 if li>=3: 85 # Check third byte 86 c = input[2] 87 if c != "\xbf": 88 candidates &= ~CANDIDATE_UTF_8_SIG 89 if c != "c": 90 candidates &= ~CANDIDATE_UTF_16_LE 91 if c != "\x00": 92 
candidates &= ~(CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_32_LE|CANDIDATE_UTF_32_BE) 93 if c != "\xfe": 94 candidates &= ~CANDIDATE_UTF_32_AS_BE 95 if c != "h": 96 candidates &= ~CANDIDATE_CHARSET 97 if li>=4: 98 # Check fourth byte 99 c = input[3] 100 if input[2:4] == "\x00\x00": 101 candidates &= ~CANDIDATE_UTF_16_AS_LE 102 if c != "\x00": 103 candidates &= ~(CANDIDATE_UTF_16_LE|CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_32_LE) 104 if c != "\xff": 105 candidates &= ~CANDIDATE_UTF_32_AS_BE 106 if c != "@": 107 candidates &= ~CANDIDATE_UTF_32_BE 108 if c != "a": 109 candidates &= ~CANDIDATE_CHARSET 110 if candidates == 0: 111 return "utf-8" 112 if not (candidates & (candidates-1)): # only one candidate remaining 113 if candidates == CANDIDATE_UTF_8_SIG and li >= 3: 114 return "utf-8-sig" 115 elif candidates == CANDIDATE_UTF_16_AS_LE and li >= 2: 116 return "utf-16" 117 elif candidates == CANDIDATE_UTF_16_AS_BE and li >= 2: 118 return "utf-16" 119 elif candidates == CANDIDATE_UTF_16_LE and li >= 4: 120 return "utf-16-le" 121 elif candidates == CANDIDATE_UTF_16_BE and li >= 2: 122 return "utf-16-be" 123 elif candidates == CANDIDATE_UTF_32_AS_LE and li >= 4: 124 return "utf-32" 125 elif candidates == CANDIDATE_UTF_32_AS_BE and li >= 4: 126 return "utf-32" 127 elif candidates == CANDIDATE_UTF_32_LE and li >= 4: 128 return "utf-32-le" 129 elif candidates == CANDIDATE_UTF_32_BE and li >= 4: 130 return "utf-32-be" 131 elif candidates == CANDIDATE_CHARSET and li >= 4: 132 prefix = '@charset "' 133 if input[:len(prefix)] == prefix: 134 pos = input.find('"', len(prefix)) 135 if pos >= 0: 136 return input[len(prefix):pos] 137 # if this is the last call, and we haven't determined an encoding yet, 138 # we default to UTF-8 139 if final: 140 return "utf-8" 141 return None # dont' know yet
142 143
144 -def _detectencoding_unicode(input, final=False):
145 """ 146 Detect the encoding of the unicode string ``input``, which contains the 147 beginning of a CSS file. The encoding is detected from the charset rule 148 at the beginning of ``input``. If there is no charset rule, ``"utf-8"`` 149 will be returned. 150 151 If the encoding can't be detected yet, ``None`` is returned. ``final`` 152 specifies whether more data will be available in later calls or not. If 153 ``final`` is true, ``_detectencoding_unicode()`` will never return ``None``. 154 """ 155 prefix = u'@charset "' 156 if input.startswith(prefix): 157 pos = input.find(u'"', len(prefix)) 158 if pos >= 0: 159 return input[len(prefix):pos] 160 elif final or not prefix.startswith(input): 161 # if this is the last call, and we haven't determined an encoding yet, 162 # (or the string definitely doesn't start with prefix) we default to UTF-8 163 return "utf-8" 164 return None # don't know yet
165 166
167 -def _fixencoding(input, encoding, final=False):
168 """ 169 Replace the name of the encoding in the charset rule at the beginning of 170 ``input`` with ``encoding``. If ``input`` doesn't starts with a charset 171 rule, ``input`` will be returned unmodified. 172 173 If the encoding can't be found yet, ``None`` is returned. ``final`` 174 specifies whether more data will be available in later calls or not. 175 If ``final`` is true, ``_fixencoding()`` will never return ``None``. 176 """ 177 prefix = u'@charset "' 178 if len(input) > len(prefix): 179 if input.startswith(prefix): 180 pos = input.find(u'"', len(prefix)) 181 if pos >= 0: 182 if encoding.replace("_", "-").lower() == "utf-8-sig": 183 encoding = u"utf-8" 184 return prefix + encoding + input[pos:] 185 # we haven't seen the end of the encoding name yet => fall through 186 else: 187 return input # doesn't start with prefix, so nothing to fix 188 elif not prefix.startswith(input) or final: 189 # can't turn out to be a @charset rule later (or there is no "later") 190 return input 191 if final: 192 return input 193 return None # don't know yet
194 195
def decode(input, errors="strict", encoding=None):
    """
    Decode the byte string ``input`` in one go and return a tuple
    ``(text, consumed)``.

    Without an explicit ``encoding`` the encoding is taken from
    ``_detectencoding_str()``. The name in a leading charset rule of the
    decoded text is replaced via ``_fixencoding()``. A ``ValueError`` is
    raised if the encoding name is ``"css"``.
    """
    if encoding is None:
        encoding = _detectencoding_str(input, True)
    if encoding == "css":
        raise ValueError("css not allowed as encoding name")
    decoder = codecs.getdecoder(encoding)
    (text, consumed) = decoder(input, errors)
    text = _fixencoding(text, unicode(encoding), True)
    return (text, consumed)
203 204
def encode(input, errors="strict", encoding=None):
    """
    Encode the unicode string ``input`` in one go and return a tuple
    ``(bytes, consumed)`` where ``consumed`` is the length of the original
    ``input``.

    With an explicit ``encoding`` the name in a leading charset rule is
    replaced with it; otherwise the encoding is taken from
    ``_detectencoding_unicode()`` and the rule is only rewritten (to
    ``utf-8``) if the detected name is ``utf-8-sig``. A ``ValueError`` is
    raised if the encoding name is ``"css"``.
    """
    consumed = len(input)
    if encoding is not None:
        input = _fixencoding(input, unicode(encoding), True)
    else:
        encoding = _detectencoding_unicode(input, True)
        if encoding.replace("_", "-").lower() == "utf-8-sig":
            input = _fixencoding(input, u"utf-8", True)
    if encoding == "css":
        raise ValueError("css not allowed as encoding name")
    encoder = codecs.getencoder(encoding)
    (output, _) = encoder(input, errors)
    return (output, consumed)
217 218
219 -def _bytes2int(bytes):
220 # Helper: convert an 8 bit string into an ``int``. 221 i = 0 222 for byte in bytes: 223 i = (i<<8) + ord(byte) 224 return i
225 226
227 -def _int2bytes(i):
228 # Helper: convert an ``int`` into an 8-bit string. 229 v = [] 230 while i: 231 v.insert(0, chr(i&0xff)) 232 i >>= 8 233 return "".join(v)
if hasattr(codecs, "IncrementalDecoder"):
    class IncrementalDecoder(codecs.IncrementalDecoder):
        """
        Incremental decoder for CSS byte input.

        Input is buffered until ``_detectencoding_str()`` reports an
        encoding; decoding is then delegated to the incremental decoder of
        that encoding. The decoded output is held back until
        ``_fixencoding()`` has rewritten (or ruled out) a leading charset
        rule. While no decoder has been created, ``self.encoding`` is
        overwritten with the result of the detection on every call.
        """

        def __init__(self, errors="strict", encoding=None):
            self.decoder = None
            self.encoding = encoding
            codecs.IncrementalDecoder.__init__(self, errors)
            # Store ``errors`` somewhere else,
            # because we have to hide it in a property
            self._errors = errors
            # byte string until the encoding is known, decoded text after
            self.buffer = ""
            self.headerfixed = False

        def iterdecode(self, input):
            """
            Decode the byte strings produced by the iterable ``input`` and
            yield every non-empty piece of output, finishing with a final
            ``decode()`` call.
            """
            for part in input:
                result = self.decode(part, False)
                if result:
                    yield result
            result = self.decode("", True)
            if result:
                yield result

        def decode(self, input, final=False):
            """
            Decode ``input`` and return the text that is ready; ``u""`` is
            returned while the encoding or the charset rule is still
            undecided. Raises ``ValueError`` if the detected encoding name
            is ``"css"``.
            """
            # We're doing basically the same as a ``BufferedIncrementalDecoder``,
            # but since the buffer is only relevant until the encoding has been
            # detected (in which case the buffer of the underlying codec might
            # kick in), we're implementing buffering ourselves to avoid some
            # overhead.
            if self.decoder is None:
                input = self.buffer + input
                self.encoding = _detectencoding_str(input, final)
                if self.encoding is None:
                    self.buffer = input # retry the complete input on the next call
                    return u"" # no encoding determined yet, so no output
                if self.encoding == "css":
                    raise ValueError("css not allowed as encoding name")
                self.buffer = "" # drop buffer, as the decoder might keep its own
                decoder = codecs.getincrementaldecoder(self.encoding)
                self.decoder = decoder(self._errors)
            if self.headerfixed:
                return self.decoder.decode(input, final)
            # If we haven't fixed the header yet,
            # the content of ``self.buffer`` is a ``unicode`` object
            output = self.buffer + self.decoder.decode(input, final)
            encoding = self.encoding
            if encoding.replace("_", "-").lower() == "utf-8-sig":
                encoding = "utf-8"
            newoutput = _fixencoding(output, unicode(encoding), final)
            if newoutput is None:
                # retry fixing the @charset rule (but keep the decoded stuff)
                self.buffer = output
                return u""
            self.headerfixed = True
            return newoutput

        def reset(self):
            """Forget the detected encoding's decoder and any buffered data."""
            codecs.IncrementalDecoder.reset(self)
            self.decoder = None
            self.buffer = ""
            self.headerfixed = False

        def _geterrors(self):
            return self._errors

        def _seterrors(self, errors):
            # Setting ``errors`` must be done on the real decoder too
            if self.decoder is not None:
                self.decoder.errors = errors
            self._errors = errors
        errors = property(_geterrors, _seterrors)

        def getstate(self):
            """
            Return the decoder state as ``("", number)``, where ``number``
            is the marshalled tuple ``(encoding, buffer, headerfixed,
            has_decoder, decoder_state)`` packed into an ``int``.
            """
            if self.decoder is not None:
                state = (self.encoding, self.buffer, self.headerfixed, True, self.decoder.getstate())
            else:
                state = (self.encoding, self.buffer, self.headerfixed, False, None)
            return ("", _bytes2int(marshal.dumps(state)))

        def setstate(self, state):
            """Restore a state that was produced by ``getstate()``."""
            # Undo ``getstate()`` in reverse order: first turn the number
            # back into the marshalled bytes, then unmarshal the tuple.
            state = marshal.loads(_int2bytes(state[1])) # ignore buffered input
            self.encoding = state[0]
            self.buffer = state[1]
            self.headerfixed = state[2]
            # ``state[3]`` is the "decoder exists" flag (``True``/``False``)
            if state[3]:
                self.decoder = codecs.getincrementaldecoder(self.encoding)(self._errors)
                self.decoder.setstate(state[4])
            else:
                self.decoder = None
if hasattr(codecs, "IncrementalEncoder"):
    class IncrementalEncoder(codecs.IncrementalEncoder):
        """
        Incremental encoder for CSS text.

        If an ``encoding`` was passed to the constructor, the name in a
        leading charset rule is replaced with it via ``_fixencoding()``;
        otherwise the encoding is taken from the charset rule via
        ``_detectencoding_unicode()``. Input is buffered until that
        decision can be made; encoding is then delegated to the
        incremental encoder of the chosen encoding.
        """

        def __init__(self, errors="strict", encoding=None):
            self.encoder = None
            self.encoding = encoding
            codecs.IncrementalEncoder.__init__(self, errors)
            # Store ``errors`` somewhere else,
            # because we have to hide it in a property
            self._errors = errors
            self.buffer = u""

        def iterencode(self, input):
            """
            Encode the unicode strings produced by the iterable ``input``
            and yield every non-empty piece of output, finishing with a
            final ``encode()`` call.
            """
            for part in input:
                result = self.encode(part, False)
                if result:
                    yield result
            result = self.encode(u"", True)
            if result:
                yield result

        def encode(self, input, final=False):
            """
            Encode ``input`` and return the bytes that are ready; ``""`` is
            returned while the charset rule is still incomplete. Raises
            ``ValueError`` if the encoding name is ``"css"``.
            """
            if self.encoder is None:
                input = self.buffer + input
                if self.encoding is not None:
                    # Replace encoding in the @charset rule with the specified one
                    encoding = self.encoding
                    if encoding.replace("_", "-").lower() == "utf-8-sig":
                        encoding = "utf-8"
                    newinput = _fixencoding(input, unicode(encoding), final)
                    if newinput is None: # @charset rule incomplete => Retry next time
                        self.buffer = input
                        return ""
                    input = newinput
                else:
                    # Use encoding from the @charset declaration
                    self.encoding = _detectencoding_unicode(input, final)
                if self.encoding is not None:
                    if self.encoding == "css":
                        raise ValueError("css not allowed as encoding name")
                    info = codecs.lookup(self.encoding)
                    if self.encoding.replace("_", "-").lower() == "utf-8-sig":
                        input = _fixencoding(input, u"utf-8", True)
                    self.encoder = info.incrementalencoder(self._errors)
                    self.buffer = u""
                else:
                    self.buffer = input
                    return ""
            return self.encoder.encode(input, final)

        def reset(self):
            """Forget the chosen encoding's encoder and any buffered data."""
            codecs.IncrementalEncoder.reset(self)
            self.encoder = None
            self.buffer = u""

        def _geterrors(self):
            return self._errors

        def _seterrors(self, errors):
            # Setting ``errors`` must be done on the real encoder too
            if self.encoder is not None:
                self.encoder.errors = errors
            self._errors = errors
        errors = property(_geterrors, _seterrors)

        def getstate(self):
            """
            Return the encoder state as a number: the marshalled tuple
            ``(encoding, buffer, has_encoder, encoder_state)`` packed into
            an ``int``.
            """
            if self.encoder is not None:
                state = (self.encoding, self.buffer, True, self.encoder.getstate())
            else:
                state = (self.encoding, self.buffer, False, None)
            return _bytes2int(marshal.dumps(state))

        def setstate(self, state):
            """Restore a state that was produced by ``getstate()``."""
            # Undo ``getstate()`` in reverse order: first turn the number
            # back into the marshalled bytes, then unmarshal the tuple.
            state = marshal.loads(_int2bytes(state))
            self.encoding = state[0]
            self.buffer = state[1]
            # ``state[2]`` is the "encoder exists" flag (``True``/``False``)
            # and ``state[3]`` the state of that encoder
            if state[2]:
                self.encoder = codecs.getincrementalencoder(self.encoding)(self._errors)
                self.encoder.setstate(state[3])
            else:
                self.encoder = None
406 407
class StreamWriter(codecs.StreamWriter):
    """
    Stream writer for CSS text.

    Text is buffered until the output encoding is settled (either the one
    passed in, or the one named by the charset rule); from then on all
    encoding is handed to the stream writer of that encoding.
    """

    def __init__(self, stream, errors="strict", encoding=None, header=False):
        # ``header`` is accepted but not used by this class
        codecs.StreamWriter.__init__(self, stream, errors)
        self.streamwriter = None
        self.encoding = encoding
        self._errors = errors
        self.buffer = u""

    def encode(self, input, errors='strict'):
        """
        Encode ``input`` and return ``(bytes, consumed)``; ``("", 0)`` is
        returned while the charset rule is still incomplete. Raises
        ``ValueError`` if the encoding name is ``"css"``.
        """
        if self.streamwriter is not None:
            # encoding already settled: plain delegation
            return (self.streamwriter.encode(input, errors)[0], len(input))

        pending = self.buffer + input
        consumed = len(pending)
        if self.encoding is None:
            # Use encoding from the @charset declaration
            self.encoding = _detectencoding_unicode(pending, False)
            if self.encoding is None:
                self.buffer = pending
                return ("", 0)
        else:
            # Replace encoding in the @charset rule with the specified one
            name = self.encoding
            if name.replace("_", "-").lower() == "utf-8-sig":
                name = "utf-8"
            fixed = _fixencoding(pending, unicode(name), False)
            if fixed is None: # @charset rule incomplete => Retry next time
                self.buffer = pending
                return ("", 0)
            pending = fixed

        if self.encoding == "css":
            raise ValueError("css not allowed as encoding name")
        writerclass = codecs.getwriter(self.encoding)
        self.streamwriter = writerclass(self.stream, self._errors)
        if self.encoding.replace("_", "-").lower() == "utf-8-sig":
            pending = _fixencoding(pending, u"utf-8", True)
        self.buffer = u""
        return (self.streamwriter.encode(pending, errors)[0], consumed)

    def _geterrors(self):
        return self._errors

    def _seterrors(self, errors):
        # Keep the real streamwriter (if there is one) in sync
        if self.streamwriter is not None:
            self.streamwriter.errors = errors
        self._errors = errors
    errors = property(_geterrors, _seterrors)
456 457
class StreamReader(codecs.StreamReader):
    """
    Stream reader for CSS byte input.

    Until the encoding is settled and a leading charset rule has been
    rewritten (or ruled out), ``decode()`` reports that nothing was
    consumed; afterwards all decoding is handed to the stream reader of
    the chosen encoding.
    """

    def __init__(self, stream, errors="strict", encoding=None):
        codecs.StreamReader.__init__(self, stream, errors)
        self.streamreader = None
        self.encoding = encoding
        self._errors = errors

    def decode(self, input, errors='strict'):
        """
        Decode ``input`` and return ``(text, consumed)``; ``(u"", 0)`` is
        returned while the encoding or the charset rule is still
        undecided. Raises ``ValueError`` if the encoding name is ``"css"``.
        """
        if self.streamreader is not None:
            # encoding already settled: plain delegation
            return self.streamreader.decode(input, errors)

        if self.encoding is None:
            self.encoding = _detectencoding_str(input, False)
            if self.encoding is None:
                return (u"", 0) # no encoding determined yet, so no output
        if self.encoding == "css":
            raise ValueError("css not allowed as encoding name")

        readerclass = codecs.getreader(self.encoding)
        reader = readerclass(self.stream, self._errors)
        (output, consumed) = reader.decode(input, errors)
        name = self.encoding
        if name.replace("_", "-").lower() == "utf-8-sig":
            name = "utf-8"
        fixed = _fixencoding(output, unicode(name), False)
        if fixed is None:
            return (u"", 0) # we will create a new streamreader on the next call
        # keep the reader only once the header has been dealt with
        self.streamreader = reader
        return (fixed, consumed)

    def _geterrors(self):
        return self._errors

    def _seterrors(self, errors):
        # Keep the real streamreader (if there is one) in sync
        if self.streamreader is not None:
            self.streamreader.errors = errors
        self._errors = errors
    errors = property(_geterrors, _seterrors)
if hasattr(codecs, "CodecInfo"):
    # We're running on Python 2.5 or better
    def search_function(name):
        """Codec search function: return the codec info for ``"css"``."""
        if name != "css":
            return None
        return codecs.CodecInfo(
            name="css",
            encode=encode,
            decode=decode,
            incrementalencoder=IncrementalEncoder,
            incrementaldecoder=IncrementalDecoder,
            streamwriter=StreamWriter,
            streamreader=StreamReader,
        )
else:
    # If we're running on Python 2.4, define the utf-8-sig codec here
    def utf8sig_encode(input, errors='strict'):
        """Encode ``input`` as UTF-8 with the UTF-8 BOM put in front."""
        encoded = codecs.utf_8_encode(input, errors)[0]
        return (codecs.BOM_UTF8 + encoded, len(input))

    def utf8sig_decode(input, errors='strict'):
        """Decode ``input`` as UTF-8, dropping a BOM at the start."""
        skipped = 0
        if input[:3] == codecs.BOM_UTF8:
            skipped = 3
            input = input[3:]
        (output, consumed) = codecs.utf_8_decode(input, errors, True)
        # the BOM bytes count as consumed too
        return (output, skipped + consumed)

    class UTF8SigStreamWriter(codecs.StreamWriter):
        """Stream writer that puts the BOM in front of the first output only."""

        def reset(self):
            codecs.StreamWriter.reset(self)
            # Remove the instance level override installed by ``encode()``
            # (if any), so that the next write emits the BOM again
            try:
                del self.encode
            except AttributeError:
                pass

        def encode(self, input, errors='strict'):
            # From now on encode without BOM
            self.encode = codecs.utf_8_encode
            return utf8sig_encode(input, errors)

    class UTF8SigStreamReader(codecs.StreamReader):
        """Stream reader that drops the BOM at the start of the input."""

        def reset(self):
            codecs.StreamReader.reset(self)
            # Remove the instance level override installed by ``decode()``
            # (if any), so that a BOM is looked for again
            try:
                del self.decode
            except AttributeError:
                pass

        def decode(self, input, errors='strict'):
            if len(input) < 3 and codecs.BOM_UTF8.startswith(input):
                # not enough data to decide if this is a BOM
                # => try again on the next call
                return (u"", 0)
            # From now on decode without looking for a BOM
            self.decode = codecs.utf_8_decode
            return utf8sig_decode(input, errors)

    def search_function(name):
        """Codec search function for ``"css"`` and ``"utf_8_sig"``."""
        import encodings
        normalized = encodings.normalize_encoding(name)
        if normalized == "css":
            return (encode, decode, StreamReader, StreamWriter)
        if normalized == "utf_8_sig":
            return (utf8sig_encode, utf8sig_decode, UTF8SigStreamReader, UTF8SigStreamWriter)
        return None


codecs.register(search_function)


# Error handler for CSS escaping

def cssescape(exc):
    """
    Codec error handler that replaces every unencodable character with a
    backslash followed by its code point as six hex digits.

    Returns the replacement together with the position at which encoding
    should continue. Raises ``TypeError`` for anything that is not a
    ``UnicodeEncodeError``.
    """
    if isinstance(exc, UnicodeEncodeError):
        failed = exc.object[exc.start:exc.end]
        escapes = [u"\\%06x" % ord(char) for char in failed]
        return (u"".join(escapes), exc.end)
    raise TypeError("don't know how to handle %r" % exc)


codecs.register_error("cssescape", cssescape)
