# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit

This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
__license__ = "MIT"

from pdb import set_trace
import codecs
from html.entities import codepoint2name
import re
import logging
import string

# Import a library to autodetect character encodings.
chardet_type = None
try:
    # First try the fast C implementation.
    #  PyPI package: cchardet
    import cchardet

    def chardet_dammit(s):
        """Return the encoding name guessed by cchardet (may be None)."""
        return cchardet.detect(s)['encoding']
except ImportError:
    try:
        # Fall back to the pure Python implementation
        #  Debian package: python-chardet
        #  PyPI package: chardet
        import chardet

        def chardet_dammit(s):
            """Return the encoding name guessed by chardet (may be None)."""
            return chardet.detect(s)['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1
    except ImportError:
        # No chardet available.
        def chardet_dammit(s):
            """No detector is installed, so there is never a guess."""
            return None

# Available from http://cjkpython.i18n.org/.
try:
    import iconv_codec
except ImportError:
    pass

# Both patterns are compiled against bytes, because they are applied to
# markup that has not been decoded yet.
xml_encoding_re = re.compile(
    r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
html_meta_re = re.compile(
    r'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)


class EntitySubstitution(object):

    """Substitute XML or HTML entities for the corresponding characters."""

    def _populate_class_variables():
        """Build the HTML entity lookup tables from codepoint2name.

        Returns a (character -> entity name) dict, an
        (entity name -> character) dict, and a compiled character-class
        regex matching every character that has a named entity (except
        the double quote).
        """
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []
        for codepoint, name in codepoint2name.items():
            character = chr(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
                # is handled elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to turn &quot; into the quotation mark.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
        }

    # Angle brackets, plus any ampersand that does not begin something
    # that looks like a numeric or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|"
                                           r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           r")")

    AMPERSAND_OR_BRACKET = re.compile(r"([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate HTML entity for a character that has one."""
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;

        :param value: The attribute value, as a string.
        :return: The value wrapped in the chosen quote character.
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)


class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    3. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    4. UTF-8.

    5. Windows-1252.
    """
    def __init__(self, markup, override_encodings=None, is_html=False,
                 exclude_encodings=None):
        """Set up the detector and strip any byte-order mark.

        :param markup: The document, as bytes (or str, which is passed
         through untouched).
        :param override_encodings: Encodings to try before anything else.
        :param is_html: If True, also look for a <meta> charset
         declaration.
        :param exclude_encodings: Encodings that must never be suggested
         (compared case-insensitively).
        """
        self.override_encodings = override_encodings or []
        exclude_encodings = exclude_encodings or []
        self.exclude_encodings = {x.lower() for x in exclude_encodings}
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Decide whether `encoding` should be yielded.

        An encoding is usable if it is not None, not excluded, and has
        not been seen before. A usable encoding is recorded in `tried`
        (lowercased) as a side effect.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding in self.exclude_encodings:
                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup."""
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: The document, as bytes or str.
        :return: A 2-tuple (data without the BOM, encoding name or None).
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        # The UTF-16 checks must reject a following pair of NUL bytes,
        # because FF FE 00 00 is the UTF-32LE mark. The comparison has
        # to be against bytes: comparing bytes with a str is always
        # unequal, which used to misreport UTF-32LE as UTF-16LE.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: The document, as bytes.
        :param is_html: If True, fall back to looking for a <meta>
         charset when no XML declaration is found.
        :param search_entire_document: If True, search the whole
         document instead of only a prefix of it.
        :return: The declared encoding, lowercased, or None.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0].decode(
                'ascii', 'replace')
        if declared_encoding:
            return declared_encoding.lower()
        return None


class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
        ]

    def __init__(self, markup, override_encodings=None,
                 smart_quotes_to=None, is_html=False, exclude_encodings=None):
        """Detect the encoding of `markup` and decode it.

        After construction, `unicode_markup` holds the decoded document
        (or None if nothing worked), `original_encoding` the codec that
        was used, and `contains_replacement_characters` whether the
        document could only be decoded with errors="replace".

        :param markup: The document, as bytes or str.
        :param override_encodings: Encodings to try first.
        :param smart_quotes_to: None, 'ascii', 'xml' or 'html'; how to
         rewrite Microsoft smart quotes and friends before decoding.
        :param is_html: If True, the document is treated as HTML when
         looking for a declared encoding.
        :param exclude_encodings: Encodings that must never be tried.
        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html

        self.detector = EncodingDetector(
            markup, override_encodings, is_html, exclude_encodings)

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, str) or markup == '':
            self.markup = markup
            self.unicode_markup = str(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        # Only retry when every strict attempt failed. An empty string
        # is a successful decoding of an empty document, not a failure,
        # and must not trigger the replacement-character warning.
        if u is None:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.

            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    logging.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER.")
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if not u:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        :param match: A bytes regex match whose group 1 is one byte.
        :return: The replacement, as bytes.
        """
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
        else:
            sub = self.MS_CHARS.get(orig)
            if isinstance(sub, tuple):
                # Tuples are (HTML entity name, hex code point).
                if self.smart_quotes_to == 'xml':
                    sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
                else:
                    sub = '&'.encode() + sub[0].encode() + ';'.encode()
            else:
                sub = sub.encode()
        return sub

    def _convert_from(self, proposed, errors="strict"):
        """Try to decode self.markup using the proposed encoding.

        Each (codec, errors) pair is only attempted once. On success,
        self.markup is replaced by the decoded string and
        self.original_encoding is set.

        :return: The decoded string, or None if the attempt was
         skipped or failed.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            smart_quotes_re = b"([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)

        try:
            #print "Trying to convert document to %s (errors=%s)" % (
            #    proposed, errors)
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception:
            # Deliberately broad: any failure just means "this encoding
            # didn't work, try the next one".
            #print "That didn't work!"
            return None
        #print "Correct encoding: %s" % proposed
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''
        return str(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """The encoding declared in the document, for HTML documents only."""
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Map a charset name to a lowercased name to try as a codec.

        Tries the alias table, then the name with hyphens removed, then
        with hyphens turned into underscores, and finally falls back to
        the name itself. Returns None for an empty or None charset.
        """
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Return `charset` if Python has a codec by that name, else None.

        A falsy charset is returned unchanged.
        """
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec


    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # Tuple values are (HTML entity name, hex code point); plain strings
    # are used verbatim.
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                # U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS. The hex
                # code must be present or XML output becomes "&#x;".
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        b'\xb4' : ("'", 'acute'),
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252.
    WINDOWS_1252_TO_UTF8 = {
        0x80 : b'\xe2\x82\xac', # €
        0x82 : b'\xe2\x80\x9a', # ‚
        0x83 : b'\xc6\x92',     # ƒ
        0x84 : b'\xe2\x80\x9e', # „
        0x85 : b'\xe2\x80\xa6', # …
        0x86 : b'\xe2\x80\xa0', # †
        0x87 : b'\xe2\x80\xa1', # ‡
        0x88 : b'\xcb\x86',     # ˆ
        0x89 : b'\xe2\x80\xb0', # ‰
        0x8a : b'\xc5\xa0',     # Š
        0x8b : b'\xe2\x80\xb9', # ‹
        0x8c : b'\xc5\x92',     # Œ
        0x8e : b'\xc5\xbd',     # Ž
        0x91 : b'\xe2\x80\x98', # ‘
        0x92 : b'\xe2\x80\x99', # ’
        0x93 : b'\xe2\x80\x9c', # “
        0x94 : b'\xe2\x80\x9d', # ”
        0x95 : b'\xe2\x80\xa2', # •
        0x96 : b'\xe2\x80\x93', # –
        0x97 : b'\xe2\x80\x94', # —
        0x98 : b'\xcb\x9c',     # ˜
        0x99 : b'\xe2\x84\xa2', # ™
        0x9a : b'\xc5\xa1',     # š
        0x9b : b'\xe2\x80\xba', # ›
        0x9c : b'\xc5\x93',     # œ
        0x9e : b'\xc5\xbe',     # ž
        0x9f : b'\xc5\xb8',     # Ÿ
        0xa0 : b'\xc2\xa0',     # (no-break space)
        0xa1 : b'\xc2\xa1',     # ¡
        0xa2 : b'\xc2\xa2',     # ¢
        0xa3 : b'\xc2\xa3',     # £
        0xa4 : b'\xc2\xa4',     # ¤
        0xa5 : b'\xc2\xa5',     # ¥
        0xa6 : b'\xc2\xa6',     # ¦
        0xa7 : b'\xc2\xa7',     # §
        0xa8 : b'\xc2\xa8',     # ¨
        0xa9 : b'\xc2\xa9',     # ©
        0xaa : b'\xc2\xaa',     # ª
        0xab : b'\xc2\xab',     # «
        0xac : b'\xc2\xac',     # ¬
        0xad : b'\xc2\xad',     # (soft hyphen)
        0xae : b'\xc2\xae',     # ®
        0xaf : b'\xc2\xaf',     # ¯
        0xb0 : b'\xc2\xb0',     # °
        0xb1 : b'\xc2\xb1',     # ±
        0xb2 : b'\xc2\xb2',     # ²
        0xb3 : b'\xc2\xb3',     # ³
        0xb4 : b'\xc2\xb4',     # ´
        0xb5 : b'\xc2\xb5',     # µ
        0xb6 : b'\xc2\xb6',     # ¶
        0xb7 : b'\xc2\xb7',     # ·
        0xb8 : b'\xc2\xb8',     # ¸
        0xb9 : b'\xc2\xb9',     # ¹
        0xba : b'\xc2\xba',     # º
        0xbb : b'\xc2\xbb',     # »
        0xbc : b'\xc2\xbc',     # ¼
        0xbd : b'\xc2\xbd',     # ½
        0xbe : b'\xc2\xbe',     # ¾
        0xbf : b'\xc2\xbf',     # ¿
        0xc0 : b'\xc3\x80',     # À
        0xc1 : b'\xc3\x81',     # Á
        0xc2 : b'\xc3\x82',     # Â
        0xc3 : b'\xc3\x83',     # Ã
        0xc4 : b'\xc3\x84',     # Ä
        0xc5 : b'\xc3\x85',     # Å
        0xc6 : b'\xc3\x86',     # Æ
        0xc7 : b'\xc3\x87',     # Ç
        0xc8 : b'\xc3\x88',     # È
        0xc9 : b'\xc3\x89',     # É
        0xca : b'\xc3\x8a',     # Ê
        0xcb : b'\xc3\x8b',     # Ë
        0xcc : b'\xc3\x8c',     # Ì
        0xcd : b'\xc3\x8d',     # Í
        0xce : b'\xc3\x8e',     # Î
        0xcf : b'\xc3\x8f',     # Ï
        0xd0 : b'\xc3\x90',     # Ð
        0xd1 : b'\xc3\x91',     # Ñ
        0xd2 : b'\xc3\x92',     # Ò
        0xd3 : b'\xc3\x93',     # Ó
        0xd4 : b'\xc3\x94',     # Ô
        0xd5 : b'\xc3\x95',     # Õ
        0xd6 : b'\xc3\x96',     # Ö
        0xd7 : b'\xc3\x97',     # ×
        0xd8 : b'\xc3\x98',     # Ø
        0xd9 : b'\xc3\x99',     # Ù
        0xda : b'\xc3\x9a',     # Ú
        0xdb : b'\xc3\x9b',     # Û
        0xdc : b'\xc3\x9c',     # Ü
        0xdd : b'\xc3\x9d',     # Ý
        0xde : b'\xc3\x9e',     # Þ
        0xdf : b'\xc3\x9f',     # ß
        0xe0 : b'\xc3\xa0',     # à
        0xe1 : b'\xc3\xa1',     # á
        0xe2 : b'\xc3\xa2',     # â
        0xe3 : b'\xc3\xa3',     # ã
        0xe4 : b'\xc3\xa4',     # ä
        0xe5 : b'\xc3\xa5',     # å
        0xe6 : b'\xc3\xa6',     # æ
        0xe7 : b'\xc3\xa7',     # ç
        0xe8 : b'\xc3\xa8',     # è
        0xe9 : b'\xc3\xa9',     # é
        0xea : b'\xc3\xaa',     # ê
        0xeb : b'\xc3\xab',     # ë
        0xec : b'\xc3\xac',     # ì
        0xed : b'\xc3\xad',     # í
        0xee : b'\xc3\xae',     # î
        0xef : b'\xc3\xaf',     # ï
        0xf0 : b'\xc3\xb0',     # ð
        0xf1 : b'\xc3\xb1',     # ñ
        0xf2 : b'\xc3\xb2',     # ò
        0xf3 : b'\xc3\xb3',     # ó
        0xf4 : b'\xc3\xb4',     # ô
        0xf5 : b'\xc3\xb5',     # õ
        0xf6 : b'\xc3\xb6',     # ö
        0xf7 : b'\xc3\xb7',     # ÷
        0xf8 : b'\xc3\xb8',     # ø
        0xf9 : b'\xc3\xb9',     # ù
        0xfa : b'\xc3\xba',     # ú
        0xfb : b'\xc3\xbb',     # û
        0xfc : b'\xc3\xbc',     # ü
        0xfd : b'\xc3\xbd',     # ý
        0xfe : b'\xc3\xbe',     # þ
        }

    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        The input must be a bytestring. If you've already converted
        the document to Unicode, you're too late.

        The output is a bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents.

        :raises NotImplementedError: for any other combination of
         encodings.
        """
        if embedded_encoding.replace('_', '-').lower() not in (
            'windows-1252', 'windows_1252'):
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Python 2.x
                byte = ord(byte)
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another, one-byte chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)