# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit

This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
__license__ = "MIT"

import codecs
from html.entities import codepoint2name
import re
import logging

# Import a library to autodetect character encodings.
chardet_type = None
try:
    # First try the fast C implementation.
    #  PyPI package: cchardet
    import cchardet

    def chardet_dammit(s):
        """Return the encoding cchardet guesses for the bytestring `s`."""
        return cchardet.detect(s)['encoding']
except ImportError:
    try:
        # Fall back to the pure Python implementation
        #  Debian package: python-chardet
        #  PyPI package: chardet
        import chardet

        def chardet_dammit(s):
            """Return the encoding chardet guesses for the bytestring `s`."""
            return chardet.detect(s)['encoding']
    except ImportError:
        # No chardet available.
        def chardet_dammit(s):
            """Return None: no encoding detection library is installed."""
            return None

# Both patterns are compiled from bytes because they are run against
# markup that has not been decoded yet.
xml_encoding_re = re.compile(
    r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
html_meta_re = re.compile(
    r'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)


class EntitySubstitution(object):
    """Substitute XML or HTML entities for the corresponding characters."""

    def _populate_class_variables():
        """Build the character/entity lookup tables and the matching regex.

        Runs once, while the class body is being executed. Returns a
        3-tuple: character -> entity name, entity name -> character,
        and a compiled character class matching every character that
        has a named HTML entity (except the quotation mark).
        """
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []
        for codepoint, name in codepoint2name.items():
            character = chr(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
                # is handled elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to turn &quot; into the quotation mark.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Angle brackets, plus ampersands that do not start something that
    # looks like a numeric or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|"
                                           r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           r")")

    AMPERSAND_OR_BRACKET = re.compile(r"([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate HTML entity for a matched character."""
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;"

        :param value: The string to quote.
        :return: The string, wrapped in the chosen quote character.
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
            cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)


class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding implied by a byte-order mark at the start of the
    bytestring.

    3. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    4. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    5. UTF-8.

    6. Windows-1252.
    """
    def __init__(self, markup, override_encodings=None, is_html=False,
                 exclude_encodings=None):
        self.override_encodings = override_encodings or []
        exclude_encodings = exclude_encodings or []
        self.exclude_encodings = {x.lower() for x in exclude_encodings}
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Decide whether `encoding` should be yielded.

        An encoding is usable if it is not None, is not excluded, and
        has not been seen before. A usable encoding is recorded in
        `tried` (lowercased) as a side effect.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding in self.exclude_encodings:
                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup."""
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the
        encoding it implies.

        :param data: The markup, as bytes or str.
        :return: A 2-tuple (data without the byte-order mark, encoding
         name or None).
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        # The UTF-32LE mark starts with the same two bytes as the
        # UTF-16LE mark, so a UTF-16 mark only counts when it is not
        # followed by two null bytes. The comparison has to be against
        # bytes: comparing bytes to str is always unequal on Python 3.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False,
                               search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: The document, as a bytestring.
        :param is_html: If True, also look for a <meta> tag when there
         is no XML declaration.
        :param search_entire_document: If True, search all of `markup`
         instead of only its beginning.
        :return: The declared encoding, lowercased, or None.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(
            markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_meta_re.search(
                markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.group(1).decode(
                'ascii', 'replace')
        if declared_encoding:
            return declared_encoding.lower()
        return None


class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
    ]

    # Matches each byte that _sub_ms_char knows how to replace.
    _SMART_QUOTES_RE = re.compile(b"([\x80-\x9f])")

    def __init__(self, markup, override_encodings=None,
                 smart_quotes_to=None, is_html=False, exclude_encodings=None):
        """Detect the encoding of `markup` and convert it to Unicode.

        :param markup: The document, as bytes or str.
        :param override_encodings: Encodings to try before any others.
        :param smart_quotes_to: If 'ascii', 'xml' or anything else that
         is not None (treated as HTML), Microsoft smart quotes are
         replaced before decoding. If None they are left alone.
        :param is_html: If True, a <meta> tag may declare the encoding.
        :param exclude_encodings: Encodings that must never be tried.

        After construction, `unicode_markup` holds the converted
        document (None if every encoding failed) and
        `original_encoding` holds the encoding that worked (None for
        str input or on failure).
        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html

        self.detector = EncodingDetector(
            markup, override_encodings, is_html, exclude_encodings)

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, str):
            self.markup = markup
            self.unicode_markup = str(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        # Failure is signalled by None, not by falsiness: an empty
        # document decodes successfully to the empty string.
        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        if u is None:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.

            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    logging.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER.")
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if u is None:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        :param match: A match of _SMART_QUOTES_RE against a bytestring.
        :return: The replacement, as bytes.
        """
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            return self.MS_CHARS_TO_ASCII[orig].encode()

        sub = self.MS_CHARS[orig]
        if isinstance(sub, tuple):
            # (HTML entity name, hexadecimal code point)
            if self.smart_quotes_to == 'xml':
                return '&#x'.encode() + sub[1].encode() + ';'.encode()
            return '&'.encode() + sub[0].encode() + ';'.encode()
        return sub.encode()

    def _convert_from(self, proposed, errors="strict"):
        """Try to decode self.markup using the proposed encoding.

        On success, self.markup is replaced by the decoded string and
        self.original_encoding is set to the codec name.

        :param proposed: The name of an encoding.
        :param errors: The error handling scheme passed to the decoder.
        :return: The decoded string, or None if the encoding is
         unknown, was already tried with this `errors` value, or
         could not decode the markup.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            markup = self._SMART_QUOTES_RE.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
        except Exception:
            # Deliberately broad: any failure to decode just means
            # this encoding is the wrong guess.
            return None
        self.markup = u
        self.original_encoding = proposed
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''
        return str(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """The encoding declared in the document, if it was parsed as
        HTML; otherwise None."""
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Turn a charset name into a lowercased codec name.

        Tries, in order: the CHARSET_ALIASES entry (or the name
        itself), the name with hyphens removed, and the name with
        hyphens turned into underscores. If Python recognizes none of
        them, the lowercased name is returned anyway.

        :return: A lowercased name, or None if `charset` is empty.
        """
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Return `charset` if Python has a codec by that name, else None.

        A falsy `charset` is returned unchanged.
        """
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # Tuples are (HTML entity name, hexadecimal code point); plain
    # strings are used as-is.
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '(c)',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        b'\xb4' : "'",
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    def _build_windows_1252_to_utf8():
        """Map each defined Windows-1252 byte >= 0x80 to its UTF-8 bytes.

        The table is derived from the standard library codec rather
        than typed out, so an individual entry cannot be mistyped.
        """
        table = {}
        for byte in range(0x80, 0x100):
            try:
                character = bytes([byte]).decode('windows-1252')
            except UnicodeDecodeError:
                # This byte is undefined in Windows-1252.
                continue
            table[byte] = character.encode('utf-8')
        return table

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents. Keys are integer byte
    # values, values are UTF-8 bytestrings.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252.
    WINDOWS_1252_TO_UTF8 = _build_windows_1252_to_utf8()
    del _build_windows_1252_to_utf8

    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        The input must be a bytestring. If you've already converted
        the document to Unicode, you're too late.

        The output is a bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents.

        :raises NotImplementedError: If either encoding is anything
         other than the supported pair.
        """
        if embedded_encoding.replace('_', '-').lower() != 'windows-1252':
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Indexing gave a one-character string, not an integer.
                byte = ord(byte)
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)