1# -*- coding: utf-8 -*- 2"""Beautiful Soup bonus library: Unicode, Dammit 3 4This library converts a bytestream to Unicode through any means 5necessary. It is heavily based on code from Mark Pilgrim's Universal 6Feed Parser. It works best on XML and HTML, but it does not rewrite the 7XML or HTML to reflect a new encoding; that's the tree builder's job. 8""" 9# Use of this source code is governed by the MIT license. 10__license__ = "MIT" 11 12from html.entities import codepoint2name 13from collections import defaultdict 14import codecs 15import re 16import logging 17import string 18 19# Import a library to autodetect character encodings. We'll support 20# any of a number of libraries that all support the same API: 21# 22# * cchardet 23# * chardet 24# * charset-normalizer 25chardet_module = None 26try: 27 # PyPI package: cchardet 28 import cchardet as chardet_module 29except ImportError: 30 try: 31 # Debian package: python-chardet 32 # PyPI package: chardet 33 import chardet as chardet_module 34 except ImportError: 35 try: 36 # PyPI package: charset-normalizer 37 import charset_normalizer as chardet_module 38 except ImportError: 39 # No chardet available. 40 chardet_module = None 41 42if chardet_module: 43 def chardet_dammit(s): 44 if isinstance(s, str): 45 return None 46 return chardet_module.detect(s)['encoding'] 47else: 48 def chardet_dammit(s): 49 return None 50 51# Build bytestring and Unicode versions of regular expressions for finding 52# a declared encoding inside an XML or HTML document. 
xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
# encoding_res is keyed by the *type* of the markup (bytes or str) so
# the same lookup works whether or not the document has been decoded.
encoding_res = dict()
encoding_res[bytes] = {
    'html' : re.compile(html_meta.encode("ascii"), re.I),
    'xml' : re.compile(xml_encoding.encode("ascii"), re.I),
}
encoding_res[str] = {
    'html' : re.compile(html_meta, re.I),
    'xml' : re.compile(xml_encoding, re.I)
}

from html.entities import html5

class EntitySubstitution(object):
    """The ability to substitute XML or HTML entities for certain characters."""

    def _populate_class_variables():
        """Initialize variables used by this class to manage the plethora of
        HTML5 named entities.

        This function returns a 3-tuple containing two dictionaries
        and a regular expression:

        unicode_to_name - A mapping of Unicode strings like "⦨" to
        entity names like "angmsdaa". When a single Unicode string has
        multiple entity names, we try to choose the most commonly-used
        name.

        name_to_unicode: A mapping of entity names like "angmsdaa" to
        Unicode strings like "⦨".

        named_entity_re: A regular expression matching (almost) any
        Unicode string that corresponds to an HTML5 named entity.
        """
        unicode_to_name = {}
        name_to_unicode = {}

        short_entities = set()
        long_entities_by_first_character = defaultdict(set)

        for name_with_semicolon, character in sorted(html5.items()):
            # "It is intentional, for legacy compatibility, that many
            # code points have multiple character reference names. For
            # example, some appear both with and without the trailing
            # semicolon, or with different capitalizations."
            # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
            #
            # The parsers are in charge of handling (or not) character
            # references with no trailing semicolon, so we remove the
            # semicolon whenever it appears.
            if name_with_semicolon.endswith(';'):
                name = name_with_semicolon[:-1]
            else:
                name = name_with_semicolon

            # When parsing HTML, we want to recognize any known named
            # entity and convert it to a sequence of Unicode
            # characters.
            if name not in name_to_unicode:
                name_to_unicode[name] = character

            # When _generating_ HTML, we want to recognize special
            # character sequences that _could_ be converted to named
            # entities.
            unicode_to_name[character] = name

            # We also need to build a regular expression that lets us
            # _find_ those characters in output strings so we can
            # replace them.
            #
            # This is tricky, for two reasons.

            if (len(character) == 1 and ord(character) < 128
                and character not in '<>&'):
                # First, it would be annoying to turn single ASCII
                # characters like | into named entities like
                # &verbar;. The exceptions are <>&, which we _must_
                # turn into named entities to produce valid HTML.
                continue

            if len(character) > 1 and all(ord(x) < 128 for x in character):
                # We also do not want to turn _combinations_ of ASCII
                # characters like 'fj' into named entities like '&fjlig;',
                # though that's more debateable.
                continue

            # Second, some named entities have a Unicode value that's
            # a subset of the Unicode value for some _other_ named
            # entity.  As an example, '\u2267' is &GreaterFullEqual;,
            # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
            # expression needs to match the first two characters of
            # "\u2267\u0338foo", but only the first character of
            # "\u2267foo".
            #
            # In this step, we build two sets of characters that
            # _eventually_ need to go into the regular expression. But
            # we won't know exactly what the regular expression needs
            # to look like until we've gone through the entire list of
            # named entities.
            if len(character) == 1:
                short_entities.add(character)
            else:
                long_entities_by_first_character[character[0]].add(character)

        # Now that we've been through the entire list of entities, we
        # can create a regular expression that matches any of them.
        particles = set()
        for short in short_entities:
            long_versions = long_entities_by_first_character[short]
            if not long_versions:
                particles.add(short)
            else:
                ignore = "".join([x[1] for x in long_versions])
                # This finds, e.g. \u2267 but only if it is _not_
                # followed by \u0338.
                particles.add("%s(?![%s])" % (short, ignore))

        for long_entities in long_entities_by_first_character.values():
            for long_entity in long_entities:
                particles.add(long_entity)

        re_definition = "(%s)" % "|".join(particles)

        # If an entity shows up in both html5 and codepoint2name, it's
        # likely that HTML5 gives it several different names, such as
        # 'rsquo' and 'rsquor'. When converting Unicode characters to
        # named entities, the codepoint2name name should take
        # precedence where possible, since that's the more easily
        # recognizable one.
        for codepoint, name in codepoint2name.items():
            character = chr(codepoint)
            unicode_to_name[character] = name

        return unicode_to_name, name_to_unicode, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Angle brackets, plus any ampersand that does not begin something
    # shaped like a numeric or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
                                           ")")

    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate HTML entity for a special character string.

        :param matchobj: A match produced by CHARACTER_TO_HTML_ENTITY_RE.
        :return: The named entity reference for the matched text.
        """
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for a special character string.

        :param matchobj: A match whose text is a key of
            CHARACTER_TO_XML_ENTITY.
        :return: The named entity reference for the matched text.
        """
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"

        :param value: The attribute value, as a string.
        :return: The value wrapped in quote characters.
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "quot" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between "apos" and "squot".
                #
                # The reference is assembled from the entity table
                # rather than spelled out as a literal.
                replace_with = "&%s;" % cls.CHARACTER_TO_XML_ENTITY['"']
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: A Unicode string.
        :return: The string with matching characters replaced by named
            entity references.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)


class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the known_definite_encodings argument to the constructor).

    2. An encoding determined by sniffing the document's byte-order mark.

    3. Encodings you specifically tell EncodingDetector to try if
    byte-order mark sniffing fails (the user_encodings argument to the
    constructor).

    4. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    5. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    6. UTF-8.

    7. Windows-1252.

    """
    def __init__(self, markup, known_definite_encodings=None,
                 is_html=False, exclude_encodings=None,
                 user_encodings=None, override_encodings=None):
        """Constructor.

        :param markup: Some markup in an unknown encoding.

        :param known_definite_encodings: When determining the encoding
            of `markup`, these encodings will be tried first, in
            order. In HTML terms, this corresponds to the "known
            definite encoding" step defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding

        :param user_encodings: These encodings will be tried after the
            `known_definite_encodings` have been tried and failed, and
            after an attempt to sniff the encoding by looking at a
            byte order mark has failed. In HTML terms, this
            corresponds to the step "user has explicitly instructed
            the user agent to override the document's character
            encoding", defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding

        :param override_encodings: A deprecated alias for
            known_definite_encodings. Any encodings here will be tried
            immediately after the encodings in
            known_definite_encodings.

        :param is_html: If True, this markup is considered to be
            HTML. Otherwise it's assumed to be XML.

        :param exclude_encodings: These encodings will not be tried,
            even if they otherwise would be.

        """
        # Copy the list so that appending the override encodings does
        # not modify the caller's object.
        self.known_definite_encodings = list(known_definite_encodings or [])
        if override_encodings:
            self.known_definite_encodings += override_encodings
        self.user_encodings = user_encodings or []
        exclude_encodings = exclude_encodings or []
        self.exclude_encodings = set([x.lower() for x in exclude_encodings])
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Should we even bother to try this encoding?

        :param encoding: Name of an encoding.
        :param tried: Encodings that have already been tried. This will be modified
            as a side effect.
        :return: True if the encoding is not None, not excluded and not
            previously tried; False otherwise.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding in self.exclude_encodings:
                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup.

        :yield: A sequence of strings.
        """
        tried = set()

        # First, try the known definite encodings
        for e in self.known_definite_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Sniffing the byte-order mark did nothing; try the user
        # encodings.
        for e in self.user_encodings:
            if self._usable(e, tried):
                yield e

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: Some markup.
        :return: A 2-tuple (modified data, implied encoding)
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        # The UTF-16 checks come first, so they must reject data whose
        # next two bytes are NUL: FF FE 00 00 is the UTF-32LE mark, not
        # UTF-16LE. The comparison has to be against a bytes literal; a
        # str literal never compares equal to a bytes slice.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: Some markup.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param search_entire_document: Since an encoding is supposed to be declared near the beginning
            of the document, most of the time it's only necessary to search a few kilobytes of data.
            Set this to True to force this method to search the entire document.
        :return: The declared encoding, lowercased, or None if no
            declaration was found.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        if isinstance(markup, bytes):
            res = encoding_res[bytes]
        else:
            res = encoding_res[str]

        xml_re = res['xml']
        html_re = res['html']
        declared_encoding = None
        # An XML declaration is looked for even in HTML; the <meta> tag
        # is only consulted when that fails and the markup is HTML.
        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0]
        if declared_encoding:
            if isinstance(declared_encoding, bytes):
                declared_encoding = declared_encoding.decode('ascii', 'replace')
            return declared_encoding.lower()
        return None

class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
    ]

    def __init__(self, markup, known_definite_encodings=None,
                 smart_quotes_to=None, is_html=False, exclude_encodings=None,
                 user_encodings=None, override_encodings=None
    ):
        """Constructor.

        :param markup: A bytestring representing markup in an unknown encoding.

        :param known_definite_encodings: When determining the encoding
            of `markup`, these encodings will be tried first, in
            order. In HTML terms, this corresponds to the "known
            definite encoding" step defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding

        :param user_encodings: These encodings will be tried after the
            `known_definite_encodings` have been tried and failed, and
            after an attempt to sniff the encoding by looking at a
            byte order mark has failed. In HTML terms, this
            corresponds to the step "user has explicitly instructed
            the user agent to override the document's character
            encoding", defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding

        :param override_encodings: A deprecated alias for
            known_definite_encodings. Any encodings here will be tried
            immediately after the encodings in
            known_definite_encodings.

        :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted
           to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead.
           Setting it to 'xml' will convert them to XML entity references, and setting it to 'html'
           will convert them to HTML entity references.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param exclude_encodings: These encodings will not be considered, even
            if the sniffing code thinks they might make sense.

        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html
        self.log = logging.getLogger(__name__)
        # EncodingDetector treats None the same as an empty list for
        # every one of the encoding-list arguments.
        self.detector = EncodingDetector(
            markup, known_definite_encodings, is_html, exclude_encodings,
            user_encodings, override_encodings
        )

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, str) or markup == '':
            self.markup = markup
            self.unicode_markup = str(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        if not u:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.

            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    self.log.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER."
                    )
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if not u:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        :param match: A bytes-pattern match whose group 1 is the single
            byte to replace.
        :return: The replacement, as a bytestring.
        """
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
        else:
            sub = self.MS_CHARS.get(orig)
            if isinstance(sub, tuple):
                # (entity name, hexadecimal code point)
                if self.smart_quotes_to == 'xml':
                    sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
                else:
                    sub = '&'.encode() + sub[0].encode() + ';'.encode()
            else:
                sub = sub.encode()
        return sub

    def _convert_from(self, proposed, errors="strict"):
        """Attempt to convert the markup to the proposed encoding.

        :param proposed: The name of a character encoding.
        :param errors: The error-handling scheme passed to the decoder.
        :return: The decoded markup, or None if the codec is unknown,
            this (codec, errors) pair was already tried, or decoding
            raised an exception.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            smart_quotes_re = b"([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception:
            # Deliberately best-effort: a failure here just means this
            # candidate is wrong, and the caller moves on to the next.
            return None
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        """Given a string and its encoding, decodes the string into Unicode.

        :param data: The data to decode.
        :param encoding: The name of an encoding.
        :param errors: The error-handling scheme passed to the decoder.
        :return: The decoded string.
        """
        return str(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """If the markup is an HTML document, returns the encoding declared _within_
        the document.
        """
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Convert the name of a character set to a codec name.

        :param charset: The name of a character set.
        :return: The name of a codec.
        """
        # Try, in order: the alias table, the name with hyphens removed,
        # the name with hyphens turned into underscores, and finally the
        # name itself even though no codec was found for it.
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Check whether Python has a codec registered under this name.

        :param charset: A candidate codec name.
        :return: `charset` unchanged if it is falsy or if
            codecs.lookup() accepts it; None otherwise.
        """
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec


    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # Tuple values are (entity name, hexadecimal code point); plain
    # strings are used verbatim.
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8a': ('Scaron', '160'),
                b'\x8b': ('lsaquo', '2039'),
                b'\x8c': ('OElig', '152'),
                b'\x8d': '?',
                b'\x8e': ('#x17D', '17D'),
                b'\x8f': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        # NOTE(review): a tuple, unlike its neighbours. _sub_ms_char is
        # only ever handed bytes in \x80-\x9f, so it never reads this.
        b'\xb4' : ("'", 'acute'),
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252.
    WINDOWS_1252_TO_UTF8 = {
        0x80 : b'\xe2\x82\xac', # €
        0x82 : b'\xe2\x80\x9a', # ‚
        0x83 : b'\xc6\x92',     # ƒ
        0x84 : b'\xe2\x80\x9e', # „
        0x85 : b'\xe2\x80\xa6', # …
        0x86 : b'\xe2\x80\xa0', # †
        0x87 : b'\xe2\x80\xa1', # ‡
        0x88 : b'\xcb\x86',     # ˆ
        0x89 : b'\xe2\x80\xb0', # ‰
        0x8a : b'\xc5\xa0',     # Š
        0x8b : b'\xe2\x80\xb9', # ‹
        0x8c : b'\xc5\x92',     # Œ
        0x8e : b'\xc5\xbd',     # Ž
        0x91 : b'\xe2\x80\x98', # ‘
        0x92 : b'\xe2\x80\x99', # ’
        0x93 : b'\xe2\x80\x9c', # “
        0x94 : b'\xe2\x80\x9d', # ”
        0x95 : b'\xe2\x80\xa2', # •
        0x96 : b'\xe2\x80\x93', # –
        0x97 : b'\xe2\x80\x94', # —
        0x98 : b'\xcb\x9c',     # ˜
        0x99 : b'\xe2\x84\xa2', # ™
        0x9a : b'\xc5\xa1',     # š
        0x9b : b'\xe2\x80\xba', # ›
        0x9c : b'\xc5\x93',     # œ
        0x9e : b'\xc5\xbe',     # ž
        0x9f : b'\xc5\xb8',     # Ÿ
        0xa0 : b'\xc2\xa0',     # (non-breaking space)
        0xa1 : b'\xc2\xa1',     # ¡
        0xa2 : b'\xc2\xa2',     # ¢
        0xa3 : b'\xc2\xa3',     # £
        0xa4 : b'\xc2\xa4',     # ¤
        0xa5 : b'\xc2\xa5',     # ¥
        0xa6 : b'\xc2\xa6',     # ¦
        0xa7 : b'\xc2\xa7',     # §
        0xa8 : b'\xc2\xa8',     # ¨
        0xa9 : b'\xc2\xa9',     # ©
        0xaa : b'\xc2\xaa',     # ª
        0xab : b'\xc2\xab',     # «
        0xac : b'\xc2\xac',     # ¬
        0xad : b'\xc2\xad',     # (soft hyphen)
        0xae : b'\xc2\xae',     # ®
        0xaf : b'\xc2\xaf',     # ¯
        0xb0 : b'\xc2\xb0',     # °
        0xb1 : b'\xc2\xb1',     # ±
        0xb2 : b'\xc2\xb2',     # ²
        0xb3 : b'\xc2\xb3',     # ³
        0xb4 : b'\xc2\xb4',     # ´
        0xb5 : b'\xc2\xb5',     # µ
        0xb6 : b'\xc2\xb6',     # ¶
        0xb7 : b'\xc2\xb7',     # ·
        0xb8 : b'\xc2\xb8',     # ¸
        0xb9 : b'\xc2\xb9',     # ¹
        0xba : b'\xc2\xba',     # º
        0xbb : b'\xc2\xbb',     # »
        0xbc : b'\xc2\xbc',     # ¼
        0xbd : b'\xc2\xbd',     # ½
        0xbe : b'\xc2\xbe',     # ¾
        0xbf : b'\xc2\xbf',     # ¿
        0xc0 : b'\xc3\x80',     # À
        0xc1 : b'\xc3\x81',     # Á
        0xc2 : b'\xc3\x82',     # Â
        0xc3 : b'\xc3\x83',     # Ã
        0xc4 : b'\xc3\x84',     # Ä
        0xc5 : b'\xc3\x85',     # Å
        0xc6 : b'\xc3\x86',     # Æ
        0xc7 : b'\xc3\x87',     # Ç
        0xc8 : b'\xc3\x88',     # È
        0xc9 : b'\xc3\x89',     # É
        0xca : b'\xc3\x8a',     # Ê
        0xcb : b'\xc3\x8b',     # Ë
        0xcc : b'\xc3\x8c',     # Ì
        0xcd : b'\xc3\x8d',     # Í
        0xce : b'\xc3\x8e',     # Î
        0xcf : b'\xc3\x8f',     # Ï
        0xd0 : b'\xc3\x90',     # Ð
        0xd1 : b'\xc3\x91',     # Ñ
        0xd2 : b'\xc3\x92',     # Ò
        0xd3 : b'\xc3\x93',     # Ó
        0xd4 : b'\xc3\x94',     # Ô
        0xd5 : b'\xc3\x95',     # Õ
        0xd6 : b'\xc3\x96',     # Ö
        0xd7 : b'\xc3\x97',     # ×
        0xd8 : b'\xc3\x98',     # Ø
        0xd9 : b'\xc3\x99',     # Ù
        0xda : b'\xc3\x9a',     # Ú
        0xdb : b'\xc3\x9b',     # Û
        0xdc : b'\xc3\x9c',     # Ü
        0xdd : b'\xc3\x9d',     # Ý
        0xde : b'\xc3\x9e',     # Þ
        0xdf : b'\xc3\x9f',     # ß
        0xe0 : b'\xc3\xa0',     # à
        0xe1 : b'\xc3\xa1',     # á
        0xe2 : b'\xc3\xa2',     # â
        0xe3 : b'\xc3\xa3',     # ã
        0xe4 : b'\xc3\xa4',     # ä
        0xe5 : b'\xc3\xa5',     # å
        0xe6 : b'\xc3\xa6',     # æ
        0xe7 : b'\xc3\xa7',     # ç
        0xe8 : b'\xc3\xa8',     # è
        0xe9 : b'\xc3\xa9',     # é
        0xea : b'\xc3\xaa',     # ê
        0xeb : b'\xc3\xab',     # ë
        0xec : b'\xc3\xac',     # ì
        0xed : b'\xc3\xad',     # í
        0xee : b'\xc3\xae',     # î
        0xef : b'\xc3\xaf',     # ï
        0xf0 : b'\xc3\xb0',     # ð
        0xf1 : b'\xc3\xb1',     # ñ
        0xf2 : b'\xc3\xb2',     # ò
        0xf3 : b'\xc3\xb3',     # ó
        0xf4 : b'\xc3\xb4',     # ô
        0xf5 : b'\xc3\xb5',     # õ
        0xf6 : b'\xc3\xb6',     # ö
        0xf7 : b'\xc3\xb7',     # ÷
        0xf8 : b'\xc3\xb8',     # ø
        0xf9 : b'\xc3\xb9',     # ù
        0xfa : b'\xc3\xba',     # ú
        0xfb : b'\xc3\xbb',     # û
        0xfc : b'\xc3\xbc',     # ü
        0xfd : b'\xc3\xbd',     # ý
        0xfe : b'\xc3\xbe',     # þ
        }

    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        :param in_bytes: A bytestring that you suspect contains
            characters from multiple encodings. Note that this _must_
            be a bytestring. If you've already converted the document
            to Unicode, you're too late.
        :param main_encoding: The primary encoding of `in_bytes`.
        :param embedded_encoding: The encoding that was used to embed characters
            in the main document.
        :return: A bytestring in which `embedded_encoding`
            characters have been converted to their `main_encoding`
            equivalents.
        :raise NotImplementedError: If either encoding name is not one
            of the accepted spellings.
        """
        if embedded_encoding.replace('_', '-').lower() not in (
            'windows-1252', 'windows_1252'):
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Python 2.x
                byte = ord(byte)
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another, one-byte chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)