1# -*- coding: utf-8 -*-
2"""Beautiful Soup bonus library: Unicode, Dammit

This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""

9# Use of this source code is governed by the MIT license.
10__license__ = "MIT"
12from html.entities import codepoint2name
13from collections import defaultdict
14import codecs
15import re
16import logging
17import string
19# Import a library to autodetect character encodings. We'll support
20# any of a number of libraries that all support the same API:
22# * cchardet
23# * chardet
24# * charset-normalizer
chardet_module = None
try:
    #  PyPI package: cchardet
    import cchardet as chardet_module
except ImportError:
    try:
        #  Debian package: python-chardet
        #  PyPI package: chardet
        import chardet as chardet_module
    except ImportError:
        try:
            # PyPI package: charset-normalizer
            import charset_normalizer as chardet_module
        except ImportError:
            # No chardet available.
            chardet_module = None

if chardet_module:
    def chardet_dammit(s):
        """Guess the encoding of a bytestring with the detection library.

        :param s: The data to inspect.
        :return: None when `s` is already a str; otherwise the
            'encoding' entry of the result of `chardet_module.detect(s)`.
        """
        if isinstance(s, str):
            # Unicode text has no byte encoding to detect.
            return None
        return chardet_module.detect(s)['encoding']
else:
    def chardet_dammit(s):
        """Stand-in used when no detection library could be imported.

        :param s: Ignored.
        :return: Always None.
        """
        return None

51# Build bytestring and Unicode versions of regular expressions for finding
52# a declared encoding inside an XML or HTML document.
xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'

# encoding_res[bytes] holds patterns for searching bytestrings and
# encoding_res[str] holds the same patterns for searching Unicode
# strings. Each inner dictionary has an 'html' and an 'xml' entry, and
# group 1 of a match is the declared encoding.
encoding_res = dict()
encoding_res[bytes] = {
    'html' : re.compile(html_meta.encode("ascii"), re.I),
    'xml' : re.compile(xml_encoding.encode("ascii"), re.I),
}
encoding_res[str] = {
    'html' : re.compile(html_meta, re.I),
    'xml' : re.compile(xml_encoding, re.I)
}

65from html.entities import html5
67class EntitySubstitution(object):
68    """The ability to substitute XML or HTML entities for certain characters."""
70    def _populate_class_variables():
71        """Initialize variables used by this class to manage the plethora of
72        HTML5 named entities.
74        This function returns a 3-tuple containing two dictionaries
75        and a regular expression:
77        unicode_to_name - A mapping of Unicode strings like "⦨" to
78        entity names like "angmsdaa". When a single Unicode string has
79        multiple entity names, we try to choose the most commonly-used
80        name.
82        name_to_unicode: A mapping of entity names like "angmsdaa" to
83        Unicode strings like "⦨".
85        named_entity_re: A regular expression matching (almost) any
86        Unicode string that corresponds to an HTML5 named entity.
87        """
88        unicode_to_name = {}
89        name_to_unicode = {}
91        short_entities = set()
92        long_entities_by_first_character = defaultdict(set)
94        for name_with_semicolon, character in sorted(html5.items()):
95            # "It is intentional, for legacy compatibility, that many
96            # code points have multiple character reference names. For
97            # example, some appear both with and without the trailing
98            # semicolon, or with different capitalizations."
99            # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
100            #
101            # The parsers are in charge of handling (or not) character
102            # references with no trailing semicolon, so we remove the
103            # semicolon whenever it appears.
104            if name_with_semicolon.endswith(';'):
105                name = name_with_semicolon[:-1]
106            else:
107                name = name_with_semicolon
109            # When parsing HTML, we want to recognize any known named
110            # entity and convert it to a sequence of Unicode
111            # characters.
112            if name not in name_to_unicode:
113                name_to_unicode[name] = character
115            # When _generating_ HTML, we want to recognize special
116            # character sequences that _could_ be converted to named
117            # entities.
118            unicode_to_name[character] = name
120            # We also need to build a regular expression that lets us
121            # _find_ those characters in output strings so we can
122            # replace them.
123            #
124            # This is tricky, for two reasons.
126            if (len(character) == 1 and ord(character) < 128
127                and character not in '<>&'):
128                # First, it would be annoying to turn single ASCII
129                # characters like | into named entities like
130                # &verbar;. The exceptions are <>&, which we _must_
131                # turn into named entities to produce valid HTML.
132                continue
134            if len(character) > 1 and all(ord(x) < 128 for x in character):
135                # We also do not want to turn _combinations_ of ASCII
136                # characters like 'fj' into named entities like '&fjlig;',
137                # though that's more debateable.
138                continue
140            # Second, some named entities have a Unicode value that's
141            # a subset of the Unicode value for some _other_ named
142            # entity.  As an example, \u2267' is &GreaterFullEqual;,
143            # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
144            # expression needs to match the first two characters of
145            # "\u2267\u0338foo", but only the first character of
146            # "\u2267foo".
147            #
148            # In this step, we build two sets of characters that
149            # _eventually_ need to go into the regular expression. But
150            # we won't know exactly what the regular expression needs
151            # to look like until we've gone through the entire list of
152            # named entities.
153            if len(character) == 1:
154                short_entities.add(character)
155            else:
156                long_entities_by_first_character[character[0]].add(character)
158        # Now that we've been through the entire list of entities, we
159        # can create a regular expression that matches any of them.
160        particles = set()
161        for short in short_entities:
162            long_versions = long_entities_by_first_character[short]
163            if not long_versions:
164                particles.add(short)
165            else:
166                ignore = "".join([x[1] for x in long_versions])
167                # This finds, e.g. \u2267 but only if it is _not_
168                # followed by \u0338.
169                particles.add("%s(?![%s])" % (short, ignore))
171        for long_entities in list(long_entities_by_first_character.values()):
172            for long_entity in long_entities:
173                particles.add(long_entity)
175        re_definition = "(%s)" % "|".join(particles)
177        # If an entity shows up in both html5 and codepoint2name, it's
178        # likely that HTML5 gives it several different names, such as
179        # 'rsquo' and 'rsquor'. When converting Unicode characters to
180        # named entities, the codepoint2name name should take
181        # precedence where possible, since that's the more easily
182        # recognizable one.
183        for codepoint, name in list(codepoint2name.items()):
184            character = chr(codepoint)
185            unicode_to_name[character] = name
187        return unicode_to_name, name_to_unicode, re.compile(re_definition)
189     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
192        "'": "apos",
193        '"': "quot",
194        "&": "amp",
195        "<": "lt",
196        ">": "gt",
197        }
199    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
200                                           "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
201                                           ")")
203    AMPERSAND_OR_BRACKET = re.compile("([<>&])")
205    @classmethod
206    def _substitute_html_entity(cls, matchobj):
207        """Used with a regular expression to substitute the
208        appropriate HTML entity for a special character string."""
209        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
210        return "&%s;" % entity
212    @classmethod
213    def _substitute_xml_entity(cls, matchobj):
214        """Used with a regular expression to substitute the
215        appropriate XML entity for a special character string."""
216        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
217        return "&%s;" % entity
219    @classmethod
220    def quoted_attribute_value(self, value):
221        """Make a value into a quoted XML attribute, possibly escaping it.
223         Most strings will be quoted using double quotes.
225          Bob's Bar -> "Bob's Bar"
227         If a string contains double quotes, it will be quoted using
228         single quotes.
230          Welcome to "my bar" -> 'Welcome to "my bar"'
232         If a string contains both single and double quotes, the
233         double quotes will be escaped, and the string will be quoted
234         using double quotes.
236          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;
237        """
238        quote_with = '"'
239        if '"' in value:
240            if "'" in value:
241                # The string contains both single and double
242                # quotes.  Turn the double quotes into
243                # entities. We quote the double quotes rather than
244                # the single quotes because the entity name is
245                # "&quot;" whether this is HTML or XML.  If we
246                # quoted the single quotes, we'd have to decide
247                # between &apos; and &squot;.
248                replace_with = "&quot;"
249                value = value.replace('"', replace_with)
250            else:
251                # There are double quotes but no single quotes.
252                # We can use single quotes to quote the attribute.
253                quote_with = "'"
254        return quote_with + value + quote_with
256    @classmethod
257    def substitute_xml(cls, value, make_quoted_attribute=False):
258        """Substitute XML entities for special XML characters.
260        :param value: A string to be substituted. The less-than sign
261          will become &lt;, the greater-than sign will become &gt;,
262          and any ampersands will become &amp;. If you want ampersands
263          that appear to be part of an entity definition to be left
264          alone, use substitute_xml_containing_entities() instead.
266        :param make_quoted_attribute: If True, then the string will be
267         quoted, as befits an attribute value.
268        """
269        # Escape angle brackets and ampersands.
270        value = cls.AMPERSAND_OR_BRACKET.sub(
271            cls._substitute_xml_entity, value)
273        if make_quoted_attribute:
274            value = cls.quoted_attribute_value(value)
275        return value
277    @classmethod
278    def substitute_xml_containing_entities(
279        cls, value, make_quoted_attribute=False):
280        """Substitute XML entities for special XML characters.
282        :param value: A string to be substituted. The less-than sign will
283          become &lt;, the greater-than sign will become &gt;, and any
284          ampersands that are not part of an entity defition will
285          become &amp;.
287        :param make_quoted_attribute: If True, then the string will be
288         quoted, as befits an attribute value.
289        """
290        # Escape angle brackets, and ampersands that aren't part of
291        # entities.
292        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
293            cls._substitute_xml_entity, value)
295        if make_quoted_attribute:
296            value = cls.quoted_attribute_value(value)
297        return value
299    @classmethod
300    def substitute_html(cls, s):
301        """Replace certain Unicode characters with named HTML entities.
303        This differs from data.encode(encoding, 'xmlcharrefreplace')
304        in that the goal is to make the result more readable (to those
305        with ASCII displays) rather than to recover from
306        errors. There's absolutely nothing wrong with a UTF-8 string
307        containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that
308        character with "&eacute;" will make it more readable to some
309        people.
311        :param s: A Unicode string.
312        """
313        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
314            cls._substitute_html_entity, s)
317class EncodingDetector:
318    """Suggests a number of possible encodings for a bytestring.
320    Order of precedence:
322    1. Encodings you specifically tell EncodingDetector to try first
323    (the known_definite_encodings argument to the constructor).
325    2. An encoding determined by sniffing the document's byte-order mark.
327    3. Encodings you specifically tell EncodingDetector to try if
328    byte-order mark sniffing fails (the user_encodings argument to the
329    constructor).
331    4. An encoding declared within the bytestring itself, either in an
332    XML declaration (if the bytestring is to be interpreted as an XML
333    document), or in a <meta> tag (if the bytestring is to be
334    interpreted as an HTML document.)
336    5. An encoding detected through textual analysis by chardet,
337    cchardet, or a similar external library.
    6. UTF-8.

    7. Windows-1252.

343    """
344    def __init__(self, markup, known_definite_encodings=None,
345                 is_html=False, exclude_encodings=None,
346                 user_encodings=None, override_encodings=None):
347        """Constructor.
349        :param markup: Some markup in an unknown encoding.
351        :param known_definite_encodings: When determining the encoding
352            of `markup`, these encodings will be tried first, in
353            order. In HTML terms, this corresponds to the "known
354            definite encoding" step defined here:
355            https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding
357        :param user_encodings: These encodings will be tried after the
358            `known_definite_encodings` have been tried and failed, and
359            after an attempt to sniff the encoding by looking at a
360            byte order mark has failed. In HTML terms, this
361            corresponds to the step "user has explicitly instructed
362            the user agent to override the document's character
363            encoding", defined here:
364            https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
366        :param override_encodings: A deprecated alias for
367            known_definite_encodings. Any encodings here will be tried
368            immediately after the encodings in
369            known_definite_encodings.
371        :param is_html: If True, this markup is considered to be
372            HTML. Otherwise it's assumed to be XML.
374        :param exclude_encodings: These encodings will not be tried,
375            even if they otherwise would be.
377        """
378        self.known_definite_encodings = list(known_definite_encodings or [])
379        if override_encodings:
380            self.known_definite_encodings += override_encodings
381        self.user_encodings = user_encodings or []
382        exclude_encodings = exclude_encodings or []
383        self.exclude_encodings = set([x.lower() for x in exclude_encodings])
384        self.chardet_encoding = None
385        self.is_html = is_html
386        self.declared_encoding = None
388        # First order of business: strip a byte-order mark.
389        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
391    def _usable(self, encoding, tried):
392        """Should we even bother to try this encoding?
394        :param encoding: Name of an encoding.
395        :param tried: Encodings that have already been tried. This will be modified
396            as a side effect.
397        """
398        if encoding is not None:
399            encoding = encoding.lower()
400            if encoding in self.exclude_encodings:
401                return False
402            if encoding not in tried:
403                tried.add(encoding)
404                return True
405        return False
407    @property
408    def encodings(self):
409        """Yield a number of encodings that might work for this markup.
411        :yield: A sequence of strings.
412        """
413        tried = set()
415        # First, try the known definite encodings
416        for e in self.known_definite_encodings:
417            if self._usable(e, tried):
418                yield e
420        # Did the document originally start with a byte-order mark
421        # that indicated its encoding?
422        if self._usable(self.sniffed_encoding, tried):
423            yield self.sniffed_encoding
425        # Sniffing the byte-order mark did nothing; try the user
426        # encodings.
427        for e in self.user_encodings:
428            if self._usable(e, tried):
429                yield e
431        # Look within the document for an XML or HTML encoding
432        # declaration.
433        if self.declared_encoding is None:
434            self.declared_encoding = self.find_declared_encoding(
435                self.markup, self.is_html)
436        if self._usable(self.declared_encoding, tried):
437            yield self.declared_encoding
439        # Use third-party character set detection to guess at the
440        # encoding.
441        if self.chardet_encoding is None:
442            self.chardet_encoding = chardet_dammit(self.markup)
443        if self._usable(self.chardet_encoding, tried):
444            yield self.chardet_encoding
446        # As a last-ditch effort, try utf-8 and windows-1252.
447        for e in ('utf-8', 'windows-1252'):
448            if self._usable(e, tried):
449                yield e
451    @classmethod
452    def strip_byte_order_mark(cls, data):
453        """If a byte-order mark is present, strip it and return the encoding it implies.
455        :param data: Some markup.
456        :return: A 2-tuple (modified data, implied encoding)
457        """
458        encoding = None
459        if isinstance(data, str):
460            # Unicode data cannot have a byte-order mark.
461            return data, encoding
462        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
463               and (data[2:4] != '\x00\x00'):
464            encoding = 'utf-16be'
465            data = data[2:]
466        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
467                 and (data[2:4] != '\x00\x00'):
468            encoding = 'utf-16le'
469            data = data[2:]
470        elif data[:3] == b'\xef\xbb\xbf':
471            encoding = 'utf-8'
472            data = data[3:]
473        elif data[:4] == b'\x00\x00\xfe\xff':
474            encoding = 'utf-32be'
475            data = data[4:]
476        elif data[:4] == b'\xff\xfe\x00\x00':
477            encoding = 'utf-32le'
478            data = data[4:]
479        return data, encoding
481    @classmethod
482    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
483        """Given a document, tries to find its declared encoding.
485        An XML encoding is declared at the beginning of the document.
487        An HTML encoding is declared in a <meta> tag, hopefully near the
488        beginning of the document.
490        :param markup: Some markup.
491        :param is_html: If True, this markup is considered to be HTML. Otherwise
492            it's assumed to be XML.
493        :param search_entire_document: Since an encoding is supposed to declared near the beginning
494            of the document, most of the time it's only necessary to search a few kilobytes of data.
495            Set this to True to force this method to search the entire document.
496        """
497        if search_entire_document:
498            xml_endpos = html_endpos = len(markup)
499        else:
500            xml_endpos = 1024
501            html_endpos = max(2048, int(len(markup) * 0.05))
503        if isinstance(markup, bytes):
504            res = encoding_res[bytes]
505        else:
506            res = encoding_res[str]
508        xml_re = res['xml']
509        html_re = res['html']
510        declared_encoding = None
511        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
512        if not declared_encoding_match and is_html:
513            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
514        if declared_encoding_match is not None:
515            declared_encoding = declared_encoding_match.groups()[0]
516        if declared_encoding:
517            if isinstance(declared_encoding, bytes):
518                declared_encoding = declared_encoding.decode('ascii', 'replace')
519            return declared_encoding.lower()
520        return None
522class UnicodeDammit:
523    """A class for detecting the encoding of a *ML document and
524    converting it to a Unicode string. If the source encoding is
525    windows-1252, can replace MS smart quotes with their HTML or XML
526    equivalents."""
528    # This dictionary maps commonly seen values for "charset" in HTML
529    # meta tags to the corresponding Python codec names. It only covers
530    # values that aren't in Python's aliases and can't be determined
531    # by the heuristics in find_codec.
532    CHARSET_ALIASES = {"macintosh": "mac-roman",
533                       "x-sjis": "shift-jis"}
536        "windows-1252",
537        "iso-8859-1",
538        "iso-8859-2",
539        ]
541    def __init__(self, markup, known_definite_encodings=[],
542                 smart_quotes_to=None, is_html=False, exclude_encodings=[],
543                 user_encodings=None, override_encodings=None
544    ):
545        """Constructor.
547        :param markup: A bytestring representing markup in an unknown encoding.
549        :param known_definite_encodings: When determining the encoding
550            of `markup`, these encodings will be tried first, in
551            order. In HTML terms, this corresponds to the "known
552            definite encoding" step defined here:
553            https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding
555        :param user_encodings: These encodings will be tried after the
556            `known_definite_encodings` have been tried and failed, and
557            after an attempt to sniff the encoding by looking at a
558            byte order mark has failed. In HTML terms, this
559            corresponds to the step "user has explicitly instructed
560            the user agent to override the document's character
561            encoding", defined here:
562            https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
564        :param override_encodings: A deprecated alias for
565            known_definite_encodings. Any encodings here will be tried
566            immediately after the encodings in
567            known_definite_encodings.
569        :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted
570           to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead.
571           Setting it to 'xml' will convert them to XML entity references, and setting it to 'html'
572           will convert them to HTML entity references.
573        :param is_html: If True, this markup is considered to be HTML. Otherwise
574            it's assumed to be XML.
575        :param exclude_encodings: These encodings will not be considered, even
576            if the sniffing code thinks they might make sense.
578        """
579        self.smart_quotes_to = smart_quotes_to
580        self.tried_encodings = []
581        self.contains_replacement_characters = False
582        self.is_html = is_html
583        self.log = logging.getLogger(__name__)
584        self.detector = EncodingDetector(
585            markup, known_definite_encodings, is_html, exclude_encodings,
586            user_encodings, override_encodings
587        )
589        # Short-circuit if the data is in Unicode to begin with.
590        if isinstance(markup, str) or markup == '':
591            self.markup = markup
592            self.unicode_markup = str(markup)
593            self.original_encoding = None
594            return
596        # The encoding detector may have stripped a byte-order mark.
597        # Use the stripped markup from this point on.
598        self.markup = self.detector.markup
600        u = None
601        for encoding in self.detector.encodings:
602            markup = self.detector.markup
603            u = self._convert_from(encoding)
604            if u is not None:
605                break
607        if not u:
608            # None of the encodings worked. As an absolute last resort,
609            # try them again with character replacement.
611            for encoding in self.detector.encodings:
612                if encoding != "ascii":
613                    u = self._convert_from(encoding, "replace")
614                if u is not None:
615                    self.log.warning(
616                            "Some characters could not be decoded, and were "
617                            "replaced with REPLACEMENT CHARACTER."
618                    )
619                    self.contains_replacement_characters = True
620                    break
622        # If none of that worked, we could at this point force it to
623        # ASCII, but that would destroy so much data that I think
624        # giving up is better.
625        self.unicode_markup = u
626        if not u:
627            self.original_encoding = None
629    def _sub_ms_char(self, match):
630        """Changes a MS smart quote character to an XML or HTML
631        entity, or an ASCII character."""
632        orig = match.group(1)
633        if self.smart_quotes_to == 'ascii':
634            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
635        else:
636            sub = self.MS_CHARS.get(orig)
637            if type(sub) == tuple:
638                if self.smart_quotes_to == 'xml':
639                    sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
640                else:
641                    sub = '&'.encode() + sub[0].encode() + ';'.encode()
642            else:
643                sub = sub.encode()
644        return sub
646    def _convert_from(self, proposed, errors="strict"):
647        """Attempt to convert the markup to the proposed encoding.
649        :param proposed: The name of a character encoding.
650        """
651        proposed = self.find_codec(proposed)
652        if not proposed or (proposed, errors) in self.tried_encodings:
653            return None
654        self.tried_encodings.append((proposed, errors))
655        markup = self.markup
656        # Convert smart quotes to HTML if coming from an encoding
657        # that might have them.
658        if (self.smart_quotes_to is not None
659            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
660            smart_quotes_re = b"([\x80-\x9f])"
661            smart_quotes_compiled = re.compile(smart_quotes_re)
662            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
664        try:
665            #print("Trying to convert document to %s (errors=%s)" % (
666            #    proposed, errors))
667            u = self._to_unicode(markup, proposed, errors)
668            self.markup = u
669            self.original_encoding = proposed
670        except Exception as e:
671            #print("That didn't work!")
672            #print(e)
673            return None
674        #print("Correct encoding: %s" % proposed)
675        return self.markup
677    def _to_unicode(self, data, encoding, errors="strict"):
678        """Given a string and its encoding, decodes the string into Unicode.
680        :param encoding: The name of an encoding.
681        """
682        return str(data, encoding, errors)
684    @property
685    def declared_html_encoding(self):
686        """If the markup is an HTML document, returns the encoding declared _within_
687        the document.
688        """
689        if not self.is_html:
690            return None
691        return self.detector.declared_encoding
693    def find_codec(self, charset):
694        """Convert the name of a character set to a codec name.
696        :param charset: The name of a character set.
697        :return: The name of a codec.
698        """
699        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
700               or (charset and self._codec(charset.replace("-", "")))
701               or (charset and self._codec(charset.replace("-", "_")))
702               or (charset and charset.lower())
703               or charset
704                )
705        if value:
706            return value.lower()
707        return None
709    def _codec(self, charset):
710        if not charset:
711            return charset
712        codec = None
713        try:
714            codecs.lookup(charset)
715            codec = charset
716        except (LookupError, ValueError):
717            pass
718        return codec
721    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
722    MS_CHARS = {b'\x80': ('euro', '20AC'),
723                b'\x81': ' ',
724                b'\x82': ('sbquo', '201A'),
725                b'\x83': ('fnof', '192'),
726                b'\x84': ('bdquo', '201E'),
727                b'\x85': ('hellip', '2026'),
728                b'\x86': ('dagger', '2020'),
729                b'\x87': ('Dagger', '2021'),
730                b'\x88': ('circ', '2C6'),
731                b'\x89': ('permil', '2030'),
732                b'\x8A': ('Scaron', '160'),
733                b'\x8B': ('lsaquo', '2039'),
734                b'\x8C': ('OElig', '152'),
735                b'\x8D': '?',
736                b'\x8E': ('#x17D', '17D'),
737                b'\x8F': '?',
738                b'\x90': '?',
739                b'\x91': ('lsquo', '2018'),
740                b'\x92': ('rsquo', '2019'),
741                b'\x93': ('ldquo', '201C'),
742                b'\x94': ('rdquo', '201D'),
743                b'\x95': ('bull', '2022'),
744                b'\x96': ('ndash', '2013'),
745                b'\x97': ('mdash', '2014'),
746                b'\x98': ('tilde', '2DC'),
747                b'\x99': ('trade', '2122'),
748                b'\x9a': ('scaron', '161'),
749                b'\x9b': ('rsaquo', '203A'),
750                b'\x9c': ('oelig', '153'),
751                b'\x9d': '?',
752                b'\x9e': ('#x17E', '17E'),
753                b'\x9f': ('Yuml', ''),}
755    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
756    # horrors like stripping diacritical marks to turn á into a, but also
757    # contains non-horrors like turning “ into ".
758    MS_CHARS_TO_ASCII = {
759        b'\x80' : 'EUR',
760        b'\x81' : ' ',
761        b'\x82' : ',',
762        b'\x83' : 'f',
763        b'\x84' : ',,',
764        b'\x85' : '...',
765        b'\x86' : '+',
766        b'\x87' : '++',
767        b'\x88' : '^',
768        b'\x89' : '%',
769        b'\x8a' : 'S',
770        b'\x8b' : '<',
771        b'\x8c' : 'OE',
772        b'\x8d' : '?',
773        b'\x8e' : 'Z',
774        b'\x8f' : '?',
775        b'\x90' : '?',
776        b'\x91' : "'",
777        b'\x92' : "'",
778        b'\x93' : '"',
779        b'\x94' : '"',
780        b'\x95' : '*',
781        b'\x96' : '-',
782        b'\x97' : '--',
783        b'\x98' : '~',
784        b'\x99' : '(TM)',
785        b'\x9a' : 's',
786        b'\x9b' : '>',
787        b'\x9c' : 'oe',
788        b'\x9d' : '?',
789        b'\x9e' : 'z',
790        b'\x9f' : 'Y',
791        b'\xa0' : ' ',
792        b'\xa1' : '!',
793        b'\xa2' : 'c',
794        b'\xa3' : 'GBP',
795        b'\xa4' : '$', #This approximation is especially parochial--this is the
796                       #generic currency symbol.
797        b'\xa5' : 'YEN',
798        b'\xa6' : '|',
799        b'\xa7' : 'S',
800        b'\xa8' : '..',
801        b'\xa9' : '',
802        b'\xaa' : '(th)',
803        b'\xab' : '<<',
804        b'\xac' : '!',
805        b'\xad' : ' ',
806        b'\xae' : '(R)',
807        b'\xaf' : '-',
808        b'\xb0' : 'o',
809        b'\xb1' : '+-',
810        b'\xb2' : '2',
811        b'\xb3' : '3',
812        b'\xb4' : ("'", 'acute'),
813        b'\xb5' : 'u',
814        b'\xb6' : 'P',
815        b'\xb7' : '*',
816        b'\xb8' : ',',
817        b'\xb9' : '1',
818        b'\xba' : '(th)',
819        b'\xbb' : '>>',
820        b'\xbc' : '1/4',
821        b'\xbd' : '1/2',
822        b'\xbe' : '3/4',
823        b'\xbf' : '?',
824        b'\xc0' : 'A',
825        b'\xc1' : 'A',
826        b'\xc2' : 'A',
827        b'\xc3' : 'A',
828        b'\xc4' : 'A',
829        b'\xc5' : 'A',
830        b'\xc6' : 'AE',
831        b'\xc7' : 'C',
832        b'\xc8' : 'E',
833        b'\xc9' : 'E',
834        b'\xca' : 'E',
835        b'\xcb' : 'E',
836        b'\xcc' : 'I',
837        b'\xcd' : 'I',
838        b'\xce' : 'I',
839        b'\xcf' : 'I',
840        b'\xd0' : 'D',
841        b'\xd1' : 'N',
842        b'\xd2' : 'O',
843        b'\xd3' : 'O',
844        b'\xd4' : 'O',
845        b'\xd5' : 'O',
846        b'\xd6' : 'O',
847        b'\xd7' : '*',
848        b'\xd8' : 'O',
849        b'\xd9' : 'U',
850        b'\xda' : 'U',
851        b'\xdb' : 'U',
852        b'\xdc' : 'U',
853        b'\xdd' : 'Y',
854        b'\xde' : 'b',
855        b'\xdf' : 'B',
856        b'\xe0' : 'a',
857        b'\xe1' : 'a',
858        b'\xe2' : 'a',
859        b'\xe3' : 'a',
860        b'\xe4' : 'a',
861        b'\xe5' : 'a',
862        b'\xe6' : 'ae',
863        b'\xe7' : 'c',
864        b'\xe8' : 'e',
865        b'\xe9' : 'e',
866        b'\xea' : 'e',
867        b'\xeb' : 'e',
868        b'\xec' : 'i',
869        b'\xed' : 'i',
870        b'\xee' : 'i',
871        b'\xef' : 'i',
872        b'\xf0' : 'o',
873        b'\xf1' : 'n',
874        b'\xf2' : 'o',
875        b'\xf3' : 'o',
876        b'\xf4' : 'o',
877        b'\xf5' : 'o',
878        b'\xf6' : 'o',
879        b'\xf7' : '/',
880        b'\xf8' : 'o',
881        b'\xf9' : 'u',
882        b'\xfa' : 'u',
883        b'\xfb' : 'u',
884        b'\xfc' : 'u',
885        b'\xfd' : 'y',
886        b'\xfe' : 'b',
887        b'\xff' : 'y',
888        }
890    # A map used when removing rogue Windows-1252/ISO-8859-1
891    # characters in otherwise UTF-8 documents.
892    #
893    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
894    # Windows-1252.
895    WINDOWS_1252_TO_UTF8 = {
896        0x80 : b'\xe2\x82\xac', # €
897        0x82 : b'\xe2\x80\x9a', # ‚
898        0x83 : b'\xc6\x92',     # ƒ
899        0x84 : b'\xe2\x80\x9e', # „
900        0x85 : b'\xe2\x80\xa6', # …
901        0x86 : b'\xe2\x80\xa0', # †
902        0x87 : b'\xe2\x80\xa1', # ‡
903        0x88 : b'\xcb\x86',     # ˆ
904        0x89 : b'\xe2\x80\xb0', # ‰
905        0x8a : b'\xc5\xa0',     # Š
906        0x8b : b'\xe2\x80\xb9', # ‹
907        0x8c : b'\xc5\x92',     # Œ
908        0x8e : b'\xc5\xbd',     # Ž
909        0x91 : b'\xe2\x80\x98', # ‘
910        0x92 : b'\xe2\x80\x99', # ’
911        0x93 : b'\xe2\x80\x9c', # “
912        0x94 : b'\xe2\x80\x9d', # ”
913        0x95 : b'\xe2\x80\xa2', # •
914        0x96 : b'\xe2\x80\x93', # –
915        0x97 : b'\xe2\x80\x94', # —
916        0x98 : b'\xcb\x9c',     # ˜
917        0x99 : b'\xe2\x84\xa2', # ™
918        0x9a : b'\xc5\xa1',     # š
919        0x9b : b'\xe2\x80\xba', # ›
920        0x9c : b'\xc5\x93',     # œ
921        0x9e : b'\xc5\xbe',     # ž
922        0x9f : b'\xc5\xb8',     # Ÿ
923        0xa0 : b'\xc2\xa0',     #  
924        0xa1 : b'\xc2\xa1',     # ¡
925        0xa2 : b'\xc2\xa2',     # ¢
926        0xa3 : b'\xc2\xa3',     # £
927        0xa4 : b'\xc2\xa4',     # ¤
928        0xa5 : b'\xc2\xa5',     # ¥
929        0xa6 : b'\xc2\xa6',     # ¦
930        0xa7 : b'\xc2\xa7',     # §
931        0xa8 : b'\xc2\xa8',     # ¨
932        0xa9 : b'\xc2\xa9',     # ©
933        0xaa : b'\xc2\xaa',     # ª
934        0xab : b'\xc2\xab',     # «
935        0xac : b'\xc2\xac',     # ¬
936        0xad : b'\xc2\xad',     # ­
937        0xae : b'\xc2\xae',     # ®
938        0xaf : b'\xc2\xaf',     # ¯
939        0xb0 : b'\xc2\xb0',     # °
940        0xb1 : b'\xc2\xb1',     # ±
941        0xb2 : b'\xc2\xb2',     # ²
942        0xb3 : b'\xc2\xb3',     # ³
943        0xb4 : b'\xc2\xb4',     # ´
944        0xb5 : b'\xc2\xb5',     # µ
945        0xb6 : b'\xc2\xb6',     # ¶
946        0xb7 : b'\xc2\xb7',     # ·
947        0xb8 : b'\xc2\xb8',     # ¸
948        0xb9 : b'\xc2\xb9',     # ¹
949        0xba : b'\xc2\xba',     # º
950        0xbb : b'\xc2\xbb',     # »
951        0xbc : b'\xc2\xbc',     # ¼
952        0xbd : b'\xc2\xbd',     # ½
953        0xbe : b'\xc2\xbe',     # ¾
954        0xbf : b'\xc2\xbf',     # ¿
955        0xc0 : b'\xc3\x80',     # À
956        0xc1 : b'\xc3\x81',     # Á
957        0xc2 : b'\xc3\x82',     # Â
958        0xc3 : b'\xc3\x83',     # Ã
959        0xc4 : b'\xc3\x84',     # Ä
960        0xc5 : b'\xc3\x85',     # Å
961        0xc6 : b'\xc3\x86',     # Æ
962        0xc7 : b'\xc3\x87',     # Ç
963        0xc8 : b'\xc3\x88',     # È
964        0xc9 : b'\xc3\x89',     # É
965        0xca : b'\xc3\x8a',     # Ê
966        0xcb : b'\xc3\x8b',     # Ë
967        0xcc : b'\xc3\x8c',     # Ì
968        0xcd : b'\xc3\x8d',     # Í
969        0xce : b'\xc3\x8e',     # Î
970        0xcf : b'\xc3\x8f',     # Ï
971        0xd0 : b'\xc3\x90',     # Ð
972        0xd1 : b'\xc3\x91',     # Ñ
973        0xd2 : b'\xc3\x92',     # Ò
974        0xd3 : b'\xc3\x93',     # Ó
975        0xd4 : b'\xc3\x94',     # Ô
976        0xd5 : b'\xc3\x95',     # Õ
977        0xd6 : b'\xc3\x96',     # Ö
978        0xd7 : b'\xc3\x97',     # ×
979        0xd8 : b'\xc3\x98',     # Ø
980        0xd9 : b'\xc3\x99',     # Ù
981        0xda : b'\xc3\x9a',     # Ú
982        0xdb : b'\xc3\x9b',     # Û
983        0xdc : b'\xc3\x9c',     # Ü
984        0xdd : b'\xc3\x9d',     # Ý
985        0xde : b'\xc3\x9e',     # Þ
986        0xdf : b'\xc3\x9f',     # ß
987        0xe0 : b'\xc3\xa0',     # à
988        0xe1 : b'\xa1',         # á
989        0xe2 : b'\xc3\xa2',     # â
990        0xe3 : b'\xc3\xa3',     # ã
991        0xe4 : b'\xc3\xa4',     # ä
992        0xe5 : b'\xc3\xa5',     # å
993        0xe6 : b'\xc3\xa6',     # æ
994        0xe7 : b'\xc3\xa7',     # ç
995        0xe8 : b'\xc3\xa8',     # è
996        0xe9 : b'\xc3\xa9',     # é
997        0xea : b'\xc3\xaa',     # ê
998        0xeb : b'\xc3\xab',     # ë
999        0xec : b'\xc3\xac',     # ì
1000        0xed : b'\xc3\xad',     # í
1001        0xee : b'\xc3\xae',     # î
1002        0xef : b'\xc3\xaf',     # ï
1003        0xf0 : b'\xc3\xb0',     # ð
1004        0xf1 : b'\xc3\xb1',     # ñ
1005        0xf2 : b'\xc3\xb2',     # ò
1006        0xf3 : b'\xc3\xb3',     # ó
1007        0xf4 : b'\xc3\xb4',     # ô
1008        0xf5 : b'\xc3\xb5',     # õ
1009        0xf6 : b'\xc3\xb6',     # ö
1010        0xf7 : b'\xc3\xb7',     # ÷
1011        0xf8 : b'\xc3\xb8',     # ø
1012        0xf9 : b'\xc3\xb9',     # ù
1013        0xfa : b'\xc3\xba',     # ú
1014        0xfb : b'\xc3\xbb',     # û
1015        0xfc : b'\xc3\xbc',     # ü
1016        0xfd : b'\xc3\xbd',     # ý
1017        0xfe : b'\xc3\xbe',     # þ
1018        }
1021        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
1022        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
1023        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
1024        ]
1029    @classmethod
1030    def detwingle(cls, in_bytes, main_encoding="utf8",
1031                  embedded_encoding="windows-1252"):
1032        """Fix characters from one encoding embedded in some other encoding.
1034        Currently the only situation supported is Windows-1252 (or its
1035        subset ISO-8859-1), embedded in UTF-8.
1037        :param in_bytes: A bytestring that you suspect contains
1038            characters from multiple encodings. Note that this _must_
1039            be a bytestring. If you've already converted the document
1040            to Unicode, you're too late.
1041        :param main_encoding: The primary encoding of `in_bytes`.
1042        :param embedded_encoding: The encoding that was used to embed characters
1043            in the main document.
1044        :return: A bytestring in which `embedded_encoding`
1045          characters have been converted to their `main_encoding`
1046          equivalents.
1047        """
1048        if embedded_encoding.replace('_', '-').lower() not in (
1049            'windows-1252', 'windows_1252'):
1050            raise NotImplementedError(
1051                "Windows-1252 and ISO-8859-1 are the only currently supported "
1052                "embedded encodings.")
1054        if main_encoding.lower() not in ('utf8', 'utf-8'):
1055            raise NotImplementedError(
1056                "UTF-8 is the only currently supported main encoding.")
1058        byte_chunks = []
1060        chunk_start = 0
1061        pos = 0
1062        while pos < len(in_bytes):
1063            byte = in_bytes[pos]
1064            if not isinstance(byte, int):
1065                # Python 2.x
1066                byte = ord(byte)
1067            if (byte >= cls.FIRST_MULTIBYTE_MARKER
1068                and byte <= cls.LAST_MULTIBYTE_MARKER):
1069                # This is the start of a UTF-8 multibyte character. Skip
1070                # to the end.
1071                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
1072                    if byte >= start and byte <= end:
1073                        pos += size
1074                        break
1075            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
1076                # We found a Windows-1252 character!
1077                # Save the string up to this point as a chunk.
1078                byte_chunks.append(in_bytes[chunk_start:pos])
1080                # Now translate the Windows-1252 character into UTF-8
1081                # and add it as another, one-byte chunk.
1082                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
1083                pos += 1
1084                chunk_start = pos
1085            else:
1086                # Go on to the next character.
1087                pos += 1
1088        if chunk_start == 0:
1089            # The string is unchanged.
1090            return in_bytes
1091        else:
1092            # Store the final chunk.
1093            byte_chunks.append(in_bytes[chunk_start:])
1094        return b''.join(byte_chunks)