xref: /openbmc/openbmc/poky/bitbake/lib/bs4/dammit.py (revision eb8dc403)
1# -*- coding: utf-8 -*-
2"""Beautiful Soup bonus library: Unicode, Dammit
3
4This library converts a bytestream to Unicode through any means
5necessary. It is heavily based on code from Mark Pilgrim's Universal
6Feed Parser. It works best on XML and HTML, but it does not rewrite the
7XML or HTML to reflect a new encoding; that's the tree builder's job.
8"""
9__license__ = "MIT"
10
11from pdb import set_trace
12import codecs
13from html.entities import codepoint2name
14import re
15import logging
16import string
17
18# Import a library to autodetect character encodings.
# Pick a character-encoding autodetection backend. Whichever branch
# runs, the module ends up with a chardet_dammit(s) callable that
# returns the guessed encoding name for the bytestring s, or None.
chardet_type = None
try:
    # Preferred backend: the fast C implementation.
    #  PyPI package: cchardet
    import cchardet
except ImportError:
    try:
        # Second choice: the pure Python implementation.
        #  Debian package: python-chardet
        #  PyPI package: chardet
        import chardet
    except ImportError:
        # Neither library is installed, so no guess can be made.
        def chardet_dammit(s):
            """Stand-in detector used when no detection library exists."""
            return None
    else:
        def chardet_dammit(s):
            """Return the encoding name that chardet reports for s."""
            detected = chardet.detect(s)
            return detected['encoding']
else:
    def chardet_dammit(s):
        """Return the encoding name that cchardet reports for s."""
        detected = cchardet.detect(s)
        return detected['encoding']
40
41# Available from http://cjkpython.i18n.org/.
42try:
43    import iconv_codec
44except ImportError:
45    pass
46
# Matches an XML declaration at the very start of a bytestring and
# captures the value of its encoding pseudo-attribute in group 1.
# Written as a raw bytes literal so that the regex escapes (\?) are not
# interpreted -- and warned about -- as string escape sequences.
xml_encoding_re = re.compile(
    rb'^<\?.*encoding=[\'"](.*?)[\'"].*\?>', re.I)
# Matches a <meta ... charset=...> tag anywhere in a bytestring and
# captures the charset value in group 1.
html_meta_re = re.compile(
    rb'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]', re.I)
51
class EntitySubstitution(object):

    """Substitute XML or HTML entities for the corresponding characters."""

    def _populate_class_variables():
        """Build the HTML entity tables from html.entities.codepoint2name.

        Called once while the class body is being executed, which is
        why it takes no ``self`` or ``cls`` argument.

        :return: A 3-tuple of (dict mapping a character to its entity
         name, dict mapping an entity name to its character, compiled
         regular expression matching any one character that has an
         entry in the first dict).
        """
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []
        for codepoint, name in codepoint2name.items():
            character = chr(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
                # is handled elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to turn &quot; into the quotation mark.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    # The characters that have predefined entities in XML.
    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
        }

    # Matches an angle bracket, or an ampersand that does not start
    # something shaped like an entity (&#123; &#x1F; or &name;).
    BARE_AMPERSAND_OR_BRACKET = re.compile(
        r"([<>]|"
        r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
        r")")

    # Matches any angle bracket or ampersand.
    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate named HTML entity for the matched character."""
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"

        :param value: The attribute value, as a string.
        :return: The value wrapped in quote characters.
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: A string.
        :return: The string with every character that has a named
         HTML entity (except the double quote) replaced by it.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
195
196
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding implied by a byte-order mark at the start of the
    bytestring.

    3. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    4. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    5. UTF-8.

    6. Windows-1252.
    """
    def __init__(self, markup, override_encodings=None, is_html=False,
                 exclude_encodings=None):
        """Set up the detector and strip any byte-order mark.

        :param markup: The document, as a bytestring (a Unicode string
         is stored unchanged).
        :param override_encodings: Encoding names to propose before
         any others.
        :param is_html: If True, a <meta> tag is also searched for an
         encoding declaration.
        :param exclude_encodings: Encoding names that must never be
         proposed; compared case-insensitively.
        """
        self.override_encodings = override_encodings or []
        exclude_encodings = exclude_encodings or []
        self.exclude_encodings = {x.lower() for x in exclude_encodings}
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Decide whether an encoding should be proposed.

        :param encoding: An encoding name, or None.
        :param tried: The set of lowercased names already proposed;
         a usable encoding is added to it.
        :return: True if the encoding is not None, not excluded and
         not already in `tried`.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding in self.exclude_encodings:
                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup.

        Each access starts a fresh generator. The declared encoding
        and the chardet guess are computed lazily and remembered on
        the instance.
        """
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: A bytestring, or a Unicode string (returned as is).
        :return: A 2-tuple (data without the byte-order mark, encoding
         name or None).
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        # The UTF-32LE mark (FF FE 00 00) begins with the UTF-16LE mark
        # (FF FE), so the UTF-16 branches must reject data whose next
        # two bytes are zero. The comparison has to be against a bytes
        # literal: bytes never compare equal to str.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: The document, as a bytestring.
        :param is_html: If True, fall back to looking for a <meta>
         tag when no XML declaration is found.
        :param search_entire_document: If False, only the start of the
         document is searched: the first 1024 bytes for an XML
         declaration, and the first 2048 bytes or 5% of the document
         (whichever is larger) for a <meta> tag.
        :return: The lowercased encoding name, or None.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0].decode(
                'ascii', 'replace')
        if declared_encoding:
            return declared_encoding.lower()
        return None
323
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    # Smart-quote substitution is only attempted for these codecs.
    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
        ]

    # Matches one byte in the range 0x80-0x9f, where Windows-1252 keeps
    # its smart quotes and related punctuation. Compiled once here
    # rather than on every conversion attempt.
    _SMART_QUOTES_RE = re.compile(b"([\x80-\x9f])")

    def __init__(self, markup, override_encodings=None,
                 smart_quotes_to=None, is_html=False, exclude_encodings=None):
        """Detect the encoding of `markup` and convert it to Unicode.

        On return, `unicode_markup` holds the decoded document (None if
        no encoding worked), `original_encoding` holds the codec that
        was used (None if nothing was decoded), and
        `contains_replacement_characters` is True if decoding only
        succeeded with errors="replace".

        :param markup: The document, as a bytestring or Unicode string.
        :param override_encodings: Encoding names to try first.
        :param smart_quotes_to: What to do with bytes 0x80-0x9f when
         the codec is one of ENCODINGS_WITH_SMART_QUOTES: "ascii"
         replaces them with ASCII approximations, "xml" with numeric
         character references, any other non-None value with named
         HTML entities. None leaves them alone.
        :param is_html: If True, the document is treated as HTML when
         looking for a declared encoding.
        :param exclude_encodings: Encoding names never to try.
        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html

        self.detector = EncodingDetector(
            markup, override_encodings, is_html, exclude_encodings)

        # Short-circuit if the data is in Unicode to begin with, or if
        # there is nothing to decode. The empty case is compared
        # against a bytes literal because bytes never equal ''.
        if isinstance(markup, str) or markup == b'':
            self.markup = markup
            self.unicode_markup = str(markup) if isinstance(markup, str) else ''
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        # Test for None explicitly: an empty string is a successful
        # decode (e.g. a document that was nothing but a byte-order
        # mark), not a failure.
        if u is None:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.

            for encoding in self.detector.encodings:
                if encoding == "ascii":
                    continue
                u = self._convert_from(encoding, "replace")
                if u is not None:
                    logging.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER.")
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if u is None:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        :param match: A match object whose group 1 is a single byte.
        :return: The replacement, as a bytestring.
        """
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
        else:
            sub = self.MS_CHARS.get(orig)
            if isinstance(sub, tuple):
                # (entity name, hexadecimal code point)
                if self.smart_quotes_to == 'xml':
                    sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
                else:
                    sub = '&'.encode() + sub[0].encode() + ';'.encode()
            else:
                sub = sub.encode()
        return sub

    def _convert_from(self, proposed, errors="strict"):
        """Try to decode self.markup using one proposed encoding.

        Each (codec, errors) pair is attempted at most once per
        instance. On success, self.markup is replaced by the decoded
        string and self.original_encoding is set.

        :param proposed: An encoding name.
        :param errors: The error handling scheme passed to the decoder.
        :return: The decoded string, or None if the encoding is
         unknown, was already tried, or failed.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            markup = self._SMART_QUOTES_RE.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception:
            # Deliberately broad: any failure just means this
            # encoding is not the right one.
            return None
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''
        return str(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """The encoding the detector found declared in the document,
        or None if the document is not being treated as HTML."""
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Turn a charset name into a lowercased codec name.

        Tries, in order: the CHARSET_ALIASES entry (or the name
        itself), the name with hyphens removed, and the name with
        hyphens turned into underscores. If Python knows none of
        them, the lowercased name is returned anyway.

        :param charset: A charset name, or None.
        :return: A lowercased name, or None for an empty charset.
        """
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Return `charset` if codecs.lookup() accepts it, else None.

        A false `charset` is returned unchanged.
        """
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec


    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # Tuple values are (entity name, hexadecimal code point); plain
    # string values are used as the replacement text directly.
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                # LATIN CAPITAL LETTER Y WITH DIAERESIS is U+0178.
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        b'\xb4' : "'",
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252.
    WINDOWS_1252_TO_UTF8 = {
        0x80 : b'\xe2\x82\xac', # €
        0x82 : b'\xe2\x80\x9a', # ‚
        0x83 : b'\xc6\x92',     # ƒ
        0x84 : b'\xe2\x80\x9e', # „
        0x85 : b'\xe2\x80\xa6', # …
        0x86 : b'\xe2\x80\xa0', # †
        0x87 : b'\xe2\x80\xa1', # ‡
        0x88 : b'\xcb\x86',     # ˆ
        0x89 : b'\xe2\x80\xb0', # ‰
        0x8a : b'\xc5\xa0',     # Š
        0x8b : b'\xe2\x80\xb9', # ‹
        0x8c : b'\xc5\x92',     # Œ
        0x8e : b'\xc5\xbd',     # Ž
        0x91 : b'\xe2\x80\x98', # ‘
        0x92 : b'\xe2\x80\x99', # ’
        0x93 : b'\xe2\x80\x9c', # “
        0x94 : b'\xe2\x80\x9d', # ”
        0x95 : b'\xe2\x80\xa2', # •
        0x96 : b'\xe2\x80\x93', # –
        0x97 : b'\xe2\x80\x94', # —
        0x98 : b'\xcb\x9c',     # ˜
        0x99 : b'\xe2\x84\xa2', # ™
        0x9a : b'\xc5\xa1',     # š
        0x9b : b'\xe2\x80\xba', # ›
        0x9c : b'\xc5\x93',     # œ
        0x9e : b'\xc5\xbe',     # ž
        0x9f : b'\xc5\xb8',     # Ÿ
        0xa0 : b'\xc2\xa0',     # (no-break space)
        0xa1 : b'\xc2\xa1',     # ¡
        0xa2 : b'\xc2\xa2',     # ¢
        0xa3 : b'\xc2\xa3',     # £
        0xa4 : b'\xc2\xa4',     # ¤
        0xa5 : b'\xc2\xa5',     # ¥
        0xa6 : b'\xc2\xa6',     # ¦
        0xa7 : b'\xc2\xa7',     # §
        0xa8 : b'\xc2\xa8',     # ¨
        0xa9 : b'\xc2\xa9',     # ©
        0xaa : b'\xc2\xaa',     # ª
        0xab : b'\xc2\xab',     # «
        0xac : b'\xc2\xac',     # ¬
        0xad : b'\xc2\xad',     # (soft hyphen)
        0xae : b'\xc2\xae',     # ®
        0xaf : b'\xc2\xaf',     # ¯
        0xb0 : b'\xc2\xb0',     # °
        0xb1 : b'\xc2\xb1',     # ±
        0xb2 : b'\xc2\xb2',     # ²
        0xb3 : b'\xc2\xb3',     # ³
        0xb4 : b'\xc2\xb4',     # ´
        0xb5 : b'\xc2\xb5',     # µ
        0xb6 : b'\xc2\xb6',     # ¶
        0xb7 : b'\xc2\xb7',     # ·
        0xb8 : b'\xc2\xb8',     # ¸
        0xb9 : b'\xc2\xb9',     # ¹
        0xba : b'\xc2\xba',     # º
        0xbb : b'\xc2\xbb',     # »
        0xbc : b'\xc2\xbc',     # ¼
        0xbd : b'\xc2\xbd',     # ½
        0xbe : b'\xc2\xbe',     # ¾
        0xbf : b'\xc2\xbf',     # ¿
        0xc0 : b'\xc3\x80',     # À
        0xc1 : b'\xc3\x81',     # Á
        0xc2 : b'\xc3\x82',     # Â
        0xc3 : b'\xc3\x83',     # Ã
        0xc4 : b'\xc3\x84',     # Ä
        0xc5 : b'\xc3\x85',     # Å
        0xc6 : b'\xc3\x86',     # Æ
        0xc7 : b'\xc3\x87',     # Ç
        0xc8 : b'\xc3\x88',     # È
        0xc9 : b'\xc3\x89',     # É
        0xca : b'\xc3\x8a',     # Ê
        0xcb : b'\xc3\x8b',     # Ë
        0xcc : b'\xc3\x8c',     # Ì
        0xcd : b'\xc3\x8d',     # Í
        0xce : b'\xc3\x8e',     # Î
        0xcf : b'\xc3\x8f',     # Ï
        0xd0 : b'\xc3\x90',     # Ð
        0xd1 : b'\xc3\x91',     # Ñ
        0xd2 : b'\xc3\x92',     # Ò
        0xd3 : b'\xc3\x93',     # Ó
        0xd4 : b'\xc3\x94',     # Ô
        0xd5 : b'\xc3\x95',     # Õ
        0xd6 : b'\xc3\x96',     # Ö
        0xd7 : b'\xc3\x97',     # ×
        0xd8 : b'\xc3\x98',     # Ø
        0xd9 : b'\xc3\x99',     # Ù
        0xda : b'\xc3\x9a',     # Ú
        0xdb : b'\xc3\x9b',     # Û
        0xdc : b'\xc3\x9c',     # Ü
        0xdd : b'\xc3\x9d',     # Ý
        0xde : b'\xc3\x9e',     # Þ
        0xdf : b'\xc3\x9f',     # ß
        0xe0 : b'\xc3\xa0',     # à
        0xe1 : b'\xc3\xa1',     # á
        0xe2 : b'\xc3\xa2',     # â
        0xe3 : b'\xc3\xa3',     # ã
        0xe4 : b'\xc3\xa4',     # ä
        0xe5 : b'\xc3\xa5',     # å
        0xe6 : b'\xc3\xa6',     # æ
        0xe7 : b'\xc3\xa7',     # ç
        0xe8 : b'\xc3\xa8',     # è
        0xe9 : b'\xc3\xa9',     # é
        0xea : b'\xc3\xaa',     # ê
        0xeb : b'\xc3\xab',     # ë
        0xec : b'\xc3\xac',     # ì
        0xed : b'\xc3\xad',     # í
        0xee : b'\xc3\xae',     # î
        0xef : b'\xc3\xaf',     # ï
        0xf0 : b'\xc3\xb0',     # ð
        0xf1 : b'\xc3\xb1',     # ñ
        0xf2 : b'\xc3\xb2',     # ò
        0xf3 : b'\xc3\xb3',     # ó
        0xf4 : b'\xc3\xb4',     # ô
        0xf5 : b'\xc3\xb5',     # õ
        0xf6 : b'\xc3\xb6',     # ö
        0xf7 : b'\xc3\xb7',     # ÷
        0xf8 : b'\xc3\xb8',     # ø
        0xf9 : b'\xc3\xb9',     # ù
        0xfa : b'\xc3\xba',     # ú
        0xfb : b'\xc3\xbb',     # û
        0xfc : b'\xc3\xbc',     # ü
        0xfd : b'\xc3\xbd',     # ý
        0xfe : b'\xc3\xbe',     # þ
        }

    # (first lead byte, last lead byte, total length in bytes)
    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        The input must be a bytestring. If you've already converted
        the document to Unicode, you're too late.

        The output is a bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents.

        :param in_bytes: The document, as a bytestring.
        :param main_encoding: Must be "utf8" or "utf-8" (any case).
        :param embedded_encoding: Must be "windows-1252" or
         "windows_1252" (any case).
        :raises NotImplementedError: For any other encoding name.
        :return: The repaired bytestring; the input object itself if
         nothing had to be replaced.
        """
        # Underscores are normalized to hyphens first, so one
        # comparison covers both accepted spellings.
        if embedded_encoding.replace('_', '-').lower() != 'windows-1252':
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Python 2.x
                byte = ord(byte)
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)
840
841