xref: /openbmc/openbmc/poky/bitbake/lib/bs4/dammit.py (revision 82c905dc)
1# -*- coding: utf-8 -*-
2"""Beautiful Soup bonus library: Unicode, Dammit
3
4This library converts a bytestream to Unicode through any means
5necessary. It is heavily based on code from Mark Pilgrim's Universal
6Feed Parser. It works best on XML and HTML, but it does not rewrite the
7XML or HTML to reflect a new encoding; that's the tree builder's job.
8"""
9__license__ = "MIT"
10
11import codecs
12from html.entities import codepoint2name
13import re
14import logging
15
16# Import a library to autodetect character encodings.
chardet_type = None
try:
    # Preferred backend: the fast C implementation.
    #  PyPI package: cchardet
    import cchardet
except ImportError:
    try:
        # Second choice: the pure Python implementation.
        #  Debian package: python-chardet
        #  PyPI package: chardet
        import chardet
    except ImportError:
        # Neither library is installed, so no guess can be offered.
        def chardet_dammit(s):
            """Stand-in detector that never suggests an encoding."""
            return None
    else:
        def chardet_dammit(s):
            """Return the encoding chardet guesses for the bytestring `s`."""
            return chardet.detect(s)['encoding']
else:
    def chardet_dammit(s):
        """Return the encoding cchardet guesses for the bytestring `s`."""
        return cchardet.detect(s)['encoding']
# Matches an XML declaration at the very start of a bytestring and
# captures the value of its encoding attribute.
xml_encoding_re = re.compile(
    rb'^<\?.*encoding=[\'"](.*?)[\'"].*\?>', re.IGNORECASE)
# Matches an HTML <meta> tag carrying a charset and captures the charset.
html_meta_re = re.compile(
    rb'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]', re.IGNORECASE)
43
class EntitySubstitution(object):

    """Replace special characters with their XML or HTML entities."""

    def _populate_class_variables():
        # Runs once, while the class body executes, to derive the HTML
        # lookup tables and regular expression from codepoint2name.
        entity_by_character = {}
        character_by_entity = {}
        for codepoint, name in codepoint2name.items():
            character = chr(codepoint)
            # Every entity, &quot; included, decodes to its character...
            character_by_entity[name] = character
            if codepoint == 34:
                # ...but there's no point in turning the quotation mark
                # into &quot; unless it happens within an attribute
                # value, which is handled elsewhere.
                continue
            entity_by_character[character] = name
        character_class = "[%s]" % "".join(entity_by_character)
        return (entity_by_character, character_by_entity,
                re.compile(character_class))
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
        }

    # An angle bracket, or an ampersand that does not begin something
    # that looks like a numeric or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(
        r"([<>]|&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;))")

    AMPERSAND_OR_BRACKET = re.compile(r"([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Regex callback: return the named HTML entity for the
        character that was matched."""
        return "&%s;" % cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Regex callback: return the XML entity for the XML special
        character that was matched."""
        return "&%s;" % cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]

    @classmethod
    def quoted_attribute_value(cls, value):
        """Wrap a string in quotes so it can be used as an attribute value.

         A string without double quotes is wrapped in double quotes.

          Bob's Bar -> "Bob's Bar"

         A string with double quotes but no single quotes is wrapped in
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         A string with both kinds of quote has its double quotes turned
         into &quot; and is wrapped in double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"
        """
        if '"' not in value:
            return '"' + value + '"'
        if "'" not in value:
            # Only double quotes are present, so single quotes can
            # delimit the value without any escaping.
            return "'" + value + "'"
        # Both kinds of quote are present. The double quotes are the
        # ones escaped because their entity is "&quot;" in both HTML
        # and XML; escaping single quotes would mean choosing between
        # &apos; and &squot;.
        return '"' + value.replace('"', "&quot;") + '"'

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Escape every XML special character in a string.

        :param value: The string to escape. Each less-than sign becomes
          &lt;, each greater-than sign becomes &gt;, and each ampersand
          becomes &amp;, including ampersands that already start an
          entity. Use substitute_xml_containing_entities() to leave
          existing entities alone.

        :param make_quoted_attribute: If True, the escaped string is
         also quoted, as befits an attribute value.
        """
        escaped = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)
        if not make_quoted_attribute:
            return escaped
        return cls.quoted_attribute_value(escaped)

    @classmethod
    def substitute_xml_containing_entities(
        cls, value, make_quoted_attribute=False):
        """Escape XML special characters, leaving existing entities alone.

        :param value: The string to escape. Each less-than sign becomes
          &lt;, each greater-than sign becomes &gt;, and each ampersand
          that is not part of an entity definition becomes &amp;.

        :param make_quoted_attribute: If True, the escaped string is
         also quoted, as befits an attribute value.
        """
        escaped = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)
        if not make_quoted_attribute:
            return escaped
        return cls.quoted_attribute_value(escaped)

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        Unlike data.encode(encoding, 'xmlcharrefreplace'), this is not
        about recovering from errors. The aim is readability on ASCII
        displays: a LATIN SMALL LETTER E WITH ACUTE is perfectly valid
        in a UTF-8 string, but "&eacute;" is easier for some people to
        read.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
187
188
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding implied by a byte-order mark at the start of the
    bytestring.

    3. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    4. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    5. UTF-8.

    6. Windows-1252.
    """
    def __init__(self, markup, override_encodings=None, is_html=False,
                 exclude_encodings=None):
        """Set up the detector and strip any byte-order mark.

        :param markup: The document, as a bytestring or a string.
        :param override_encodings: Encodings to suggest before any other.
        :param is_html: If True, a <meta> tag is also searched for a
         declared encoding.
        :param exclude_encodings: Encodings that must never be
         suggested. Compared case-insensitively.
        """
        self.override_encodings = override_encodings or []
        exclude_encodings = exclude_encodings or []
        self.exclude_encodings = {x.lower() for x in exclude_encodings}
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Decide whether an encoding should be suggested.

        Returns True only for an encoding that is not None, is not
        excluded, and has not been seen before. A True result also
        records the lowercased encoding in the `tried` set.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding in self.exclude_encodings:
                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup."""
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration. The result is cached on the instance.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding. The result is cached on the instance.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: The document, as a bytestring or a string.
        :return: A 2-tuple (data without the byte-order mark, encoding).
         The encoding is None when no byte-order mark was found.
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        # The UTF-32LE mark (FF FE 00 00) begins with the UTF-16LE mark
        # (FF FE), so the UTF-16 branches must reject data whose next
        # two bytes are NUL. That comparison has to be made against a
        # bytes literal: bytes never compare equal to str.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: The document, as a bytestring.
        :param is_html: If True, fall back to looking for a <meta> tag
         when no XML declaration is found.
        :param search_entire_document: If True, search the whole
         document rather than only its beginning.
        :return: The declared encoding, lowercased, or None.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            # Only look at the start of the document: the first 1024
            # bytes for XML; for HTML the first 2048 bytes or the first
            # 5% of the document, whichever is larger.
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0].decode(
                'ascii', 'replace')
        if declared_encoding:
            return declared_encoding.lower()
        return None
315
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
        ]

    def __init__(self, markup, override_encodings=[],
                 smart_quotes_to=None, is_html=False, exclude_encodings=[]):
        """Detect the encoding of `markup` and convert it to Unicode.

        On return, `unicode_markup` holds the converted document (None
        if every encoding failed) and `original_encoding` holds the
        codec that worked (None for string input or on failure).

        :param markup: The document, as a bytestring or a string.
        :param override_encodings: Encodings to try before any other.
        :param smart_quotes_to: 'ascii', 'xml', or any other non-None
         value for HTML entities; None leaves smart quotes alone.
        :param is_html: If True, the document is treated as HTML when
         looking for a declared encoding.
        :param exclude_encodings: Encodings that must never be tried.
        """
        # The list defaults above are never mutated; they are only
        # handed to EncodingDetector.
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html

        self.detector = EncodingDetector(
            markup, override_encodings, is_html, exclude_encodings)

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, str):
            self.markup = markup
            self.unicode_markup = str(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        # Failure is signalled by None only. An empty string is a
        # successful conversion of an empty document.
        if u is None:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.

            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    logging.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER.")
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if u is None:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        :param match: A regex match whose group 1 is a single byte.
        :return: The replacement, as a bytestring.
        """
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
        else:
            sub = self.MS_CHARS.get(orig)
            if isinstance(sub, tuple):
                # (entity name, hex code point): pick the form asked for.
                if self.smart_quotes_to == 'xml':
                    sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
                else:
                    sub = '&'.encode() + sub[0].encode() + ';'.encode()
            else:
                sub = sub.encode()
        return sub

    def _convert_from(self, proposed, errors="strict"):
        """Try to decode self.markup using one proposed encoding.

        On success, self.markup is replaced by the decoded string and
        self.original_encoding is set to the codec name.

        :param proposed: The name of the encoding to try.
        :param errors: The error handling scheme passed to the decoder.
        :return: The decoded string, or None if the encoding is
         empty, was already tried with this `errors` value, or failed.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            smart_quotes_re = b"([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception:
            # Deliberately best-effort: any failure just means this
            # encoding is not the right one.
            return None
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''
        return str(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """The encoding the detector found declared in the document,
        or None if the document is not being treated as HTML."""
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Map a charset name onto a name Python's codec registry knows.

        Tries, in order: the CHARSET_ALIASES entry (or the name itself),
        the name with hyphens removed, and the name with hyphens turned
        into underscores. If none is a known codec, the name itself is
        used.

        :return: The lowercased name, or None for an empty charset.
        """
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Return `charset` if codecs.lookup() accepts it, otherwise
        None. An empty charset is returned unchanged."""
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec


    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # Tuple values are (HTML entity name, hexadecimal code point).
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                # U+0178; an empty code here would produce "&#x;".
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    # Every value is a string.
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        b'\xb4' : "'",
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252.
    WINDOWS_1252_TO_UTF8 = {
        0x80 : b'\xe2\x82\xac', # €
        0x82 : b'\xe2\x80\x9a', # ‚
        0x83 : b'\xc6\x92',     # ƒ
        0x84 : b'\xe2\x80\x9e', # „
        0x85 : b'\xe2\x80\xa6', # …
        0x86 : b'\xe2\x80\xa0', # †
        0x87 : b'\xe2\x80\xa1', # ‡
        0x88 : b'\xcb\x86',     # ˆ
        0x89 : b'\xe2\x80\xb0', # ‰
        0x8a : b'\xc5\xa0',     # Š
        0x8b : b'\xe2\x80\xb9', # ‹
        0x8c : b'\xc5\x92',     # Œ
        0x8e : b'\xc5\xbd',     # Ž
        0x91 : b'\xe2\x80\x98', # ‘
        0x92 : b'\xe2\x80\x99', # ’
        0x93 : b'\xe2\x80\x9c', # “
        0x94 : b'\xe2\x80\x9d', # ”
        0x95 : b'\xe2\x80\xa2', # •
        0x96 : b'\xe2\x80\x93', # –
        0x97 : b'\xe2\x80\x94', # —
        0x98 : b'\xcb\x9c',     # ˜
        0x99 : b'\xe2\x84\xa2', # ™
        0x9a : b'\xc5\xa1',     # š
        0x9b : b'\xe2\x80\xba', # ›
        0x9c : b'\xc5\x93',     # œ
        0x9e : b'\xc5\xbe',     # ž
        0x9f : b'\xc5\xb8',     # Ÿ
        0xa0 : b'\xc2\xa0',     # (no-break space)
        0xa1 : b'\xc2\xa1',     # ¡
        0xa2 : b'\xc2\xa2',     # ¢
        0xa3 : b'\xc2\xa3',     # £
        0xa4 : b'\xc2\xa4',     # ¤
        0xa5 : b'\xc2\xa5',     # ¥
        0xa6 : b'\xc2\xa6',     # ¦
        0xa7 : b'\xc2\xa7',     # §
        0xa8 : b'\xc2\xa8',     # ¨
        0xa9 : b'\xc2\xa9',     # ©
        0xaa : b'\xc2\xaa',     # ª
        0xab : b'\xc2\xab',     # «
        0xac : b'\xc2\xac',     # ¬
        0xad : b'\xc2\xad',     # (soft hyphen)
        0xae : b'\xc2\xae',     # ®
        0xaf : b'\xc2\xaf',     # ¯
        0xb0 : b'\xc2\xb0',     # °
        0xb1 : b'\xc2\xb1',     # ±
        0xb2 : b'\xc2\xb2',     # ²
        0xb3 : b'\xc2\xb3',     # ³
        0xb4 : b'\xc2\xb4',     # ´
        0xb5 : b'\xc2\xb5',     # µ
        0xb6 : b'\xc2\xb6',     # ¶
        0xb7 : b'\xc2\xb7',     # ·
        0xb8 : b'\xc2\xb8',     # ¸
        0xb9 : b'\xc2\xb9',     # ¹
        0xba : b'\xc2\xba',     # º
        0xbb : b'\xc2\xbb',     # »
        0xbc : b'\xc2\xbc',     # ¼
        0xbd : b'\xc2\xbd',     # ½
        0xbe : b'\xc2\xbe',     # ¾
        0xbf : b'\xc2\xbf',     # ¿
        0xc0 : b'\xc3\x80',     # À
        0xc1 : b'\xc3\x81',     # Á
        0xc2 : b'\xc3\x82',     # Â
        0xc3 : b'\xc3\x83',     # Ã
        0xc4 : b'\xc3\x84',     # Ä
        0xc5 : b'\xc3\x85',     # Å
        0xc6 : b'\xc3\x86',     # Æ
        0xc7 : b'\xc3\x87',     # Ç
        0xc8 : b'\xc3\x88',     # È
        0xc9 : b'\xc3\x89',     # É
        0xca : b'\xc3\x8a',     # Ê
        0xcb : b'\xc3\x8b',     # Ë
        0xcc : b'\xc3\x8c',     # Ì
        0xcd : b'\xc3\x8d',     # Í
        0xce : b'\xc3\x8e',     # Î
        0xcf : b'\xc3\x8f',     # Ï
        0xd0 : b'\xc3\x90',     # Ð
        0xd1 : b'\xc3\x91',     # Ñ
        0xd2 : b'\xc3\x92',     # Ò
        0xd3 : b'\xc3\x93',     # Ó
        0xd4 : b'\xc3\x94',     # Ô
        0xd5 : b'\xc3\x95',     # Õ
        0xd6 : b'\xc3\x96',     # Ö
        0xd7 : b'\xc3\x97',     # ×
        0xd8 : b'\xc3\x98',     # Ø
        0xd9 : b'\xc3\x99',     # Ù
        0xda : b'\xc3\x9a',     # Ú
        0xdb : b'\xc3\x9b',     # Û
        0xdc : b'\xc3\x9c',     # Ü
        0xdd : b'\xc3\x9d',     # Ý
        0xde : b'\xc3\x9e',     # Þ
        0xdf : b'\xc3\x9f',     # ß
        0xe0 : b'\xc3\xa0',     # à
        0xe1 : b'\xc3\xa1',     # á
        0xe2 : b'\xc3\xa2',     # â
        0xe3 : b'\xc3\xa3',     # ã
        0xe4 : b'\xc3\xa4',     # ä
        0xe5 : b'\xc3\xa5',     # å
        0xe6 : b'\xc3\xa6',     # æ
        0xe7 : b'\xc3\xa7',     # ç
        0xe8 : b'\xc3\xa8',     # è
        0xe9 : b'\xc3\xa9',     # é
        0xea : b'\xc3\xaa',     # ê
        0xeb : b'\xc3\xab',     # ë
        0xec : b'\xc3\xac',     # ì
        0xed : b'\xc3\xad',     # í
        0xee : b'\xc3\xae',     # î
        0xef : b'\xc3\xaf',     # ï
        0xf0 : b'\xc3\xb0',     # ð
        0xf1 : b'\xc3\xb1',     # ñ
        0xf2 : b'\xc3\xb2',     # ò
        0xf3 : b'\xc3\xb3',     # ó
        0xf4 : b'\xc3\xb4',     # ô
        0xf5 : b'\xc3\xb5',     # õ
        0xf6 : b'\xc3\xb6',     # ö
        0xf7 : b'\xc3\xb7',     # ÷
        0xf8 : b'\xc3\xb8',     # ø
        0xf9 : b'\xc3\xb9',     # ù
        0xfa : b'\xc3\xba',     # ú
        0xfb : b'\xc3\xbb',     # û
        0xfc : b'\xc3\xbc',     # ü
        0xfd : b'\xc3\xbd',     # ý
        0xfe : b'\xc3\xbe',     # þ
        0xff : b'\xc3\xbf',     # ÿ
        }

    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        The input must be a bytestring. If you've already converted
        the document to Unicode, you're too late.

        The output is a bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents.

        :raises NotImplementedError: if `embedded_encoding` is not a
         spelling of windows-1252, or `main_encoding` is not a
         spelling of utf-8.
        """
        if embedded_encoding.replace('_', '-').lower() not in (
            'windows-1252', 'windows_1252'):
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Indexing did not yield an integer (as happened on
                # Python 2.x), so convert the item to its ordinal.
                byte = ord(byte)
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # Nothing was replaced, so the string is unchanged.
            return in_bytes
        # Store the final chunk.
        byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)
832
833