# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""

import contextlib
import logging
import unittest
import sys
import tempfile

from bs4 import BeautifulSoup
from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    SoupStrainer,
    NamespacedAttribute,
    )
import bs4.dammit
from bs4.dammit import (
    EntitySubstitution,
    UnicodeDammit,
    EncodingDetector,
)
from bs4.testing import (
    SoupTest,
    skipIf,
)
import warnings

try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError:
    LXML_PRESENT = False

PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))


@contextlib.contextmanager
def _chardet_disabled():
    """Temporarily replace bs4.dammit.chardet_dammit with a no-op.

    Logging at WARNING and below is disabled for the duration. Both the
    logging state and the original detection hook are restored on exit,
    even if the body raises.
    """
    original = bs4.dammit.chardet_dammit
    logging.disable(logging.WARNING)
    try:
        bs4.dammit.chardet_dammit = lambda markup: None
        yield
    finally:
        logging.disable(logging.NOTSET)
        bs4.dammit.chardet_dammit = original


class TestConstructor(SoupTest):
    """Tests of the arguments accepted when building a soup."""

    def test_short_unicode_input(self):
        data = "<h1>éé</h1>"
        soup = self.soup(data)
        self.assertEqual("éé", soup.h1.string)

    def test_embedded_null(self):
        data = "<h1>foo\0bar</h1>"
        soup = self.soup(data)
        self.assertEqual("foo\0bar", soup.h1.string)

    def test_exclude_encodings(self):
        utf8_data = "Räksmörgås".encode("utf-8")
        soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual("windows-1252", soup.original_encoding)


class TestWarnings(SoupTest):
    """Tests of the warnings issued while building a soup.

    NOTE(review): this class used to be defined twice under the same name,
    so the second definition replaced the first and every test from
    test_warning_if_no_parser_specified through
    test_unrecognized_keyword_argument was never collected. They are merged
    here so that they run; confirm they pass against the current
    SoupTest.soup signature and warning text.
    """

    def _assert_no_parser_specified(self, s, is_there=True):
        """Assert that `s` starts like the "no parser specified" warning.

        :param s: The text of a recorded warning.
        :param is_there: Unused; kept so the signature is unchanged.
        """
        v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
        self.assertTrue(v)

    # The helper was originally defined under this name, while the tests
    # called _assert_no_parser_specified. Keep the old name as an alias.
    _no_parser_specified = _assert_no_parser_specified

    def test_warning_if_no_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            self.soup("<a><b></b></a>")
        msg = str(w[0].message)
        self._assert_no_parser_specified(msg)

    def test_warning_if_parser_specified_too_vague(self):
        with warnings.catch_warnings(record=True) as w:
            self.soup("<a><b></b></a>", "html")
        msg = str(w[0].message)
        self._assert_no_parser_specified(msg)

    def test_no_warning_if_explicit_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            self.soup("<a><b></b></a>", "html.parser")
        self.assertEqual([], w)

    def test_parseOnlyThese_renamed_to_parse_only(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
        msg = str(w[0].message)
        self.assertIn("parseOnlyThese", msg)
        self.assertIn("parse_only", msg)
        self.assertEqual(b"<b></b>", soup.encode())

    def test_fromEncoding_renamed_to_from_encoding(self):
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = self.soup(utf8, fromEncoding="utf8")
        msg = str(w[0].message)
        self.assertIn("fromEncoding", msg)
        self.assertIn("from_encoding", msg)
        self.assertEqual("utf8", soup.original_encoding)

    def test_unrecognized_keyword_argument(self):
        self.assertRaises(
            TypeError, self.soup, "<a>", no_such_argument=True)

    def test_disk_file_warning(self):
        # The temporary file exists only inside the `with` block; leaving
        # the block closes the handle, which deletes the file.
        with tempfile.NamedTemporaryFile() as filehandle:
            filename = filehandle.name
            with warnings.catch_warnings(record=True) as w:
                self.soup(filename)
            msg = str(w[0].message)
            self.assertIn("looks like a filename", msg)

        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as w:
            self.soup(filename)
        self.assertEqual(0, len(w))

    def test_url_warning(self):
        with warnings.catch_warnings(record=True) as w:
            self.soup("http://www.crummy.com/")
        msg = str(w[0].message)
        self.assertIn("looks like a URL", msg)

        with warnings.catch_warnings(record=True) as w:
            self.soup("http://www.crummy.com/ is great")
        self.assertEqual(0, len(w))


class TestSelectiveParsing(SoupTest):
    """Tests of parsing only part of a document with a SoupStrainer."""

    def test_parse_with_soupstrainer(self):
        markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
        strainer = SoupStrainer("b")
        soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")


class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class."""
    def setUp(self):
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Unicode characters corresponding to named HTML entities
        # are substituted, and no others.
        s = "foo\u2200\N{SNOWMAN}\u00f5bar"
        self.assertEqual(self.sub.substitute_html(s),
                         "foo&forall;\N{SNOWMAN}&otilde;bar")

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so we
        # give them a special test.
        quotes = b"\x91\x92foo\x93\x94"
        dammit = UnicodeDammit(quotes)
        self.assertEqual(self.sub.substitute_html(dammit.markup),
                         "&lsquo;&rsquo;foo&ldquo;&rdquo;")

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, False), s)

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        self.assertEqual(self.sub.substitute_xml("Welcome", True),
                         '"Welcome"')
        self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
                         '"Bob\'s Bar"')

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, True),
                         "'Welcome to \"my bar\"'")

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        s = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(
            self.sub.substitute_xml(s, True),
            '"Welcome to &quot;Bob\'s Bar&quot;"')

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        quoted = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(self.sub.substitute_xml(quoted), quoted)

    def test_xml_quoting_handles_angle_brackets(self):
        self.assertEqual(
            self.sub.substitute_xml("foo<bar>"),
            "foo&lt;bar&gt;")

    def test_xml_quoting_handles_ampersands(self):
        self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")

    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        self.assertEqual(
            self.sub.substitute_xml("&Aacute;T&T"),
            "&amp;Aacute;T&amp;T")

    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
        self.assertEqual(
            self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
            "&Aacute;T&amp;T")

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        self.assertEqual(self.sub.substitute_html(text), text)


class TestEncodingConversion(SoupTest):
    # Test Beautiful Soup's ability to decode and encode from various
    # encodings.

    def setUp(self):
        super(TestEncodingConversion, self).setUp()
        self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        self.assertEqual(
            self.utf8_data,
            b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set to 'utf-8', a superset of ASCII.
        #
        # Disable chardet, which will realize that the ASCII is ASCII.
        with _chardet_disabled():
            ascii_markup = b"<foo>a</foo>"
            soup_from_ascii = self.soup(ascii_markup)
            unicode_output = soup_from_ascii.decode()
            self.assertIsInstance(unicode_output, str)
            self.assertEqual(
                unicode_output, self.document_for(ascii_markup.decode()))
            self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
        self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
        self.assertEqual(soup_from_unicode.original_encoding, None)

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode. The original_encoding
        # attribute is set.
        soup_from_utf8 = self.soup(self.utf8_data)
        self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
        self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)

    @skipIf(
        PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
        self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))


class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of UnicodeDammit."""

    def test_unicode_input(self):
        markup = "I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode_markup, markup)

    def test_smart_quotes_to_unicode(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")

    def test_smart_quotes_to_xml_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")

    def test_smart_quotes_to_html_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")

    def test_smart_quotes_to_ascii(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """<foo>''""</foo>""")

    def test_detect_utf8(self):
        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')

    def test_convert_hebrew(self):
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        utf8_data = "Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_ignore_invalid_codecs(self):
        utf8_data = "Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_exclude_encodings(self):
        # This is UTF-8.
        utf8_data = "Räksmörgås".encode("utf-8")

        # But if we exclude UTF-8 from consideration, the guess is
        # Windows-1252.
        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')

        # And if we exclude that, there is no valid guess at all.
        dammit = UnicodeDammit(
            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
        self.assertEqual(dammit.original_encoding, None)

    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
        detected = EncodingDetector(
            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
        encodings = list(detected.encodings)
        # assertIn rather than a bare assert, which is stripped under -O.
        self.assertIn('utf-\N{REPLACEMENT CHARACTER}', encodings)

    def test_detect_html5_style_meta_tag(self):

        for data in (
            b'<html><meta charset="euc-jp" /></html>',
            b"<html><meta charset='euc-jp' /></html>",
            b"<html><meta charset=euc-jp /></html>",
            b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        with _chardet_disabled():
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertIn("\ufffd", dammit.unicode_markup)

            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)

    def test_byte_order_mark_removed(self):
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual("<a>áé</a>", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into "â˜ƒâ˜ƒâ˜ƒ“Hi, I like Windows!”â˜ƒâ˜ƒâ˜ƒ"

        # But if we run it through detwingle, it's fixed:

        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
            "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
            "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
            # 4-byte char '\xf0\x90\x90\x93'. This has to be written as the
            # code point U+10413: the str literal "\xf0\x90\x90\x93" is four
            # separate Latin-1 characters, which encode to four 2-byte
            # sequences and never exercise the 4-byte case.
            "\U00010413",
            ):
            encoded = tricky_unicode_char.encode("utf8")
            self.assertTrue(encoded.endswith(b'\x93'))
            detwingled = UnicodeDammit.detwingle(encoded)
            self.assertEqual(detwingled, encoded)


class TestNamedspacedAttribute(SoupTest):
    """Tests of NamespacedAttribute construction and equality."""

    def test_name_may_be_none(self):
        a = NamespacedAttribute("xmlns", None)
        self.assertEqual(a, "xmlns")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        a = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", a)

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        a = NamespacedAttribute("a", "b", "c")
        b = NamespacedAttribute("a", "b", "c")
        self.assertEqual(a, b)

        # The actual namespace is not considered.
        c = NamespacedAttribute("a", "b", None)
        self.assertEqual(a, c)

        # But name and prefix are important.
        d = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(a, d)

        e = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(a, e)


class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
    """Tests of the attribute values that rewrite a charset when encoded."""

    # This test used to share the name test_content_meta_attribute_value
    # with the one below, so the later definition replaced it and it never
    # ran.
    def test_charset_meta_attribute_value(self):
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        self.assertEqual("utf8", value.encode("utf8"))

    def test_content_meta_attribute_value(self):
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))