# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""

import contextlib
import logging
import sys
import tempfile
import unittest
import warnings

from bs4 import BeautifulSoup
from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    SoupStrainer,
    NamespacedAttribute,
)
import bs4.dammit
from bs4.dammit import (
    EntitySubstitution,
    UnicodeDammit,
    EncodingDetector,
)
from bs4.testing import (
    SoupTest,
    skipIf,
)

try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError:
    LXML_PRESENT = False

PYTHON_2_PRE_2_7 = (sys.version_info < (2, 7))
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3, 2))


@contextlib.contextmanager
def _chardet_disabled():
    """Temporarily make ``bs4.dammit.chardet_dammit`` detect nothing.

    While the context is active, ``bs4.dammit.chardet_dammit`` is replaced
    by a stub that always returns None and logging at WARNING level and
    below is disabled. On exit the original function is put back and
    logging is re-enabled with ``logging.disable(logging.NOTSET)``.
    """
    original = bs4.dammit.chardet_dammit
    logging.disable(logging.WARNING)
    try:
        def detect_nothing(markup):
            return None
        bs4.dammit.chardet_dammit = detect_nothing
        yield
    finally:
        logging.disable(logging.NOTSET)
        bs4.dammit.chardet_dammit = original


class TestConstructor(SoupTest):
    """Tests of basic inputs to the BeautifulSoup constructor."""

    def test_short_unicode_input(self):
        data = "<h1>éé</h1>"
        soup = self.soup(data)
        self.assertEqual("éé", soup.h1.string)

    def test_embedded_null(self):
        data = "<h1>foo\0bar</h1>"
        soup = self.soup(data)
        self.assertEqual("foo\0bar", soup.h1.string)

    def test_exclude_encodings(self):
        utf8_data = "Räksmörgås".encode("utf-8")
        soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual("windows-1252", soup.original_encoding)


class TestWarnings(SoupTest):
    """Tests of the warnings issued by the BeautifulSoup constructor.

    This class used to be defined twice in this module; the second
    definition replaced the first, so the parser and renamed-argument
    tests were never collected. Both sets of tests now live here.
    """

    def _assert_no_parser_specified(self, s):
        """Assert that ``s`` starts like the "no parser specified" warning.

        Only the leading part of the template is compared.
        NOTE(review): if NO_PARSER_SPECIFIED_WARNING is a %-format template,
        anything from the first placeholder on will differ in the rendered
        message, so the comparison stops at the first "%" found within the
        first 80 characters. Confirm against the constant's definition.
        """
        template = BeautifulSoup.NO_PARSER_SPECIFIED_WARNING
        prefix = template[:80].split("%", 1)[0]
        self.assertTrue(s.startswith(prefix))

    # The three parser-selection tests call BeautifulSoup directly, as
    # test_last_ditch_entity_replacement does, so that the choice of
    # parser depends only on the arguments given here.
    # NOTE(review): SoupTest.soup is defined in bs4.testing; confirm
    # whether it supplies a builder of its own or accepts a positional
    # parser name.

    def test_warning_if_no_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            BeautifulSoup("<a><b></b></a>")
        msg = str(w[0].message)
        self._assert_no_parser_specified(msg)

    def test_warning_if_parser_specified_too_vague(self):
        with warnings.catch_warnings(record=True) as w:
            BeautifulSoup("<a><b></b></a>", "html")
        msg = str(w[0].message)
        self._assert_no_parser_specified(msg)

    def test_no_warning_if_explicit_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            BeautifulSoup("<a><b></b></a>", "html.parser")
        self.assertEqual([], w)

    def test_parseOnlyThese_renamed_to_parse_only(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
        msg = str(w[0].message)
        self.assertIn("parseOnlyThese", msg)
        self.assertIn("parse_only", msg)
        self.assertEqual(b"<b></b>", soup.encode())

    def test_fromEncoding_renamed_to_from_encoding(self):
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = self.soup(utf8, fromEncoding="utf8")
        msg = str(w[0].message)
        self.assertIn("fromEncoding", msg)
        self.assertIn("from_encoding", msg)
        self.assertEqual("utf8", soup.original_encoding)

    def test_unrecognized_keyword_argument(self):
        self.assertRaises(
            TypeError, self.soup, "<a>", no_such_argument=True)

    def test_disk_file_warning(self):
        # While the temporary file exists, passing its name as markup
        # should trigger the "looks like a filename" warning.
        with tempfile.NamedTemporaryFile() as filehandle:
            filename = filehandle.name
            with warnings.catch_warnings(record=True) as w:
                self.soup(filename)
            msg = str(w[0].message)
            self.assertIn("looks like a filename", msg)

        # The file no longer exists, so Beautiful Soup will no longer
        # issue the warning.
        with warnings.catch_warnings(record=True) as w:
            self.soup(filename)
        self.assertEqual(0, len(w))

    def test_url_warning(self):
        with warnings.catch_warnings(record=True) as w:
            self.soup("http://www.crummy.com/")
        msg = str(w[0].message)
        self.assertIn("looks like a URL", msg)

        # A URL followed by other text is not mistaken for a bare URL.
        with warnings.catch_warnings(record=True) as w:
            self.soup("http://www.crummy.com/ is great")
        self.assertEqual(0, len(w))


class TestSelectiveParsing(SoupTest):
    """Tests of parsing only part of a document."""

    def test_parse_with_soupstrainer(self):
        markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
        strainer = SoupStrainer("b")
        soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")


class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class."""

    def setUp(self):
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Unicode characters corresponding to named HTML entities
        # are substituted, and no others.
        s = "foo\u2200\N{SNOWMAN}\u00f5bar"
        self.assertEqual(self.sub.substitute_html(s),
                         "foo&forall;\N{SNOWMAN}&otilde;bar")

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so we
        # give them a special test.
        quotes = b"\x91\x92foo\x93\x94"
        dammit = UnicodeDammit(quotes)
        self.assertEqual(self.sub.substitute_html(dammit.markup),
                         "&lsquo;&rsquo;foo&ldquo;&rdquo;")

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, False), s)

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        self.assertEqual(self.sub.substitute_xml("Welcome", True),
                         '"Welcome"')
        self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
                         '"Bob\'s Bar"')

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, True),
                         "'Welcome to \"my bar\"'")

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        s = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(
            self.sub.substitute_xml(s, True),
            '"Welcome to &quot;Bob\'s Bar&quot;"')

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        quoted = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(self.sub.substitute_xml(quoted), quoted)

    def test_xml_quoting_handles_angle_brackets(self):
        self.assertEqual(
            self.sub.substitute_xml("foo<bar>"),
            "foo&lt;bar&gt;")

    def test_xml_quoting_handles_ampersands(self):
        self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")

    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        # substitute_xml escapes every ampersand, even one that already
        # starts an entity reference.
        self.assertEqual(
            self.sub.substitute_xml("&Aacute;T&T"),
            "&amp;Aacute;T&amp;T")

    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
        # substitute_xml_containing_entities leaves an existing entity
        # reference alone and escapes only the bare ampersand.
        self.assertEqual(
            self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
            "&Aacute;T&amp;T")

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        self.assertEqual(self.sub.substitute_html(text), text)


class TestEncodingConversion(SoupTest):
    # Test Beautiful Soup's ability to decode and encode from various
    # encodings.

    def setUp(self):
        super(TestEncodingConversion, self).setUp()
        self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        self.assertEqual(
            self.utf8_data,
            b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set to 'utf-8', a superset of ASCII.
        #
        # Disable chardet, which will realize that the ASCII is ASCII.
        with _chardet_disabled():
            ascii_markup = b"<foo>a</foo>"
            soup_from_ascii = self.soup(ascii_markup)
            unicode_output = soup_from_ascii.decode()
            self.assertTrue(isinstance(unicode_output, str))
            self.assertEqual(
                unicode_output, self.document_for(ascii_markup.decode()))
            self.assertEqual(
                soup_from_ascii.original_encoding.lower(), "utf-8")

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
        self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
        self.assertEqual(soup_from_unicode.original_encoding, None)

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode. The original_encoding
        # attribute is set.
        soup_from_utf8 = self.soup(self.utf8_data)
        self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
        self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)

    @skipIf(
        PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
        self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))


class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of UnicodeDammit."""

    def test_unicode_input(self):
        markup = "I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode_markup, markup)

    def test_smart_quotes_to_unicode(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")

    def test_smart_quotes_to_xml_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")

    def test_smart_quotes_to_html_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")

    def test_smart_quotes_to_ascii(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """<foo>''""</foo>""")

    def test_detect_utf8(self):
        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')

    def test_convert_hebrew(self):
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        utf8_data = "Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_ignore_invalid_codecs(self):
        utf8_data = "Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_exclude_encodings(self):
        # This is UTF-8.
        utf8_data = "Räksmörgås".encode("utf-8")

        # But if we exclude UTF-8 from consideration, the guess is
        # Windows-1252.
        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')

        # And if we exclude that, there is no valid guess at all.
        dammit = UnicodeDammit(
            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
        self.assertEqual(dammit.original_encoding, None)

    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
        detected = EncodingDetector(
            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
        encodings = list(detected.encodings)
        # assertIn rather than a bare assert, which is stripped under -O.
        self.assertIn('utf-\N{REPLACEMENT CHARACTER}', encodings)

    def test_detect_html5_style_meta_tag(self):
        for data in (
                b'<html><meta charset="euc-jp" /></html>',
                b"<html><meta charset='euc-jp' /></html>",
                b"<html><meta charset=euc-jp /></html>",
                b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        with _chardet_disabled():
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertIn("\ufffd", dammit.unicode_markup)

            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)

    def test_byte_order_mark_removed(self):
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual("<a>áé</a>", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"

        # But if we run it through UnicodeDammit.detwingle, it's fixed:
        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
                "\N{LATIN SMALL LIGATURE OE}",  # 2-byte char '\xc5\x93'
                "\N{LATIN SUBSCRIPT SMALL LETTER X}",  # 3-byte char '\xe2\x82\x93'
                # NOTE(review): as a Python 3 str literal this is four
                # separate code points (U+00F0, U+0090, U+0090, U+0093),
                # not one 4-byte character; its UTF-8 encoding still ends
                # in \x93, so the assertions below hold either way.
                "\xf0\x90\x90\x93",
        ):
            encoded = tricky_unicode_char.encode("utf8")
            self.assertTrue(encoded.endswith(b'\x93'))
            output = UnicodeDammit.detwingle(encoded)
            self.assertEqual(output, encoded)


class TestNamedspacedAttribute(SoupTest):
    """Tests of NamespacedAttribute comparison and string equivalence."""

    def test_name_may_be_none(self):
        a = NamespacedAttribute("xmlns", None)
        self.assertEqual(a, "xmlns")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        a = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", a)

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        a = NamespacedAttribute("a", "b", "c")
        b = NamespacedAttribute("a", "b", "c")
        self.assertEqual(a, b)

        # The actual namespace is not considered.
        c = NamespacedAttribute("a", "b", None)
        self.assertEqual(a, c)

        # But name and prefix are important.
        d = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(a, d)

        e = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(a, e)


class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
    """Tests of attribute values whose charset is rewritten on encode."""

    # This test used to share its name with the one below, which
    # replaced it in the class namespace so that it never ran.
    def test_charset_meta_attribute_value(self):
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        self.assertEqual("utf8", value.encode("utf8"))

    def test_content_meta_attribute_value(self):
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))