"""Helper classes for tests."""

__license__ = "MIT"

import pickle
import copy
import unittest
from unittest import TestCase
from bs4 import BeautifulSoup
from bs4.element import (
    CharsetMetaAttributeValue,
    Comment,
    ContentMetaAttributeValue,
    Doctype,
    SoupStrainer,
)

from bs4.builder._htmlparser import HTMLParserTreeBuilder
default_builder = HTMLParserTreeBuilder


class SoupTest(unittest.TestCase):
    """Base class providing parsing helpers and custom assertions.

    Subclasses override ``default_builder`` to run the same tests
    against a different tree builder.
    """

    @property
    def default_builder(self):
        """Return a new instance of the module-level default builder class."""
        return default_builder()

    def soup(self, markup, **kwargs):
        """Build a Beautiful Soup object from markup.

        A ``builder`` keyword argument overrides the default builder;
        all other keyword arguments are passed to ``BeautifulSoup``.
        """
        builder = kwargs.pop('builder', self.default_builder)
        return BeautifulSoup(markup, builder=builder, **kwargs)

    def document_for(self, markup):
        """Turn an HTML fragment into a document.

        The details depend on the builder.
        """
        return self.default_builder.test_fragment_to_document(markup)

    def assertSoupEquals(self, to_parse, compare_parsed_to=None):
        """Assert that parsing ``to_parse`` yields the expected markup.

        The decoded parse result is compared with the document the
        builder produces for ``compare_parsed_to``. When that argument
        is None, the markup is expected to survive parsing unchanged.
        """
        builder = self.default_builder
        obj = BeautifulSoup(to_parse, builder=builder)
        if compare_parsed_to is None:
            compare_parsed_to = to_parse

        self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))

    def assertConnectedness(self, element):
        """Ensure that next_element and previous_element are properly
        set for all descendants of the given element.
        """
        earlier = None
        for e in element.descendants:
            if earlier:
                self.assertEqual(e, earlier.next_element)
                self.assertEqual(earlier, e.previous_element)
            earlier = e


class HTMLTreeBuilderSmokeTest(SoupTest):

    """A basic test of a treebuilder's competence.

    Any HTML treebuilder, present or future, should be able to pass
    these tests. With invalid markup, there's room for interpretation,
    and different parsers can handle it differently. But with the
    markup in these tests, there's not much room for interpretation.
    """

    def test_pickle_and_unpickle_identity(self):
        # Pickling a tree, then unpickling it, yields a tree identical
        # to the original.
        tree = self.soup("<a><b>foo</a>")
        dumped = pickle.dumps(tree, 2)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.__class__, BeautifulSoup)
        self.assertEqual(loaded.decode(), tree.decode())

    def assertDoctypeHandled(self, doctype_fragment):
        """Assert that a given doctype string is handled correctly."""
        doctype_str, soup = self._document_with_doctype(doctype_fragment)

        # Make sure a Doctype object was created.
        doctype = soup.contents[0]
        self.assertEqual(doctype.__class__, Doctype)
        self.assertEqual(doctype, doctype_fragment)
        self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)

        # Make sure that the doctype was correctly associated with the
        # parse tree and that the rest of the document parsed.
        self.assertEqual(soup.p.contents[0], 'foo')

    def _document_with_doctype(self, doctype_fragment):
        """Generate and parse a document with the given doctype."""
        doctype = '<!DOCTYPE %s>' % doctype_fragment
        markup = doctype + '\n<p>foo</p>'
        soup = self.soup(markup)
        return doctype, soup

    def test_normal_doctypes(self):
        """Make sure normal, everyday HTML doctypes are handled correctly."""
        self.assertDoctypeHandled("html")
        self.assertDoctypeHandled(
            'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')

    def test_empty_doctype(self):
        # A doctype declaration with no content yields an empty Doctype.
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        self.assertEqual("", doctype.strip())

    def test_public_doctype_with_url(self):
        doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
        self.assertDoctypeHandled(doctype)

    def test_system_doctype(self):
        self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')

    def test_namespaced_system_doctype(self):
        # We can handle a namespaced doctype with a system ID.
        self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')

    def test_namespaced_public_doctype(self):
        # Test a namespaced doctype with a public id.
        self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')

    def test_real_xhtml_document(self):
        """A real XHTML document should come out more or less the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        # Newlines are stripped from both sides, so only the
        # non-newline content has to match.
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b""),
            markup.replace(b"\n", b""))

    def test_processing_instruction(self):
        # A processing instruction round-trips unchanged.
        markup = b"""<?PITarget PIContent?>"""
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode("utf8"))

    def test_deepcopy(self):
        """Make sure you can copy the tree builder.

        This is important because the builder is part of a
        BeautifulSoup object, and we want to be able to copy that.
        """
        copy.deepcopy(self.default_builder)

    def test_p_tag_is_never_empty_element(self):
        """A <p> tag is never designated as an empty-element tag.

        Even if the markup shows it as an empty-element tag, it
        shouldn't be presented that way.
        """
        soup = self.soup("<p/>")
        self.assertFalse(soup.p.is_empty_element)
        self.assertEqual(str(soup.p), "<p></p>")

    def test_unclosed_tags_get_closed(self):
        """A tag that's not closed by the end of the document should be closed.

        This applies to all tags except empty-element tags.
        """
        self.assertSoupEquals("<p>", "<p></p>")
        self.assertSoupEquals("<b>", "<b></b>")

        self.assertSoupEquals("<br>", "<br/>")

    def test_br_is_always_empty_element_tag(self):
        """A <br> tag is designated as an empty-element tag.

        Some parsers treat <br></br> as one <br/> tag, some parsers as
        two tags, but it should always be an empty-element tag.
        """
        soup = self.soup("<br></br>")
        self.assertTrue(soup.br.is_empty_element)
        self.assertEqual(str(soup.br), "<br/>")

    def test_nested_formatting_elements(self):
        self.assertSoupEquals("<em><em></em></em>")

    def test_double_head(self):
        # A <script> placed between </head> and <body> keeps its
        # attributes.
        html = '''<!DOCTYPE html>
<html>
<head>
<title>Ordinary HEAD element test</title>
</head>
<script type="text/javascript">
alert("Help!");
</script>
<body>
Hello, world!
</body>
</html>
'''
        soup = self.soup(html)
        self.assertEqual("text/javascript", soup.find('script')['type'])

    def test_comment(self):
        # Comments are represented as Comment objects.
        markup = "<p>foo<!--foobar-->baz</p>"
        self.assertSoupEquals(markup)

        soup = self.soup(markup)
        comment = soup.find(text="foobar")
        self.assertEqual(comment.__class__, Comment)

        # The comment is properly integrated into the tree.
        foo = soup.find(text="foo")
        self.assertEqual(comment, foo.next_element)
        baz = soup.find(text="baz")
        self.assertEqual(comment, baz.previous_element)

    def test_preserved_whitespace_in_pre_and_textarea(self):
        """Whitespace must be preserved in <pre> and <textarea> tags."""
        self.assertSoupEquals("<pre> </pre>")
        self.assertSoupEquals("<textarea> woo </textarea>")

    def test_nested_inline_elements(self):
        """Inline elements can be nested indefinitely."""
        b_tag = "<b>Inside a B tag</b>"
        self.assertSoupEquals(b_tag)

        nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
        self.assertSoupEquals(nested_b_tag)

        # Check the doubly-nested markup itself; re-checking
        # nested_b_tag here would leave this case untested.
        double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
        self.assertSoupEquals(double_nested_b_tag)

    def test_nested_block_level_elements(self):
        """Block elements can be nested."""
        soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
        blockquote = soup.blockquote
        self.assertEqual(blockquote.p.b.string, 'Foo')
        self.assertEqual(blockquote.b.string, 'Foo')

    def test_correctly_nested_tables(self):
        """One table can go inside another one."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tr><td>Here\'s another table:'
            '<table id="2"><tr><td>foo</td></tr></table>'
            '</td></tr></table>')

        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_deeply_nested_multivalued_attribute(self):
        # html5lib can set the attributes of the same tag many times
        # as it rearranges the tree. This has caused problems with
        # multivalued attributes.
        markup = '<table><div><div class="css"></div></div></table>'
        soup = self.soup(markup)
        self.assertEqual(["css"], soup.div.div['class'])

    def test_multivalued_attribute_on_html(self):
        # html5lib uses a different API to set the attributes of the
        # <html> tag. This has caused problems with multivalued
        # attributes.
        markup = '<html class="a b"></html>'
        soup = self.soup(markup)
        self.assertEqual(["a", "b"], soup.html['class'])

    def test_angle_brackets_in_attribute_values_are_escaped(self):
        # Raw angle brackets go in; escaped references come out.
        self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')

    def test_entities_in_attributes_converted_to_unicode(self):
        # Decimal, hexadecimal (either case of 'x') and named
        # references all resolve to the same character.
        expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
        self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)

    def test_entities_in_text_converted_to_unicode(self):
        # Same four reference forms as above, this time in text content.
        expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
        self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)

    def test_quot_entity_converted_to_quotation_mark(self):
        self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
                              '<p>I said "good day!"</p>')

    def test_out_of_range_entity(self):
        # Numeric references beyond the Unicode range become U+FFFD.
        expect = "\N{REPLACEMENT CHARACTER}"
        self.assertSoupEquals("&#10000000000000;", expect)
        self.assertSoupEquals("&#x10000000000000;", expect)
        self.assertSoupEquals("&#1000000000;", expect)

    def test_multipart_strings(self):
        "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
        soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
        self.assertEqual("p", soup.h2.string.next_element.name)
        self.assertEqual("p", soup.p.name)
        self.assertConnectedness(soup)

    def test_head_tag_between_head_and_body(self):
        "Prevent recurrence of a bug in the html5lib treebuilder."
        content = """<html><head></head>
  <link></link>
  <body>foo</body>
</html>
"""
        soup = self.soup(content)
        self.assertIsNotNone(soup.html.body)
        self.assertConnectedness(soup)

    def test_multiple_copies_of_a_tag(self):
        "Prevent recurrence of a bug in the html5lib treebuilder."
        content = """<!DOCTYPE html>
<html>
 <body>
   <article id="a" >
   <div><a href="1"></div>
   <footer>
     <a href="2"></a>
   </footer>
  </article>
  </body>
</html>
"""
        soup = self.soup(content)
        self.assertConnectedness(soup.article)

    def test_basic_namespaces(self):
        """Parsers don't need to *understand* namespaces, but at the
        very least they should not choke on namespaces or lose
        data."""

        markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode())
        self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
        self.assertEqual(
            'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
        self.assertEqual(
            'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])

    def test_multivalued_attribute_value_becomes_list(self):
        markup = b'<a class="foo bar">'
        soup = self.soup(markup)
        self.assertEqual(['foo', 'bar'], soup.a['class'])

    #
    # Generally speaking, tests below this point are more tests of
    # Beautiful Soup than tests of the tree builders. But parsers are
    # weird, so we run these tests separately for every tree builder
    # to detect any differences between them.
    #

    def test_can_parse_unicode_document(self):
        # A seemingly innocuous document... but it's in Unicode! And
        # it contains characters that can't be represented in the
        # encoding found in the declaration! The horror!
        markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
        soup = self.soup(markup)
        self.assertEqual('Sacr\xe9 bleu!', soup.body.string)

    def test_soupstrainer(self):
        """Parsers should be able to work with SoupStrainers."""
        strainer = SoupStrainer("b")
        soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
                         parse_only=strainer)
        self.assertEqual(soup.decode(), "<b>bold</b>")

    def test_single_quote_attribute_values_become_double_quotes(self):
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

    def test_attribute_values_with_nested_quotes_are_left_alone(self):
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        self.assertSoupEquals(text)

    def test_attribute_values_with_double_nested_quotes_get_quoted(self):
        # A value containing both quote kinds is emitted in double
        # quotes with its inner double quotes escaped.
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = self.soup(text)
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        self.assertSoupEquals(
            soup.foo.decode(),
            """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")

    def test_ampersand_in_attribute_value_gets_escaped(self):
        self.assertSoupEquals('<this is="really messed up & stuff"></this>',
                              '<this is="really messed up &amp; stuff"></this>')

        self.assertSoupEquals(
            '<a href="http://example.org?a=1&b=2;3">foo</a>',
            '<a href="http://example.org?a=1&amp;b=2;3">foo</a>')

    def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
        self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')

    def test_entities_in_strings_converted_during_parsing(self):
        # Both XML and HTML entities are converted to Unicode characters
        # during parsing.
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
        self.assertSoupEquals(text, expected)

    def test_smart_quotes_converted_on_the_way_in(self):
        # Microsoft smart quotes are converted to Unicode characters during
        # parsing.
        quote = b"<p>\x91Foo\x92</p>"
        soup = self.soup(quote)
        self.assertEqual(
            soup.p.string,
            "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")

    def test_non_breaking_spaces_converted_on_the_way_in(self):
        soup = self.soup("<a>&nbsp;&nbsp;</a>")
        self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)

    def test_entities_converted_on_the_way_out(self):
        # Angle brackets are re-escaped on output; the accented
        # character is emitted as itself.
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
        soup = self.soup(text)
        self.assertEqual(soup.p.encode("utf-8"), expected)

    def test_real_iso_latin_document(self):
        # Smoke test of interrelated functionality, using an
        # easy-to-understand document.

        # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
        unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'

        # That's because we're going to encode it into ISO-Latin-1, and use
        # that to test.
        iso_latin_html = unicode_html.encode("iso-8859-1")

        # Parse the ISO-Latin-1 HTML.
        soup = self.soup(iso_latin_html)
        # Encode it to UTF-8.
        result = soup.encode("utf-8")

        # What do we expect the result to look like? Well, it would
        # look like unicode_html, except that the META tag would say
        # UTF-8 instead of ISO-Latin-1.
        expected = unicode_html.replace("ISO-Latin-1", "utf-8")

        # And, of course, it would be in UTF-8, not Unicode.
        expected = expected.encode("utf-8")

        # Ta-da!
        self.assertEqual(result, expected)

    def test_real_shift_jis_document(self):
        # Smoke test to make sure the parser can handle a document in
        # Shift-JIS encoding, without choking.
        shift_jis_html = (
            b'<html><head></head><body><pre>'
            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
            b'</pre></body></html>')
        unicode_html = shift_jis_html.decode("shift-jis")
        soup = self.soup(unicode_html)

        # Make sure the parse tree is correctly encoded to various
        # encodings.
        self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
        self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))

    def test_real_hebrew_document(self):
        # A real-world test to make sure we can convert ISO-8859-8 (a
        # Hebrew encoding) to UTF-8.
        hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
        soup = self.soup(
            hebrew_document, from_encoding="iso8859-8")
        self.assertEqual(soup.original_encoding, 'iso8859-8')
        self.assertEqual(
            soup.encode('utf-8'),
            hebrew_document.decode("iso8859-8").encode("utf-8"))

    def test_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
        content = parsed_meta['content']
        self.assertEqual('text/html; charset=x-sjis', content)

        # But that value is actually a ContentMetaAttributeValue object.
        self.assertIsInstance(content, ContentMetaAttributeValue)

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('text/html; charset=utf8', content.encode("utf8"))

        # For the rest of the story, see TestSubstitutions in
        # test_tree.py.

    def test_html5_style_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta id="encoding" charset="x-sjis" />')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', id="encoding")
        charset = parsed_meta['charset']
        self.assertEqual('x-sjis', charset)

        # But that value is actually a CharsetMetaAttributeValue object.
        self.assertIsInstance(charset, CharsetMetaAttributeValue)

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('utf8', charset.encode("utf8"))

    def test_tag_with_no_attributes_can_have_attributes_added(self):
        data = self.soup("<a>text</a>")
        data.a['foo'] = 'bar'
        self.assertEqual('<a foo="bar">text</a>', data.a.decode())


class XMLTreeBuilderSmokeTest(SoupTest):
    """Smoke test for a tree builder that parses XML."""

    def test_pickle_and_unpickle_identity(self):
        # Pickling a tree, then unpickling it, yields a tree identical
        # to the original.
        tree = self.soup("<a><b>foo</a>")
        dumped = pickle.dumps(tree, 2)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.__class__, BeautifulSoup)
        self.assertEqual(loaded.decode(), tree.decode())

    def test_docstring_generated(self):
        # Encoding a document that had no XML declaration adds one.
        soup = self.soup("<root/>")
        self.assertEqual(
            soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')

    def test_xml_declaration(self):
        markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode("utf8"))

    def test_real_xhtml_document(self):
        """A real XHTML document should come out *exactly* the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8"), markup)

    def test_formatter_processes_script_tag_for_xml_documents(self):
        doc = """
  <script type="text/javascript">
  </script>
"""
        soup = BeautifulSoup(doc, "lxml-xml")
        # lxml would have stripped this while parsing, but we can add
        # it later.
        soup.script.string = 'console.log("< < hey > > ");'
        encoded = soup.encode()
        # In an XML document the script body is escaped like any
        # other text.
        self.assertIn(b"&lt; &lt; hey &gt; &gt;", encoded)

    def test_can_parse_unicode_document(self):
        markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
        soup = self.soup(markup)
        self.assertEqual('Sacr\xe9 bleu!', soup.root.string)

    def test_popping_namespaced_tag(self):
        markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
        soup = self.soup(markup)
        self.assertEqual(
            str(soup.rss), markup)

    def test_docstring_includes_correct_encoding(self):
        # The generated XML declaration names the output encoding.
        soup = self.soup("<root/>")
        self.assertEqual(
            soup.encode("latin1"),
            b'<?xml version="1.0" encoding="latin1"?>\n<root/>')

    def test_large_xml_document(self):
        """A large XML document should come out the same as it went in."""
        markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
                  + b'0' * (2**12)
                  + b'</root>')
        soup = self.soup(markup)
        self.assertEqual(soup.encode("utf-8"), markup)

    def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
        self.assertSoupEquals("<p>", "<p/>")
        self.assertSoupEquals("<p>foo</p>")

    def test_namespaces_are_preserved(self):
        markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
        soup = self.soup(markup)
        root = soup.root
        self.assertEqual("http://example.com/", root['xmlns:a'])
        self.assertEqual("http://example.net/", root['xmlns:b'])

    def test_closing_namespaced_tag(self):
        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
        soup = self.soup(markup)
        self.assertEqual(str(soup.p), markup)

    def test_namespaced_attributes(self):
        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
        soup = self.soup(markup)
        self.assertEqual(str(soup.foo), markup)

    def test_namespaced_attributes_xml_namespace(self):
        markup = '<foo xml:lang="fr">bar</foo>'
        soup = self.soup(markup)
        self.assertEqual(str(soup.foo), markup)


class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
    """Smoke test for a tree builder that supports HTML5."""

    def test_real_xhtml_document(self):
        # Since XHTML is not HTML5, HTML5 parsers are not tested to handle
        # XHTML documents in any particular way.
        pass

    def test_html_tags_have_namespace(self):
        markup = "<a>"
        soup = self.soup(markup)
        self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)

    def test_svg_tags_have_namespace(self):
        markup = '<svg><circle/></svg>'
        soup = self.soup(markup)
        namespace = "http://www.w3.org/2000/svg"
        self.assertEqual(namespace, soup.svg.namespace)
        self.assertEqual(namespace, soup.circle.namespace)

    def test_mathml_tags_have_namespace(self):
        markup = '<math><msqrt>5</msqrt></math>'
        soup = self.soup(markup)
        namespace = 'http://www.w3.org/1998/Math/MathML'
        self.assertEqual(namespace, soup.math.namespace)
        self.assertEqual(namespace, soup.msqrt.namespace)

    def test_xml_declaration_becomes_comment(self):
        markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
        soup = self.soup(markup)
        self.assertIsInstance(soup.contents[0], Comment)
        self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
        self.assertEqual("html", soup.contents[0].next_element.name)


def skipIf(condition, reason):
    """Return a decorator that disables a test when ``condition`` is true.

    When ``condition`` is true the decorated item is replaced by a
    function that accepts any arguments and returns None; otherwise
    the item is returned unchanged. ``reason`` is accepted but not
    used.
    """
    def nothing(test, *args, **kwargs):
        return None

    def decorator(test_item):
        if condition:
            return nothing
        else:
            return test_item

    return decorator