xref: /openbmc/openbmc/poky/bitbake/lib/bs4/testing.py (revision 4b740dc9)
1"""Helper classes for tests."""
2
3__license__ = "MIT"
4
5import pickle
6import copy
7import unittest
8from unittest import TestCase
9from bs4 import BeautifulSoup
10from bs4.element import (
11    CharsetMetaAttributeValue,
12    Comment,
13    ContentMetaAttributeValue,
14    Doctype,
15    SoupStrainer,
16)
17
18from bs4.builder._htmlparser import HTMLParserTreeBuilder
19default_builder = HTMLParserTreeBuilder
20
21
class SoupTest(unittest.TestCase):
    """Base test case providing Beautiful Soup parsing and assertion helpers.

    Every helper obtains its tree builder through the `default_builder`
    property (except that `soup()` also accepts an explicit `builder`
    keyword argument).
    """

    @property
    def default_builder(self):
        """Return a freshly constructed module-level default tree builder."""
        return default_builder()

    def soup(self, markup, **kwargs):
        """Build a Beautiful Soup object from markup.

        A `builder` keyword argument overrides the default builder; all
        other keyword arguments are passed through to BeautifulSoup.
        """
        builder = kwargs.pop('builder', self.default_builder)
        return BeautifulSoup(markup, builder=builder, **kwargs)

    def document_for(self, markup):
        """Turn an HTML fragment into a document.

        The details depend on the builder.
        """
        return self.default_builder.test_fragment_to_document(markup)

    def assertSoupEquals(self, to_parse, compare_parsed_to=None):
        """Assert that parsing `to_parse` yields the expected document.

        The parsed markup is decoded and compared with the document the
        builder generates for `compare_parsed_to`. When
        `compare_parsed_to` is None, the markup is expected to come out
        the same as it went in.
        """
        builder = self.default_builder
        obj = BeautifulSoup(to_parse, builder=builder)
        if compare_parsed_to is None:
            compare_parsed_to = to_parse

        self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))

    def assertConnectedness(self, element):
        """Ensure that next_element and previous_element are properly
        set for all descendants of the given element.
        """
        earlier = None
        for e in element.descendants:
            # Compare against None explicitly: a descendant may be falsy
            # (for example an empty string) and a plain truth test would
            # silently skip the link check for the element that follows it.
            if earlier is not None:
                self.assertEqual(e, earlier.next_element)
                self.assertEqual(earlier, e.previous_element)
            earlier = e
58
class HTMLTreeBuilderSmokeTest(SoupTest):

    """A basic test of a treebuilder's competence.

    Any HTML treebuilder, present or future, should be able to pass
    these tests. With invalid markup, there's room for interpretation,
    and different parsers can handle it differently. But with the
    markup in these tests, there's not much room for interpretation.
    """

    def test_pickle_and_unpickle_identity(self):
        # Pickling a tree, then unpickling it, yields a tree identical
        # to the original.
        tree = self.soup("<a><b>foo</a>")
        dumped = pickle.dumps(tree, 2)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.__class__, BeautifulSoup)
        self.assertEqual(loaded.decode(), tree.decode())

    def assertDoctypeHandled(self, doctype_fragment):
        """Assert that a given doctype string is handled correctly."""
        doctype_str, soup = self._document_with_doctype(doctype_fragment)

        # Make sure a Doctype object was created.
        doctype = soup.contents[0]
        self.assertEqual(doctype.__class__, Doctype)
        self.assertEqual(doctype, doctype_fragment)
        self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)

        # Make sure that the doctype was correctly associated with the
        # parse tree and that the rest of the document parsed.
        self.assertEqual(soup.p.contents[0], 'foo')

    def _document_with_doctype(self, doctype_fragment):
        """Generate and parse a document with the given doctype.

        Returns a (doctype declaration string, parsed soup) tuple.
        """
        doctype = '<!DOCTYPE %s>' % doctype_fragment
        markup = doctype + '\n<p>foo</p>'
        soup = self.soup(markup)
        return doctype, soup

    def test_normal_doctypes(self):
        """Make sure normal, everyday HTML doctypes are handled correctly."""
        self.assertDoctypeHandled("html")
        self.assertDoctypeHandled(
            'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')

    def test_empty_doctype(self):
        # A doctype declaration with no content yields an empty doctype.
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        self.assertEqual("", doctype.strip())

    def test_public_doctype_with_url(self):
        doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
        self.assertDoctypeHandled(doctype)

    def test_system_doctype(self):
        self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')

    def test_namespaced_system_doctype(self):
        # We can handle a namespaced doctype with a system ID.
        self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')

    def test_namespaced_public_doctype(self):
        # Test a namespaced doctype with a public id.
        self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')

    def test_real_xhtml_document(self):
        """A real XHTML document should come out more or less the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        # Newlines are stripped from both sides, so only the remaining
        # content has to match.
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b""),
            markup.replace(b"\n", b""))

    def test_processing_instruction(self):
        # A processing instruction survives a parse/encode round trip.
        markup = b"""<?PITarget PIContent?>"""
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode("utf8"))

    def test_deepcopy(self):
        """Make sure you can copy the tree builder.

        This is important because the builder is part of a
        BeautifulSoup object, and we want to be able to copy that.
        """
        copy.deepcopy(self.default_builder)

    def test_p_tag_is_never_empty_element(self):
        """A <p> tag is never designated as an empty-element tag.

        Even if the markup shows it as an empty-element tag, it
        shouldn't be presented that way.
        """
        soup = self.soup("<p/>")
        self.assertFalse(soup.p.is_empty_element)
        self.assertEqual(str(soup.p), "<p></p>")

    def test_unclosed_tags_get_closed(self):
        """A tag that's not closed by the end of the document should be closed.

        This applies to all tags except empty-element tags.
        """
        self.assertSoupEquals("<p>", "<p></p>")
        self.assertSoupEquals("<b>", "<b></b>")

        self.assertSoupEquals("<br>", "<br/>")

    def test_br_is_always_empty_element_tag(self):
        """A <br> tag is designated as an empty-element tag.

        Some parsers treat <br></br> as one <br/> tag, some parsers as
        two tags, but it should always be an empty-element tag.
        """
        soup = self.soup("<br></br>")
        self.assertTrue(soup.br.is_empty_element)
        self.assertEqual(str(soup.br), "<br/>")

    def test_nested_formatting_elements(self):
        self.assertSoupEquals("<em><em></em></em>")

    def test_double_head(self):
        # A <script> tag placed between </head> and <body> must still be
        # findable, with its attributes intact.
        html = '''<!DOCTYPE html>
<html>
<head>
<title>Ordinary HEAD element test</title>
</head>
<script type="text/javascript">
alert("Help!");
</script>
<body>
Hello, world!
</body>
</html>
'''
        soup = self.soup(html)
        self.assertEqual("text/javascript", soup.find('script')['type'])

    def test_comment(self):
        # Comments are represented as Comment objects.
        markup = "<p>foo<!--foobar-->baz</p>"
        self.assertSoupEquals(markup)

        soup = self.soup(markup)
        comment = soup.find(text="foobar")
        self.assertEqual(comment.__class__, Comment)

        # The comment is properly integrated into the tree.
        foo = soup.find(text="foo")
        self.assertEqual(comment, foo.next_element)
        baz = soup.find(text="baz")
        self.assertEqual(comment, baz.previous_element)

    def test_preserved_whitespace_in_pre_and_textarea(self):
        """Whitespace must be preserved in <pre> and <textarea> tags."""
        self.assertSoupEquals("<pre>   </pre>")
        self.assertSoupEquals("<textarea> woo  </textarea>")

    def test_nested_inline_elements(self):
        """Inline elements can be nested indefinitely."""
        b_tag = "<b>Inside a B tag</b>"
        self.assertSoupEquals(b_tag)

        nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
        self.assertSoupEquals(nested_b_tag)

        # Check the doubly nested markup itself; asserting nested_b_tag a
        # second time would leave this case untested.
        double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
        self.assertSoupEquals(double_nested_b_tag)

    def test_nested_block_level_elements(self):
        """Block elements can be nested."""
        soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
        blockquote = soup.blockquote
        self.assertEqual(blockquote.p.b.string, 'Foo')
        self.assertEqual(blockquote.b.string, 'Foo')

    def test_correctly_nested_tables(self):
        """One table can go inside another one."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tr><td>Here\'s another table:'
            '<table id="2"><tr><td>foo</td></tr></table>'
            '</td></tr></table>')

        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_deeply_nested_multivalued_attribute(self):
        # html5lib can set the attributes of the same tag many times
        # as it rearranges the tree. This has caused problems with
        # multivalued attributes.
        markup = '<table><div><div class="css"></div></div></table>'
        soup = self.soup(markup)
        self.assertEqual(["css"], soup.div.div['class'])

    def test_multivalued_attribute_on_html(self):
        # html5lib uses a different API to set the attributes of the
        # <html> tag. This has caused problems with multivalued
        # attributes.
        markup = '<html class="a b"></html>'
        soup = self.soup(markup)
        self.assertEqual(["a", "b"], soup.html['class'])

    def test_angle_brackets_in_attribute_values_are_escaped(self):
        self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')

    def test_entities_in_attributes_converted_to_unicode(self):
        # Decimal, hexadecimal (either case of the "x") and named
        # entities all resolve to the same character.
        expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
        self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)

    def test_entities_in_text_converted_to_unicode(self):
        # Same as above, but for entities found in text nodes.
        expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
        self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)

    def test_quot_entity_converted_to_quotation_mark(self):
        self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
                              '<p>I said "good day!"</p>')

    def test_out_of_range_entity(self):
        # Numeric entities that are out of range become the Unicode
        # replacement character.
        expect = "\N{REPLACEMENT CHARACTER}"
        self.assertSoupEquals("&#10000000000000;", expect)
        self.assertSoupEquals("&#x10000000000000;", expect)
        self.assertSoupEquals("&#1000000000;", expect)

    def test_multipart_strings(self):
        "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
        soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
        self.assertEqual("p", soup.h2.string.next_element.name)
        self.assertEqual("p", soup.p.name)
        self.assertConnectedness(soup)

    def test_head_tag_between_head_and_body(self):
        "Prevent recurrence of a bug in the html5lib treebuilder."
        content = """<html><head></head>
  <link></link>
  <body>foo</body>
</html>
"""
        soup = self.soup(content)
        self.assertIsNotNone(soup.html.body)
        self.assertConnectedness(soup)

    def test_multiple_copies_of_a_tag(self):
        "Prevent recurrence of a bug in the html5lib treebuilder."
        content = """<!DOCTYPE html>
<html>
 <body>
   <article id="a" >
   <div><a href="1"></div>
   <footer>
     <a href="2"></a>
   </footer>
  </article>
  </body>
</html>
"""
        soup = self.soup(content)
        self.assertConnectedness(soup.article)

    def test_basic_namespaces(self):
        """Parsers don't need to *understand* namespaces, but at the
        very least they should not choke on namespaces or lose
        data."""

        markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode())
        self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
        self.assertEqual(
            'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
        self.assertEqual(
            'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])

    def test_multivalued_attribute_value_becomes_list(self):
        markup = b'<a class="foo bar">'
        soup = self.soup(markup)
        self.assertEqual(['foo', 'bar'], soup.a['class'])

    #
    # Generally speaking, tests below this point are more tests of
    # Beautiful Soup than tests of the tree builders. But parsers are
    # weird, so we run these tests separately for every tree builder
    # to detect any differences between them.
    #

    def test_can_parse_unicode_document(self):
        # A seemingly innocuous document... but it's in Unicode! And
        # it contains characters that can't be represented in the
        # encoding named in the <meta> tag! The horror!
        markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
        soup = self.soup(markup)
        self.assertEqual('Sacr\xe9 bleu!', soup.body.string)

    def test_soupstrainer(self):
        """Parsers should be able to work with SoupStrainers."""
        strainer = SoupStrainer("b")
        soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
                         parse_only=strainer)
        self.assertEqual(soup.decode(), "<b>bold</b>")

    def test_single_quote_attribute_values_become_double_quotes(self):
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

    def test_attribute_values_with_nested_quotes_are_left_alone(self):
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        self.assertSoupEquals(text)

    def test_attribute_values_with_double_nested_quotes_get_quoted(self):
        # A value containing both kinds of quote is written out with
        # its double quotes turned into &quot; entities.
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = self.soup(text)
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        self.assertSoupEquals(
            soup.foo.decode(),
            """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")

    def test_ampersand_in_attribute_value_gets_escaped(self):
        self.assertSoupEquals('<this is="really messed up & stuff"></this>',
                              '<this is="really messed up &amp; stuff"></this>')

        self.assertSoupEquals(
            '<a href="http://example.org?a=1&b=2;3">foo</a>',
            '<a href="http://example.org?a=1&amp;b=2;3">foo</a>')

    def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
        self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')

    def test_entities_in_strings_converted_during_parsing(self):
        # Both XML and HTML entities are converted to Unicode characters
        # during parsing.
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
        self.assertSoupEquals(text, expected)

    def test_smart_quotes_converted_on_the_way_in(self):
        # Microsoft smart quotes are converted to Unicode characters during
        # parsing.
        quote = b"<p>\x91Foo\x92</p>"
        soup = self.soup(quote)
        self.assertEqual(
            soup.p.string,
            "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")

    def test_non_breaking_spaces_converted_on_the_way_in(self):
        soup = self.soup("<a>&nbsp;&nbsp;</a>")
        self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)

    def test_entities_converted_on_the_way_out(self):
        # Encoding a tag yields bytes in the requested encoding, with
        # the angle brackets still escaped.
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
        soup = self.soup(text)
        self.assertEqual(soup.p.encode("utf-8"), expected)

    def test_real_iso_latin_document(self):
        # Smoke test of interrelated functionality, using an
        # easy-to-understand document.

        # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
        unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'

        # That's because we're going to encode it into ISO-Latin-1, and use
        # that to test.
        iso_latin_html = unicode_html.encode("iso-8859-1")

        # Parse the ISO-Latin-1 HTML.
        soup = self.soup(iso_latin_html)
        # Encode it to UTF-8.
        result = soup.encode("utf-8")

        # What do we expect the result to look like? Well, it would
        # look like unicode_html, except that the META tag would say
        # UTF-8 instead of ISO-Latin-1.
        expected = unicode_html.replace("ISO-Latin-1", "utf-8")

        # And, of course, it would be in UTF-8, not Unicode.
        expected = expected.encode("utf-8")

        # Ta-da!
        self.assertEqual(result, expected)

    def test_real_shift_jis_document(self):
        # Smoke test to make sure the parser can handle a document in
        # Shift-JIS encoding, without choking.
        shift_jis_html = (
            b'<html><head></head><body><pre>'
            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
            b'</pre></body></html>')
        unicode_html = shift_jis_html.decode("shift-jis")
        soup = self.soup(unicode_html)

        # Make sure the parse tree is correctly encoded to various
        # encodings.
        self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
        self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))

    def test_real_hebrew_document(self):
        # A real-world test to make sure we can convert ISO-8859-8 (a
        # Hebrew encoding) to UTF-8.
        hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
        soup = self.soup(
            hebrew_document, from_encoding="iso8859-8")
        self.assertEqual(soup.original_encoding, 'iso8859-8')
        self.assertEqual(
            soup.encode('utf-8'),
            hebrew_document.decode("iso8859-8").encode("utf-8"))

    def test_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
        content = parsed_meta['content']
        self.assertEqual('text/html; charset=x-sjis', content)

        # But that value is actually a ContentMetaAttributeValue object.
        self.assertIsInstance(content, ContentMetaAttributeValue)

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('text/html; charset=utf8', content.encode("utf8"))

        # For the rest of the story, see TestSubstitutions in
        # test_tree.py.

    def test_html5_style_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta id="encoding" charset="x-sjis" />')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', id="encoding")
        charset = parsed_meta['charset']
        self.assertEqual('x-sjis', charset)

        # But that value is actually a CharsetMetaAttributeValue object.
        self.assertIsInstance(charset, CharsetMetaAttributeValue)

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('utf8', charset.encode("utf8"))

    def test_tag_with_no_attributes_can_have_attributes_added(self):
        data = self.soup("<a>text</a>")
        data.a['foo'] = 'bar'
        self.assertEqual('<a foo="bar">text</a>', data.a.decode())
543
class XMLTreeBuilderSmokeTest(SoupTest):
    """Smoke test for a tree builder that parses XML.

    Most tests parse a small XML document and check that the tree
    serializes back to the expected markup.
    """

    def test_pickle_and_unpickle_identity(self):
        # Pickling a tree, then unpickling it, yields a tree identical
        # to the original.
        tree = self.soup("<a><b>foo</a>")
        dumped = pickle.dumps(tree, 2)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.__class__, BeautifulSoup)
        self.assertEqual(loaded.decode(), tree.decode())

    def test_docstring_generated(self):
        # Encoding a document that had no XML declaration adds one.
        soup = self.soup("<root/>")
        self.assertEqual(
            soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')

    def test_xml_declaration(self):
        # An existing XML declaration survives a round trip unchanged.
        markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode("utf8"))

    def test_real_xhtml_document(self):
        """A real XHTML document should come out *exactly* the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8"), markup)

    def test_formatter_processes_script_tag_for_xml_documents(self):
        doc = """
  <script type="text/javascript">
  </script>
"""
        # This test always uses the "lxml-xml" parser rather than the
        # builder under test.
        soup = BeautifulSoup(doc, "lxml-xml")
        # lxml would have stripped this while parsing, but we can add
        # it later.
        soup.script.string = 'console.log("< < hey > > ");'
        encoded = soup.encode()
        # In an XML document the contents of <script> get escaped.
        self.assertIn(b"&lt; &lt; hey &gt; &gt;", encoded)

    def test_can_parse_unicode_document(self):
        # The markup is already Unicode, so the encoding named in its
        # declaration must not get in the way.
        markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
        soup = self.soup(markup)
        self.assertEqual('Sacr\xe9 bleu!', soup.root.string)

    def test_popping_namespaced_tag(self):
        # Namespaced tags followed by a plain tag must serialize back
        # in their original structure.
        markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
        soup = self.soup(markup)
        self.assertEqual(
            str(soup.rss), markup)

    def test_docstring_includes_correct_encoding(self):
        # The generated XML declaration names the encoding that was
        # requested for output.
        soup = self.soup("<root/>")
        self.assertEqual(
            soup.encode("latin1"),
            b'<?xml version="1.0" encoding="latin1"?>\n<root/>')

    def test_large_xml_document(self):
        """A large XML document should come out the same as it went in."""
        markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
                  + b'0' * (2**12)
                  + b'</root>')
        soup = self.soup(markup)
        self.assertEqual(soup.encode("utf-8"), markup)

    def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
        self.assertSoupEquals("<p>", "<p/>")
        self.assertSoupEquals("<p>foo</p>")

    def test_namespaces_are_preserved(self):
        markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
        soup = self.soup(markup)
        root = soup.root
        self.assertEqual("http://example.com/", root['xmlns:a'])
        self.assertEqual("http://example.net/", root['xmlns:b'])

    def test_closing_namespaced_tag(self):
        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
        soup = self.soup(markup)
        self.assertEqual(str(soup.p), markup)

    def test_namespaced_attributes(self):
        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
        soup = self.soup(markup)
        self.assertEqual(str(soup.foo), markup)

    def test_namespaced_attributes_xml_namespace(self):
        # The "xml" prefix is used here without being declared.
        markup = '<foo xml:lang="fr">bar</foo>'
        soup = self.soup(markup)
        self.assertEqual(str(soup.foo), markup)
640
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
    """Smoke test for a tree builder that supports HTML5."""

    def test_real_xhtml_document(self):
        # Since XHTML is not HTML5, HTML5 parsers are not tested to handle
        # XHTML documents in any particular way.
        pass

    def test_html_tags_have_namespace(self):
        # Ordinary HTML tags are placed in the XHTML namespace.
        markup = "<a>"
        soup = self.soup(markup)
        self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)

    def test_svg_tags_have_namespace(self):
        # An <svg> tag and the tags inside it are in the SVG namespace.
        markup = '<svg><circle/></svg>'
        soup = self.soup(markup)
        namespace = "http://www.w3.org/2000/svg"
        self.assertEqual(namespace, soup.svg.namespace)
        self.assertEqual(namespace, soup.circle.namespace)

    def test_mathml_tags_have_namespace(self):
        # A <math> tag and the tags inside it are in the MathML namespace.
        markup = '<math><msqrt>5</msqrt></math>'
        soup = self.soup(markup)
        namespace = 'http://www.w3.org/1998/Math/MathML'
        self.assertEqual(namespace, soup.math.namespace)
        self.assertEqual(namespace, soup.msqrt.namespace)

    def test_xml_declaration_becomes_comment(self):
        # An XML declaration in front of an HTML document is kept as a
        # Comment node, immediately followed by the <html> tag.
        markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
        soup = self.soup(markup)
        self.assertIsInstance(soup.contents[0], Comment)
        self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
        self.assertEqual("html", soup.contents[0].next_element.name)
675
def skipIf(condition, reason):
    """Return a decorator that disables a test item when `condition` is true.

    When `condition` is truthy, the decorated item is replaced by a
    function that accepts any arguments (at least one positional) and
    returns None. Otherwise the item is returned unchanged. `reason`
    is accepted but not used.
    """
    def nothing(test, *args, **kwargs):
        # Stand-in for a disabled test: does nothing at all.
        return None

    def decorator(test_item):
        return nothing if condition else test_item

    return decorator
687