1# -*- coding: utf-8 -*-
2"""Tests for Beautiful Soup's tree traversal methods.
3
4The tree traversal methods are the main advantage of using Beautiful
5Soup over just using a parser.
6
7Different parsers will build different Beautiful Soup trees given the
8same markup, but all Beautiful Soup trees can be traversed with the
9methods tested here.
10"""
11
12import copy
13import pickle
14import re
15import warnings
16from bs4 import BeautifulSoup
17from bs4.builder import builder_registry
18from bs4.element import (
19    PY3K,
20    CData,
21    Comment,
22    Declaration,
23    Doctype,
24    NavigableString,
25    SoupStrainer,
26    Tag,
27)
28from bs4.testing import SoupTest
29
# Flags recording whether the builder registry can resolve the "xml" and
# "lxml" features; tests use them to guard parser-specific checks.
XML_BUILDER_PRESENT = builder_registry.lookup("xml") is not None
LXML_PRESENT = builder_registry.lookup("lxml") is not None
32
class TreeTest(SoupTest):
    """Base class providing assertion helpers for tree-search results."""

    def assertSelects(self, tags, should_match):
        """Assert that each tag's `.string` equals the expected text.

        Meant for tests whose markup is a handful of tags, each wrapping
        a single string, where some search mechanism is expected to pick
        out particular tags in a particular order.
        """
        found_strings = [tag.string for tag in tags]
        self.assertEqual(found_strings, should_match)

    def assertSelectsIDs(self, tags, should_match):
        """Assert that each tag's 'id' attribute equals the expected ID.

        Meant for tests whose markup is a handful of tags, each carrying
        an 'id' attribute, where some search mechanism is expected to
        pick out particular tags in a particular order.
        """
        found_ids = [tag['id'] for tag in tags]
        self.assertEqual(found_ids, should_match)
52
53
class TestFind(TreeTest):
    """Basic tests of the find() method.

    find() just calls find_all() with limit=1, so it's not tested all
    that thoroughly here.
    """

    def test_find_tag(self):
        # Of the two <b> tags, find() hands back the first one.
        soup = self.soup("<a>1</a><b>2</b><a>3</a><b>4</b>")
        first_b = soup.find("b")
        self.assertEqual(first_b.string, "2")

    def test_unicode_text_find(self):
        # A non-ASCII string can be located by its text.
        soup = self.soup('<h1>Räksmörgås</h1>')
        found = soup.find(string='Räksmörgås')
        self.assertEqual(found, 'Räksmörgås')

    def test_unicode_attribute_find(self):
        soup = self.soup('<h1 id="Räksmörgås">here it is</h1>')
        # The result is discarded; presumably this only checks that
        # stringifying a soup with a non-ASCII attribute does not raise.
        str(soup)
        heading = soup.find(id='Räksmörgås')
        self.assertEqual("here it is", heading.text)

    def test_find_everything(self):
        """Test an optimization that finds all tags."""
        soup = self.soup("<a>foo</a><b>bar</b>")
        all_tags = soup.find_all()
        self.assertEqual(2, len(all_tags))

    def test_find_everything_with_name(self):
        """Test an optimization that finds all tags with a given name."""
        soup = self.soup("<a>foo</a><b>bar</b><a>baz</a>")
        a_tags = soup.find_all('a')
        self.assertEqual(2, len(a_tags))
84
class TestFindAll(TreeTest):
    """Basic tests of the find_all() method."""

    def test_find_all_text_nodes(self):
        """You can search the tree for text nodes."""
        soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
        every_string = ["Foo", "bar", '\xbb']

        # A plain string is an exact match, through either keyword.
        self.assertEqual(soup.find_all(string="bar"), ["bar"])
        self.assertEqual(soup.find_all(text="bar"), ["bar"])

        # A list matches any one of its members.
        wanted = ["Foo", "bar"]
        self.assertEqual(soup.find_all(text=wanted), ["Foo", "bar"])

        # A compiled regular expression can be used as the matcher.
        anything = re.compile('.*')
        self.assertEqual(soup.find_all(text=anything), every_string)

        # True matches every string.
        self.assertEqual(soup.find_all(text=True), every_string)

    def test_find_all_limit(self):
        """You can limit the number of items returned by find_all."""
        soup = self.soup("<a>1</a><a>2</a><a>3</a><a>4</a><a>5</a>")
        everything = ["1", "2", "3", "4", "5"]

        self.assertSelects(soup.find_all('a', limit=3), ["1", "2", "3"])
        self.assertSelects(soup.find_all('a', limit=1), ["1"])
        # A limit larger than the number of matches returns them all.
        self.assertSelects(soup.find_all('a', limit=10), everything)

        # A limit of 0 means no limit.
        self.assertSelects(soup.find_all('a', limit=0), everything)

    def test_calling_a_tag_is_calling_findall(self):
        # Calling the soup, or a tag inside it, like a function
        # behaves as a find_all() call on that object.
        soup = self.soup("<a>1</a><b>2<a id='foo'>3</a></b>")
        from_root = soup('a', limit=1)
        self.assertSelects(from_root, ["1"])
        from_b = soup.b(id="foo")
        self.assertSelects(from_b, ["3"])

    def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self):
        soup = self.soup("<a></a>")
        # Build a list whose only member is the list itself.
        recursive_list = []
        recursive_list.append(recursive_list)

        # Without special code in _normalize_search_value, this would cause infinite
        # recursion.
        self.assertEqual([], soup.find_all(recursive_list))

    def test_find_all_resultset(self):
        """All find_all calls return a ResultSet"""
        soup = self.soup("<a></a>")

        # Whatever the search criterion, the result exposes a 'source'
        # attribute.
        by_name = soup.find_all("a")
        self.assertTrue(hasattr(by_name, "source"))

        by_true = soup.find_all(True)
        self.assertTrue(hasattr(by_true, "source"))

        by_text = soup.find_all(text="foo")
        self.assertTrue(hasattr(by_text, "source"))
142
143
class TestFindAllBasicNamespaces(TreeTest):
    """Searching by prefixed (namespaced) tag and attribute names."""

    def test_find_by_namespaced_name(self):
        soup = self.soup('<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">')

        # A prefixed tag name can be searched for as-is.
        sqrt_tag = soup.find("mathml:msqrt")
        self.assertEqual("4", sqrt_tag.string)

        # So can a prefixed attribute name, via the attrs dictionary.
        filled = soup.find(attrs={"svg:fill": "red"})
        self.assertEqual("a", filled.name)
150
151
class TestFindAllByName(TreeTest):
    """Test ways of finding tags by tag name."""

    def setUp(self):
        # Name this class (not TreeTest) in super(), so the method lookup
        # starts right after TestFindAllByName in the MRO and a setUp()
        # defined on an intermediate base class is never skipped.
        super(TestFindAllByName, self).setUp()
        self.tree = self.soup("""<a>First tag.</a>
                                  <b>Second tag.</b>
                                  <c>Third <a>Nested tag.</a> tag.</c>""")

    def test_find_all_by_tag_name(self):
        # Find all the <a> tags.
        self.assertSelects(
            self.tree.find_all('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_name_and_text(self):
        # A tag name can be combined with a text criterion: an exact
        # string, True, or a compiled regular expression.
        self.assertSelects(
            self.tree.find_all('a', text='First tag.'), ['First tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=re.compile("tag")),
            ['First tag.', 'Nested tag.'])

    def test_find_all_on_non_root_element(self):
        # You can call find_all on any node, not just the root.
        self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.'])

    def test_calling_element_invokes_find_all(self):
        # Calling the tree like a function is a find_all() call.
        self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_strainer(self):
        # A SoupStrainer can stand in for the tag name.
        self.assertSelects(
            self.tree.find_all(SoupStrainer('a')),
            ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_names(self):
        # A list of names matches tags with any of those names.
        self.assertSelects(
            self.tree.find_all(['a', 'b']),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_dict(self):
        # A dictionary keyed by tag name selects the same tags as the
        # equivalent list of names.
        self.assertSelects(
            self.tree.find_all({'a' : True, 'b' : True}),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_re(self):
        # A compiled regular expression is matched against tag names.
        self.assertSelects(
            self.tree.find_all(re.compile('^[ab]$')),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_with_tags_matching_method(self):
        # You can define an oracle method that determines whether
        # a tag matches the search.
        def id_matches_name(tag):
            return tag.name == tag.get('id')

        tree = self.soup("""<a id="a">Match 1.</a>
                            <a id="1">Does not match.</a>
                            <b id="b">Match 2.</a>""")

        self.assertSelects(
            tree.find_all(id_matches_name), ["Match 1.", "Match 2."])
217
218
class TestFindAllByAttribute(TreeTest):
    """Test ways of finding tags by attribute value."""

    def test_find_all_by_attribute_name(self):
        # You can pass in keyword arguments to find_all to search by
        # attribute.
        tree = self.soup("""
                         <a id="first">Matching a.</a>
                         <a id="second">
                          Non-matching <b id="first">Matching b.</b>a.
                         </a>""")
        self.assertSelects(tree.find_all(id='first'),
                           ["Matching a.", "Matching b."])

    def test_find_all_by_utf8_attribute_value(self):
        # The attribute value can be given as UTF-8 bytes, as the
        # decoded string, or as a list containing the bytes.
        peace = "םולש".encode("utf8")
        data = '<a title="םולש"></a>'.encode("utf8")
        soup = self.soup(data)
        self.assertEqual([soup.a], soup.find_all(title=peace))
        self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
        self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"]))

    def test_find_all_by_attribute_dict(self):
        # You can pass in a dictionary as the argument 'attrs'. This
        # lets you search for attributes like 'name' (a fixed argument
        # to find_all) and 'class' (a reserved word in Python.)
        tree = self.soup("""
                         <a name="name1" class="class1">Name match.</a>
                         <a name="name2" class="class2">Class match.</a>
                         <a name="name3" class="class3">Non-match.</a>
                         <name1>A tag called 'name1'.</name1>
                         """)

        # This doesn't do what you want.
        self.assertSelects(tree.find_all(name='name1'),
                           ["A tag called 'name1'."])
        # This does what you want.
        self.assertSelects(tree.find_all(attrs={'name' : 'name1'}),
                           ["Name match."])

        self.assertSelects(tree.find_all(attrs={'class' : 'class2'}),
                           ["Class match."])

    def test_find_all_by_class(self):
        tree = self.soup("""
                         <a class="1">Class 1.</a>
                         <a class="2">Class 2.</a>
                         <b class="1">Class 1.</b>
                         <c class="3 4">Class 3 and 4.</c>
                         """)

        # Passing in the class_ keyword argument will search against
        # the 'class' attribute.
        self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.'])
        self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.'])
        self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.'])

        # Passing in a string to 'attrs' will also search the CSS class.
        self.assertSelects(tree.find_all('a', '1'), ['Class 1.'])
        self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.'])
        self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.'])
        self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.'])

    def test_find_by_class_when_multiple_classes_present(self):
        tree = self.soup("<gar class='foo bar'>Found it</gar>")

        # A regular expression that matches either individual class
        # finds the tag.
        f = tree.find_all("gar", class_=re.compile("o"))
        self.assertSelects(f, ["Found it"])

        f = tree.find_all("gar", class_=re.compile("a"))
        self.assertSelects(f, ["Found it"])

        # Since the class is not the string "foo bar", but the two
        # strings "foo" and "bar", this will not find anything.
        f = tree.find_all("gar", class_=re.compile("o b"))
        self.assertSelects(f, [])

    def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
        soup = self.soup("<a class='bar'>Found it</a>")

        # A regular expression passed as 'attrs' is matched against
        # the class.
        self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"])

        # So is a function: it is called with the class value.
        def big_attribute_value(value):
            return len(value) > 3

        self.assertSelects(soup.find_all("a", big_attribute_value), [])

        def small_attribute_value(value):
            return len(value) <= 3

        self.assertSelects(
            soup.find_all("a", small_attribute_value), ["Found it"])

    def test_find_all_with_string_for_attrs_finds_multiple_classes(self):
        soup = self.soup('<a class="foo bar"></a><a class="foo"></a>')
        a, a2 = soup.find_all("a")
        self.assertEqual([a, a2], soup.find_all("a", "foo"))
        self.assertEqual([a], soup.find_all("a", "bar"))

        # If you specify the class as a string that contains a
        # space, only that specific value will be found.
        self.assertEqual([a], soup.find_all("a", class_="foo bar"))
        self.assertEqual([a], soup.find_all("a", "foo bar"))
        self.assertEqual([], soup.find_all("a", "bar foo"))

    def test_find_all_by_attribute_soupstrainer(self):
        tree = self.soup("""
                         <a id="first">Match.</a>
                         <a id="second">Non-match.</a>""")

        # A SoupStrainer built from an attrs dictionary can be passed
        # as the search criterion.
        strainer = SoupStrainer(attrs={'id' : 'first'})
        self.assertSelects(tree.find_all(strainer), ['Match.'])

    def test_find_all_with_missing_atribute(self):
        # You can pass in None as the value of an attribute to find_all.
        # This will match tags that do not have that attribute set.
        tree = self.soup("""<a id="1">ID present.</a>
                            <a>No ID present.</a>
                            <a id="">ID is empty.</a>""")
        self.assertSelects(tree.find_all('a', id=None), ["No ID present."])

    def test_find_all_with_defined_attribute(self):
        # You can pass in True as the value of an attribute to find_all.
        # This will match tags that have that attribute set to any value,
        # including the empty string.
        tree = self.soup("""<a id="1">ID present.</a>
                            <a>No ID present.</a>
                            <a id="">ID is empty.</a>""")
        self.assertSelects(
            tree.find_all(id=True), ["ID present.", "ID is empty."])

    def test_find_all_with_numeric_attribute(self):
        # If you search for a number, it's treated as a string.
        tree = self.soup("""<a id=1>Unquoted attribute.</a>
                            <a id="1">Quoted attribute.</a>""")

        expected = ["Unquoted attribute.", "Quoted attribute."]
        self.assertSelects(tree.find_all(id=1), expected)
        self.assertSelects(tree.find_all(id="1"), expected)

    def test_find_all_with_list_attribute_values(self):
        # You can pass a list of attribute values instead of just one,
        # and you'll get tags that match any of the values.
        tree = self.soup("""<a id="1">1</a>
                            <a id="2">2</a>
                            <a id="3">3</a>
                            <a>No ID.</a>""")
        self.assertSelects(tree.find_all(id=["1", "3", "4"]),
                           ["1", "3"])

    def test_find_all_with_regular_expression_attribute_value(self):
        # You can pass a regular expression as an attribute value, and
        # you'll get tags whose values for that attribute match the
        # regular expression.
        tree = self.soup("""<a id="a">One a.</a>
                            <a id="aa">Two as.</a>
                            <a id="ab">Mixed as and bs.</a>
                            <a id="b">One b.</a>
                            <a>No ID.</a>""")

        self.assertSelects(tree.find_all(id=re.compile("^a+$")),
                           ["One a.", "Two as."])

    def test_find_by_name_and_containing_string(self):
        soup = self.soup("<b>foo</b><b>bar</b><a>foo</a>")
        a = soup.a

        # The tag name and the contained string must both match: "bar"
        # only appears inside a <b> tag, so searching <a> tags for it
        # finds nothing.
        self.assertEqual([a], soup.find_all("a", text="foo"))
        self.assertEqual([], soup.find_all("a", text="bar"))

    def test_find_by_name_and_containing_string_when_string_is_buried(self):
        # The second <a> holds "foo" only inside nested tags, yet it
        # is still matched by the text criterion.
        soup = self.soup("<a>foo</a><a><b><c>foo</c></b></a>")
        self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo"))

    def test_find_by_attribute_and_containing_string(self):
        soup = self.soup('<b id="1">foo</b><a id="2">foo</a>')
        a = soup.a

        # The attribute value and the contained string must both match.
        self.assertEqual([a], soup.find_all(id=2, text="foo"))
        self.assertEqual([], soup.find_all(id=1, text="bar"))
398
399
400
401
class TestIndex(TreeTest):
    """Test Tag.index"""
    def test_index(self):
        # The markup deliberately repeats identical-looking children.
        tree = self.soup("""<div>
                            <a>Identical</a>
                            <b>Not identical</b>
                            <a>Identical</a>

                            <c><d>Identical with child</d></c>
                            <b>Also not identical</b>
                            <c><d>Identical with child</d></c>
                            </div>""")
        container = tree.div
        # index() must report each child's own position in .contents.
        for position, child in enumerate(container.contents):
            self.assertEqual(position, container.index(child))
        # Asking for something that is not a child raises ValueError.
        with self.assertRaises(ValueError):
            tree.index(1)
418
419
class TestParentOperations(TreeTest):
    """Test navigation and searching through an element's parents."""

    def setUp(self):
        super(TestParentOperations, self).setUp()
        markup = '''<ul id="empty"></ul>
                                 <ul id="top">
                                  <ul id="middle">
                                   <ul id="bottom">
                                    <b>Start here</b>
                                   </ul>
                                  </ul>'''
        self.tree = self.soup(markup)
        # Every test climbs upward from the innermost <b> tag.
        self.start = self.tree.b

    def test_parent(self):
        # Each .parent hop moves one <ul> outward.
        bottom = self.start.parent
        self.assertEqual(bottom['id'], 'bottom')
        middle = self.start.parent.parent
        self.assertEqual(middle['id'], 'middle')
        top = self.start.parent.parent.parent
        self.assertEqual(top['id'], 'top')

    def test_parent_of_top_tag_is_soup_object(self):
        first_top_level = self.tree.contents[0]
        self.assertEqual(first_top_level.parent, self.tree)

    def test_soup_object_has_no_parent(self):
        self.assertEqual(None, self.tree.parent)

    def test_find_parents(self):
        # Ancestors come back innermost first.
        all_lists = self.start.find_parents('ul')
        self.assertSelectsIDs(all_lists, ['bottom', 'middle', 'top'])

        only_middle = self.start.find_parents('ul', id="middle")
        self.assertSelectsIDs(only_middle, ['middle'])

    def test_find_parent(self):
        nearest = self.start.find_parent('ul')
        self.assertEqual(nearest['id'], 'bottom')

        outermost = self.start.find_parent('ul', id='top')
        self.assertEqual(outermost['id'], 'top')

    def test_parent_of_text_element(self):
        # A string's parent is the tag that contains it.
        string_node = self.tree.find(text="Start here")
        self.assertEqual(string_node.parent.name, 'b')

    def test_text_element_find_parent(self):
        string_node = self.tree.find(text="Start here")
        enclosing_list = string_node.find_parent('ul')
        self.assertEqual(enclosing_list['id'], 'bottom')

    def test_parent_generator(self):
        # Walk the .parents generator, keeping only ancestors that
        # carry an 'id' attribute.
        ancestor_ids = []
        for ancestor in self.start.parents:
            if ancestor is None or 'id' not in ancestor.attrs:
                continue
            ancestor_ids.append(ancestor['id'])
        self.assertEqual(ancestor_ids, ['bottom', 'middle', 'top'])
469
470
class ProximityTest(TreeTest):
    """Shared fixture for the next/previous element tests.

    Parses a small document with three <b> tags (ids "1", "2" and "3")
    and stores it on self.tree.
    """

    def setUp(self):
        # Name this class (not TreeTest) in super(), so the method lookup
        # starts right after ProximityTest in the MRO and a setUp()
        # defined on an intermediate base class is never skipped.
        super(ProximityTest, self).setUp()
        self.tree = self.soup(
            '<html id="start"><head></head><body><b id="1">One</b><b id="2">Two</b><b id="3">Three</b></body></html>')
477
478
class TestNextOperations(ProximityTest):
    """Test next_element navigation and the find_next family."""

    def setUp(self):
        super(TestNextOperations, self).setUp()
        # Every test moves forward from the first <b> tag.
        self.start = self.tree.b

    def test_next(self):
        # The element after a tag is its first child; the element
        # after that string is the following <b> tag.
        self.assertEqual(self.start.next_element, "One")
        self.assertEqual(self.start.next_element.next_element['id'], "2")

    def test_next_of_last_item_is_none(self):
        last = self.tree.find(text="Three")
        self.assertEqual(last.next_element, None)

    def test_next_of_root_is_none(self):
        # The document root is outside the next/previous chain.
        self.assertEqual(self.tree.next_element, None)

    def test_find_all_next(self):
        # Only elements after the starting tag are searched, so the
        # starting <b> itself is not among the results.
        self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"])
        self.assertSelects(self.start.find_all_next(id=3), ["Three"])

    def test_find_next(self):
        self.assertEqual(self.start.find_next('b')['id'], '2')
        self.assertEqual(self.start.find_next(text="Three"), "Three")

    def test_find_next_for_text_element(self):
        # The same searches work when starting from a string.
        text = self.tree.find(text="One")
        self.assertEqual(text.find_next("b").string, "Two")
        self.assertSelects(text.find_all_next("b"), ["Two", "Three"])

    def test_next_generator(self):
        start = self.tree.find(text="Two")
        successors = list(start.next_elements)
        # There are two successors: the final <b> tag and its text contents.
        tag, contents = successors
        self.assertEqual(tag['id'], '3')
        self.assertEqual(contents, "Three")
518
class TestPreviousOperations(ProximityTest):
    """Test previous_element navigation and the find_previous family."""

    def setUp(self):
        super(TestPreviousOperations, self).setUp()
        # Every test moves backward from the final string.
        self.end = self.tree.find(text="Three")

    def test_previous(self):
        enclosing_tag = self.end.previous_element
        self.assertEqual(enclosing_tag['id'], "3")
        two_back = self.end.previous_element.previous_element
        self.assertEqual(two_back, "Two")

    def test_previous_of_first_item_is_none(self):
        html_tag = self.tree.find('html')
        self.assertEqual(html_tag.previous_element, None)

    def test_previous_of_root_is_none(self):
        # The document root is outside the next/previous chain.
        # XXX This is broken! The check below is disabled, so this
        # test currently asserts nothing.
        #self.assertEqual(self.tree.previous_element, None)
        pass

    def test_find_all_previous(self):
        # The <b> tag containing the "Three" node is the predecessor
        # of the "Three" node itself, which is why "Three" shows up
        # here.
        earlier_b_tags = self.end.find_all_previous('b')
        self.assertSelects(earlier_b_tags, ["Three", "Two", "One"])

        first_only = self.end.find_all_previous(id=1)
        self.assertSelects(first_only, ["One"])

    def test_find_previous(self):
        nearest_b = self.end.find_previous('b')
        self.assertEqual(nearest_b['id'], '3')

        first_string = self.end.find_previous(text="One")
        self.assertEqual(first_string, "One")

    def test_find_previous_for_text_element(self):
        string_node = self.tree.find(text="Three")
        self.assertEqual(string_node.find_previous("b").string, "Three")

        earlier_b_tags = string_node.find_all_previous("b")
        self.assertSelects(earlier_b_tags, ["Three", "Two", "One"])

    def test_previous_generator(self):
        origin = self.tree.find(text="One")
        predecessors = list(origin.previous_elements)

        # There are four predecessors: the <b> tag containing "One"
        # the <body> tag, the <head> tag, and the <html> tag.
        b_tag, body_tag, head_tag, html_tag = predecessors
        self.assertEqual(b_tag['id'], '1')
        self.assertEqual(body_tag.name, "body")
        self.assertEqual(head_tag.name, "head")
        self.assertEqual(html_tag.name, "html")
568
569
class SiblingTest(TreeTest):
    """Shared fixture for the sibling-navigation tests."""

    def setUp(self):
        super(SiblingTest, self).setUp()
        pretty_markup = '''<html>
                    <span id="1">
                     <span id="1.1"></span>
                    </span>
                    <span id="2">
                     <span id="2.1"></span>
                    </span>
                    <span id="3">
                     <span id="3.1"></span>
                    </span>
                    <span id="4"></span>
                    </html>'''
        # The layout above is only for readability and would make the
        # tests more difficult, so each newline and the whitespace run
        # after it is removed before parsing.
        compact_markup = re.sub(r"\n\s*", "", pretty_markup)
        self.tree = self.soup(compact_markup)
590
591
class TestNextSibling(SiblingTest):
    """Test next_sibling navigation and the find_next_sibling family."""

    def setUp(self):
        super(TestNextSibling, self).setUp()
        # Every test moves forward from the first top-level <span>.
        self.start = self.tree.find(id="1")

    def test_next_sibling_of_root_is_none(self):
        self.assertEqual(self.tree.next_sibling, None)

    def test_next_sibling(self):
        second = self.start.next_sibling
        self.assertEqual(second['id'], '2')
        third = self.start.next_sibling.next_sibling
        self.assertEqual(third['id'], '3')

        # Note the difference between next_sibling and next_element.
        self.assertEqual(self.start.next_element['id'], '1.1')

    def test_next_sibling_may_not_exist(self):
        self.assertEqual(self.tree.html.next_sibling, None)

        # An only child has nothing after it ...
        only_child = self.tree.find(id="1.1")
        self.assertEqual(only_child.next_sibling, None)

        # ... and neither does the last child of its parent.
        final_span = self.tree.find(id="4")
        self.assertEqual(final_span.next_sibling, None)

    def test_find_next_sibling(self):
        following = self.start.find_next_sibling('span')
        self.assertEqual(following['id'], '2')

    def test_next_siblings(self):
        later_spans = self.start.find_next_siblings("span")
        self.assertSelectsIDs(later_spans, ['2', '3', '4'])

        third_only = self.start.find_next_siblings(id='3')
        self.assertSelectsIDs(third_only, ['3'])

    def test_next_sibling_for_text_element(self):
        # Strings take part in sibling navigation just like tags.
        soup = self.soup("Foo<b>bar</b>baz")
        first_string = soup.find(text="Foo")
        self.assertEqual(first_string.next_sibling.name, 'b')
        self.assertEqual(first_string.next_sibling.next_sibling, 'baz')

        self.assertSelects(first_string.find_next_siblings('b'), ['bar'])
        self.assertEqual(first_string.find_next_sibling(text="baz"), "baz")
        self.assertEqual(first_string.find_next_sibling(text="nonesuch"), None)
635
636
class TestPreviousSibling(SiblingTest):
    """Test previous_sibling navigation and the find_previous_sibling family."""

    def setUp(self):
        super(TestPreviousSibling, self).setUp()
        # Every test moves backward from the last top-level <span>.
        self.end = self.tree.find(id="4")

    def test_previous_sibling_of_root_is_none(self):
        self.assertEqual(self.tree.previous_sibling, None)

    def test_previous_sibling(self):
        third = self.end.previous_sibling
        self.assertEqual(third['id'], '3')
        second = self.end.previous_sibling.previous_sibling
        self.assertEqual(second['id'], '2')

        # Note the difference between previous_sibling and previous_element.
        self.assertEqual(self.end.previous_element['id'], '3.1')

    def test_previous_sibling_may_not_exist(self):
        self.assertEqual(self.tree.html.previous_sibling, None)

        # An only child has nothing before it ...
        only_child = self.tree.find(id="1.1")
        self.assertEqual(only_child.previous_sibling, None)

        # ... and neither does the first child of its parent.
        leading_span = self.tree.find(id="1")
        self.assertEqual(leading_span.previous_sibling, None)

    def test_find_previous_sibling(self):
        preceding = self.end.find_previous_sibling('span')
        self.assertEqual(preceding['id'], '3')

    def test_previous_siblings(self):
        # Siblings come back nearest first.
        earlier_spans = self.end.find_previous_siblings("span")
        self.assertSelectsIDs(earlier_spans, ['3', '2', '1'])

        first_only = self.end.find_previous_siblings(id='1')
        self.assertSelectsIDs(first_only, ['1'])

    def test_previous_sibling_for_text_element(self):
        # Strings take part in sibling navigation just like tags.
        soup = self.soup("Foo<b>bar</b>baz")
        last_string = soup.find(text="baz")
        self.assertEqual(last_string.previous_sibling.name, 'b')
        self.assertEqual(last_string.previous_sibling.previous_sibling, 'Foo')

        self.assertSelects(last_string.find_previous_siblings('b'), ['bar'])
        self.assertEqual(last_string.find_previous_sibling(text="Foo"), "Foo")
        self.assertEqual(last_string.find_previous_sibling(text="nonesuch"), None)
680
681
class TestTagCreation(SoupTest):
    """Test the ability to create new tags."""
    def test_new_tag(self):
        # new_tag() builds a detached Tag: the keyword arguments become
        # its attributes and it has no parent.
        soup = self.soup("")
        new_tag = soup.new_tag("foo", bar="baz")
        self.assertIsInstance(new_tag, Tag)
        self.assertEqual("foo", new_tag.name)
        self.assertEqual({"bar": "baz"}, new_tag.attrs)
        self.assertIsNone(new_tag.parent)

    def test_tag_inherits_self_closing_rules_from_builder(self):
        # The XML half only runs when an XML builder was found in the
        # registry at import time.
        if XML_BUILDER_PRESENT:
            xml_soup = BeautifulSoup("", "lxml-xml")
            xml_br = xml_soup.new_tag("br")
            xml_p = xml_soup.new_tag("p")

            # Both the <br> and <p> tag are empty-element, just because
            # they have no contents.
            self.assertEqual(b"<br/>", xml_br.encode())
            self.assertEqual(b"<p/>", xml_p.encode())

        html_soup = BeautifulSoup("", "html.parser")
        html_br = html_soup.new_tag("br")
        html_p = html_soup.new_tag("p")

        # The HTML builder uses HTML's rules about which tags are
        # empty-element tags, and the new tags reflect these rules.
        self.assertEqual(b"<br/>", html_br.encode())
        self.assertEqual(b"<p></p>", html_p.encode())

    def test_new_string_creates_navigablestring(self):
        # By default new_string() produces a NavigableString.
        soup = self.soup("")
        s = soup.new_string("foo")
        self.assertEqual("foo", s)
        self.assertIsInstance(s, NavigableString)

    def test_new_string_can_create_navigablestring_subclass(self):
        # A class passed as the second argument is used for the new
        # string instead.
        soup = self.soup("")
        s = soup.new_string("foo", Comment)
        self.assertEqual("foo", s)
        self.assertIsInstance(s, Comment)
723
class TestTreeModification(SoupTest):
    """Tests for operations that modify a parse tree in place.

    Covers attribute assignment, insert()/append(), insert_before()/
    insert_after(), replace_with(), wrap()/unwrap(), extract(), clear()
    and assignment to .string. Besides checking the rendered markup,
    many tests verify that the parent/sibling/element links are still
    consistent after the modification.
    """

    def test_attribute_modification(self):
        # Attributes can be changed, deleted and added with dict syntax.
        soup = self.soup('<a id="1"></a>')
        soup.a['id'] = 2
        self.assertEqual(soup.decode(), self.document_for('<a id="2"></a>'))
        del(soup.a['id'])
        self.assertEqual(soup.decode(), self.document_for('<a></a>'))
        soup.a['id2'] = 'foo'
        self.assertEqual(soup.decode(), self.document_for('<a id2="foo"></a>'))

    def test_new_tag_creation(self):
        # Tags built directly with the Tag constructor can be inserted
        # into an existing tree.
        builder = builder_registry.lookup('html')()
        soup = self.soup("<body></body>", builder=builder)
        a = Tag(soup, builder, 'a')
        ol = Tag(soup, builder, 'ol')
        a['href'] = 'http://foo.com/'
        soup.body.insert(0, a)
        soup.body.insert(1, ol)
        self.assertEqual(
            soup.body.encode(),
            b'<body><a href="http://foo.com/"></a><ol></ol></body>')

    def test_append_to_contents_moves_tag(self):
        doc = """<p id="1">Don't leave me <b>here</b>.</p>
                <p id="2">Don\'t leave!</p>"""
        soup = self.soup(doc)
        second_para = soup.find(id='2')
        bold = soup.b

        # Move the <b> tag to the end of the second paragraph.
        soup.find(id='2').append(soup.b)

        # The <b> tag is now a child of the second paragraph.
        self.assertEqual(bold.parent, second_para)

        self.assertEqual(
            soup.decode(), self.document_for(
                '<p id="1">Don\'t leave me .</p>\n'
                '<p id="2">Don\'t leave!<b>here</b></p>'))

    def test_replace_with_returns_thing_that_was_replaced(self):
        text = "<a></a><b><c></c></b>"
        soup = self.soup(text)
        a = soup.a
        new_a = a.replace_with(soup.c)
        self.assertEqual(a, new_a)

    def test_unwrap_returns_thing_that_was_replaced(self):
        text = "<a><b></b><c></c></a>"
        soup = self.soup(text)
        a = soup.a
        new_a = a.unwrap()
        self.assertEqual(a, new_a)

    def test_replace_with_and_unwrap_give_useful_exception_when_tag_has_no_parent(self):
        soup = self.soup("<a><b>Foo</b></a><c>Bar</c>")
        a = soup.a
        a.extract()
        # Once extracted, <a> has no parent, so there is no position in
        # a tree at which to unwrap or replace it.
        self.assertEqual(None, a.parent)
        self.assertRaises(ValueError, a.unwrap)
        self.assertRaises(ValueError, a.replace_with, soup.c)

    def test_replace_tag_with_itself(self):
        # Replacing a tag with itself leaves the document unchanged.
        text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
        soup = self.soup(text)
        c = soup.c
        soup.c.replace_with(c)
        self.assertEqual(soup.decode(), self.document_for(text))

    def test_replace_tag_with_its_parent_raises_exception(self):
        text = "<a><b></b></a>"
        soup = self.soup(text)
        self.assertRaises(ValueError, soup.b.replace_with, soup.a)

    def test_insert_tag_into_itself_raises_exception(self):
        text = "<a><b></b></a>"
        soup = self.soup(text)
        self.assertRaises(ValueError, soup.a.insert, 0, soup.a)

    def test_replace_with_maintains_next_element_throughout(self):
        soup = self.soup('<p><a>one</a><b>three</b></p>')
        a = soup.a
        # Make it so the <a> tag has two text children.
        a.insert(1, "two")

        # Now replace each one with the empty string.
        left, right = a.contents
        left.replace_with('')
        right.replace_with('')

        # The <b> tag is still connected to the tree.
        self.assertEqual("three", soup.b.string)

    def test_replace_final_node(self):
        # Replacing the very last node of the document keeps the
        # element links of the replacement consistent.
        soup = self.soup("<b>Argh!</b>")
        soup.find(text="Argh!").replace_with("Hooray!")
        new_text = soup.find(text="Hooray!")
        b = soup.b
        self.assertEqual(new_text.previous_element, b)
        self.assertEqual(new_text.parent, b)
        self.assertEqual(new_text.previous_element.next_element, new_text)
        self.assertEqual(new_text.next_element, None)

    def test_consecutive_text_nodes(self):
        # A builder should never create two consecutive text nodes,
        # but if you insert one next to another, Beautiful Soup will
        # handle it correctly.
        soup = self.soup("<a><b>Argh!</b><c></c></a>")
        soup.b.insert(1, "Hooray!")

        self.assertEqual(
            soup.decode(), self.document_for(
                "<a><b>Argh!Hooray!</b><c></c></a>"))

        new_text = soup.find(text="Hooray!")
        self.assertEqual(new_text.previous_element, "Argh!")
        self.assertEqual(new_text.previous_element.next_element, new_text)

        self.assertEqual(new_text.previous_sibling, "Argh!")
        self.assertEqual(new_text.previous_sibling.next_sibling, new_text)

        self.assertEqual(new_text.next_sibling, None)
        self.assertEqual(new_text.next_element, soup.c)

    def test_insert_string(self):
        soup = self.soup("<a></a>")
        soup.a.insert(0, "bar")
        soup.a.insert(0, "foo")
        # The strings were added to the tag.
        self.assertEqual(["foo", "bar"], soup.a.contents)
        # And they were converted to NavigableStrings.
        self.assertEqual(soup.a.contents[0].next_element, "bar")

    def test_insert_tag(self):
        builder = self.default_builder
        soup = self.soup(
            "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
        magic_tag = Tag(soup, builder, 'magictag')
        magic_tag.insert(0, "the")
        soup.a.insert(1, magic_tag)

        self.assertEqual(
            soup.decode(), self.document_for(
                "<a><b>Find</b><magictag>the</magictag><c>lady!</c><d></d></a>"))

        # Make sure all the relationships are hooked up correctly.
        b_tag = soup.b
        self.assertEqual(b_tag.next_sibling, magic_tag)
        self.assertEqual(magic_tag.previous_sibling, b_tag)

        find = b_tag.find(text="Find")
        self.assertEqual(find.next_element, magic_tag)
        self.assertEqual(magic_tag.previous_element, find)

        c_tag = soup.c
        self.assertEqual(magic_tag.next_sibling, c_tag)
        self.assertEqual(c_tag.previous_sibling, magic_tag)

        the = magic_tag.find(text="the")
        self.assertEqual(the.parent, magic_tag)
        self.assertEqual(the.next_element, c_tag)
        self.assertEqual(c_tag.previous_element, the)

    def test_append_child_thats_already_at_the_end(self):
        data = "<a><b></b></a>"
        soup = self.soup(data)
        soup.a.append(soup.b)
        self.assertEqual(data, soup.decode())

    def test_move_tag_to_beginning_of_parent(self):
        data = "<a><b></b><c></c><d></d></a>"
        soup = self.soup(data)
        soup.a.insert(0, soup.d)
        self.assertEqual("<a><d></d><b></b><c></c></a>", soup.decode())

    def test_insert_works_on_empty_element_tag(self):
        # This is a little strange, since most HTML parsers don't allow
        # markup like this to come through. But in general, we don't
        # know what the parser would or wouldn't have allowed, so
        # I'm letting this succeed for now.
        soup = self.soup("<br/>")
        soup.br.insert(1, "Contents")
        self.assertEqual(str(soup.br), "<br>Contents</br>")

    def test_insert_before(self):
        soup = self.soup("<a>foo</a><b>bar</b>")
        soup.b.insert_before("BAZ")
        soup.a.insert_before("QUUX")
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<a>foo</a>BAZ<b>bar</b>"))

        # Inserting a tag that is already in the tree moves it.
        soup.a.insert_before(soup.b)
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))

    def test_insert_after(self):
        soup = self.soup("<a>foo</a><b>bar</b>")
        soup.b.insert_after("BAZ")
        soup.a.insert_after("QUUX")
        self.assertEqual(
            soup.decode(), self.document_for("<a>foo</a>QUUX<b>bar</b>BAZ"))
        # Inserting a tag that is already in the tree moves it.
        soup.b.insert_after(soup.a)
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))

    def test_insert_after_raises_exception_if_after_has_no_meaning(self):
        soup = self.soup("")
        tag = soup.new_tag("a")
        string = soup.new_string("")
        self.assertRaises(ValueError, string.insert_after, tag)
        self.assertRaises(NotImplementedError, soup.insert_after, tag)
        self.assertRaises(ValueError, tag.insert_after, tag)

    def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self):
        soup = self.soup("")
        tag = soup.new_tag("a")
        string = soup.new_string("")
        self.assertRaises(ValueError, string.insert_before, tag)
        self.assertRaises(NotImplementedError, soup.insert_before, tag)
        self.assertRaises(ValueError, tag.insert_before, tag)

    def test_replace_with(self):
        soup = self.soup(
            "<p>There's <b>no</b> business like <b>show</b> business</p>")
        no, show = soup.find_all('b')
        show.replace_with(no)
        self.assertEqual(
            soup.decode(),
            self.document_for(
                "<p>There's  business like <b>no</b> business</p>"))

        self.assertEqual(show.parent, None)
        self.assertEqual(no.parent, soup.p)
        self.assertEqual(no.next_element, "no")
        self.assertEqual(no.next_sibling, " business")

    def test_replace_first_child(self):
        data = "<a><b></b><c></c></a>"
        soup = self.soup(data)
        soup.b.replace_with(soup.c)
        self.assertEqual("<a><c></c></a>", soup.decode())

    def test_replace_last_child(self):
        data = "<a><b></b><c></c></a>"
        soup = self.soup(data)
        soup.c.replace_with(soup.b)
        self.assertEqual("<a><b></b></a>", soup.decode())

    def test_nested_tag_replace_with(self):
        soup = self.soup(
            """<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")

        # Replace the entire <b> tag and its contents ("reserve the
        # right") with the <f> tag ("refuse").
        remove_tag = soup.b
        move_tag = soup.f
        remove_tag.replace_with(move_tag)

        self.assertEqual(
            soup.decode(), self.document_for(
                "<a>We<f>refuse</f></a><e>to<g>service</g></e>"))

        # The <b> tag is now an orphan.
        self.assertEqual(remove_tag.parent, None)
        self.assertEqual(remove_tag.find(text="right").next_element, None)
        self.assertEqual(remove_tag.previous_element, None)
        self.assertEqual(remove_tag.next_sibling, None)
        self.assertEqual(remove_tag.previous_sibling, None)

        # The <f> tag is now connected to the <a> tag.
        self.assertEqual(move_tag.parent, soup.a)
        self.assertEqual(move_tag.previous_element, "We")
        self.assertEqual(move_tag.next_element.next_element, soup.e)
        self.assertEqual(move_tag.next_sibling, None)

        # The gap where the <f> tag used to be has been mended, and
        # the word "to" is now connected to the <g> tag.
        to_text = soup.find(text="to")
        g_tag = soup.g
        self.assertEqual(to_text.next_element, g_tag)
        self.assertEqual(to_text.next_sibling, g_tag)
        self.assertEqual(g_tag.previous_element, to_text)
        self.assertEqual(g_tag.previous_sibling, to_text)

    def test_unwrap(self):
        # unwrap() removes the tag but leaves its contents in place.
        tree = self.soup("""
            <p>Unneeded <em>formatting</em> is unneeded</p>
            """)
        tree.em.unwrap()
        self.assertEqual(tree.em, None)
        self.assertEqual(tree.p.text, "Unneeded formatting is unneeded")

    def test_wrap(self):
        # wrap() returns the wrapper, which takes the element's place
        # in the tree.
        soup = self.soup("I wish I was bold.")
        value = soup.string.wrap(soup.new_tag("b"))
        self.assertEqual(value.decode(), "<b>I wish I was bold.</b>")
        self.assertEqual(
            soup.decode(), self.document_for("<b>I wish I was bold.</b>"))

    def test_wrap_extracts_tag_from_elsewhere(self):
        soup = self.soup("<b></b>I wish I was bold.")
        soup.b.next_sibling.wrap(soup.b)
        self.assertEqual(
            soup.decode(), self.document_for("<b>I wish I was bold.</b>"))

    def test_wrap_puts_new_contents_at_the_end(self):
        soup = self.soup("<b>I like being bold.</b>I wish I was bold.")
        soup.b.next_sibling.wrap(soup.b)
        self.assertEqual(2, len(soup.b.contents))
        self.assertEqual(
            soup.decode(), self.document_for(
                "<b>I like being bold.I wish I was bold.</b>"))

    def test_extract(self):
        soup = self.soup(
            '<html><body>Some content. <div id="nav">Nav crap</div> More content.</body></html>')

        self.assertEqual(len(soup.body.contents), 3)
        extracted = soup.find(id="nav").extract()

        self.assertEqual(
            soup.decode(), "<html><body>Some content.  More content.</body></html>")
        self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')

        # The extracted tag is now an orphan.
        self.assertEqual(len(soup.body.contents), 2)
        self.assertEqual(extracted.parent, None)
        self.assertEqual(extracted.previous_element, None)
        self.assertEqual(extracted.next_element.next_element, None)

        # The gap where the extracted tag used to be has been mended.
        content_1 = soup.find(text="Some content. ")
        content_2 = soup.find(text=" More content.")
        self.assertEqual(content_1.next_element, content_2)
        self.assertEqual(content_1.next_sibling, content_2)
        self.assertEqual(content_2.previous_element, content_1)
        self.assertEqual(content_2.previous_sibling, content_1)

    def test_extract_distinguishes_between_identical_strings(self):
        soup = self.soup("<a>foo</a><b>bar</b>")
        foo_1 = soup.a.string
        bar_1 = soup.b.string
        foo_2 = soup.new_string("foo")
        bar_2 = soup.new_string("bar")
        soup.a.append(foo_2)
        soup.b.append(bar_2)

        # Now there are two identical strings in the <a> tag, and two
        # in the <b> tag. Let's remove the first "foo" and the second
        # "bar".
        foo_1.extract()
        bar_2.extract()
        self.assertEqual(foo_2, soup.a.string)
        self.assertEqual(bar_2, soup.b.string)

    def test_extract_multiples_of_same_tag(self):
        soup = self.soup("""
<html>
<head>
<script>foo</script>
</head>
<body>
 <script>bar</script>
 <a></a>
</body>
<script>baz</script>
</html>""")
        # Extract the first remaining <script> tag once for every
        # <script> tag that was originally in the document. The list
        # of tags is computed once, before anything is extracted.
        for _ in soup.find_all("script"):
            soup.script.extract()
        self.assertEqual("<body>\n\n<a></a>\n</body>", str(soup.body))

    def test_extract_works_when_element_is_surrounded_by_identical_strings(self):
        soup = self.soup(
            '<html>\n'
            '<body>hi</body>\n'
            '</html>')
        soup.find('body').extract()
        self.assertEqual(None, soup.find('body'))

    def test_clear(self):
        """Tag.clear()"""
        soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>")
        # clear using extract()
        a = soup.a
        soup.p.clear()
        self.assertEqual(len(soup.p.contents), 0)
        self.assertTrue(hasattr(a, "contents"))

        # clear using decompose()
        em = a.em
        a.clear(decompose=True)
        self.assertEqual(0, len(em.contents))

    def test_string_set(self):
        """Tag.string = 'string'"""
        soup = self.soup("<a></a> <b><c></c></b>")
        soup.a.string = "foo"
        self.assertEqual(soup.a.contents, ["foo"])
        # Setting .string replaces any existing children.
        soup.b.string = "bar"
        self.assertEqual(soup.b.contents, ["bar"])

    def test_string_set_does_not_affect_original_string(self):
        soup = self.soup("<a><b>foo</b><c>bar</c>")
        soup.b.string = soup.c.string
        self.assertEqual(soup.a.encode(), b"<a><b>bar</b><c>bar</c></a>")

    def test_set_string_preserves_class_of_string(self):
        soup = self.soup("<a></a>")
        cdata = CData("foo")
        soup.a.string = cdata
        self.assertTrue(isinstance(soup.a.string, CData))
1138
class TestElementObjects(SoupTest):
    """Test various features of element objects."""

    def test_len(self):
        """The length of an element is its number of children."""
        soup = self.soup("<top>1<b>2</b>3</top>")

        # The BeautifulSoup object itself contains one element: the
        # <top> tag.
        self.assertEqual(len(soup.contents), 1)
        self.assertEqual(len(soup), 1)

        # The <top> tag contains three elements: the text node "1", the
        # <b> tag, and the text node "3".
        self.assertEqual(len(soup.top), 3)
        self.assertEqual(len(soup.top.contents), 3)

    def test_member_access_invokes_find(self):
        """Accessing a Python member .foo invokes find('foo')"""
        soup = self.soup('<b><i></i></b>')
        self.assertEqual(soup.b, soup.find('b'))
        self.assertEqual(soup.b.i, soup.find('b').find('i'))
        self.assertEqual(soup.a, None)

    def test_deprecated_member_access(self):
        """The old-style .fooTag accessor still works but issues a warning."""
        soup = self.soup('<b><i></i></b>')
        with warnings.catch_warnings(record=True) as w:
            # Record the warning regardless of the warning filters that
            # happen to be active, so that w[0] below is always present.
            warnings.simplefilter("always")
            tag = soup.bTag
        self.assertEqual(soup.b, tag)
        self.assertEqual(
            '.bTag is deprecated, use .find("b") instead.',
            str(w[0].message))

    def test_has_attr(self):
        """has_attr() checks for the presence of an attribute.

        Please note: has_attr() is different from the `in` operator
        (__contains__). has_attr() checks the tag's attributes and
        `in` checks the tag's children.
        """
        soup = self.soup("<foo attr='bar'>")
        self.assertTrue(soup.foo.has_attr('attr'))
        self.assertFalse(soup.foo.has_attr('attr2'))

    def test_attributes_come_out_in_alphabetical_order(self):
        markup = '<b a="1" z="5" m="3" f="2" y="4"></b>'
        self.assertSoupEquals(markup, '<b a="1" f="2" m="3" y="4" z="5"></b>')

    def test_string(self):
        # A tag that contains only a text node makes that node
        # available as .string.
        soup = self.soup("<b>foo</b>")
        self.assertEqual(soup.b.string, 'foo')

    def test_empty_tag_has_no_string(self):
        # A tag with no children has no .string.
        soup = self.soup("<b></b>")
        self.assertEqual(soup.b.string, None)

    def test_tag_with_multiple_children_has_no_string(self):
        # A tag with more than one child has no .string. The tag under
        # test is <a>, which has several children; the empty <b> tags
        # are covered by test_empty_tag_has_no_string.
        soup = self.soup("<a>foo<b></b><b></b></b>")
        self.assertEqual(soup.a.string, None)

        soup = self.soup("<a>foo<b></b>bar</b>")
        self.assertEqual(soup.a.string, None)

        # Even if all the children are strings, due to trickery,
        # it won't work--but this would be a good optimization.
        soup = self.soup("<a>foo</b>")
        soup.a.insert(1, "bar")
        self.assertEqual(soup.a.string, None)

    def test_tag_with_recursive_string_has_string(self):
        # A tag with a single child which has a .string inherits that
        # .string.
        soup = self.soup("<a><b>foo</b></a>")
        self.assertEqual(soup.a.string, "foo")
        self.assertEqual(soup.string, "foo")

    def test_lack_of_string(self):
        """Only a tag containing a single text node has a .string."""
        soup = self.soup("<b>f<i>e</i>o</b>")
        self.assertFalse(soup.b.string)

        soup = self.soup("<b></b>")
        self.assertFalse(soup.b.string)

    def test_all_text(self):
        """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated"""
        soup = self.soup("<a>a<b>r</b>   <r> t </r></a>")
        self.assertEqual(soup.a.text, "ar  t ")
        self.assertEqual(soup.a.get_text(strip=True), "art")
        self.assertEqual(soup.a.get_text(","), "a,r, , t ")
        self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")

    def test_get_text_ignores_comments(self):
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(soup.get_text(), "foobar")

        # Comments are included only when the caller asks for them,
        # either by type or by disabling type filtering altogether.
        self.assertEqual(
            soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar")
        self.assertEqual(
            soup.get_text(types=None), "fooIGNOREbar")

    def test_all_strings_ignores_comments(self):
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(['foo', 'bar'], list(soup.strings))
1248
class TestCDAtaListAttributes(SoupTest):
    """Tests for cdata-list attributes such as 'class'.

    The value of such an attribute is exposed as a list of strings
    rather than as a single string.
    """

    def test_single_value_becomes_list(self):
        # Even a lone value is wrapped in a list.
        tag = self.soup("<a class='foo'>").a
        self.assertEqual(["foo"], tag['class'])

    def test_multiple_values_becomes_list(self):
        tag = self.soup("<a class='foo bar'>").a
        self.assertEqual(["foo", "bar"], tag['class'])

    def test_multiple_values_separated_by_weird_whitespace(self):
        # Tabs and newlines separate values just like spaces do.
        tag = self.soup("<a class='foo\tbar\nbaz'>").a
        self.assertEqual(["foo", "bar", "baz"], tag['class'])

    def test_attributes_joined_into_string_on_output(self):
        # On output the values are joined with a single space.
        tag = self.soup("<a class='foo\tbar'>").a
        self.assertEqual(b'<a class="foo bar"></a>', tag.encode())

    def test_accept_charset(self):
        form = self.soup('<form accept-charset="ISO-8859-1 UTF-8">').form
        self.assertEqual(['ISO-8859-1', 'UTF-8'], form['accept-charset'])

    def test_cdata_attribute_applying_only_to_one_tag(self):
        # accept-charset is treated as a cdata-list attribute on the
        # <form> tag (see test_accept_charset), but on any other tag
        # its value stays a plain string.
        markup = '<a accept-charset="ISO-8859-1 UTF-8"></a>'
        tag = self.soup(markup).a
        self.assertEqual('ISO-8859-1 UTF-8', tag['accept-charset'])

    def test_string_has_immutable_name_property(self):
        # A string's .name is always None and cannot be assigned to.
        text_node = self.soup("s").string
        self.assertEqual(None, text_node.name)
        with self.assertRaises(AttributeError):
            text_node.name = 'foo'
1287
class TestPersistence(SoupTest):
    """Tests of pickling and copying parse trees and their elements."""

    def setUp(self):
        # Parse a small but complete HTML page once per test; the
        # pickle and deepcopy identity tests operate on self.tree.
        super(TestPersistence, self).setUp()
        self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/transitional.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
<link rev="made" href="mailto:leonardr@segfault.org">
<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
<meta name="author" content="Leonard Richardson">
</head>
<body>
<a href="foo">foo</a>
<a href="foo"><b>bar</b></a>
</body>
</html>"""
        self.tree = self.soup(self.page)

    def test_pickle_and_unpickle_identity(self):
        # A round trip through pickle (protocol 2) gives back a
        # BeautifulSoup object that renders the same as the original.
        restored = pickle.loads(pickle.dumps(self.tree, 2))
        self.assertEqual(restored.__class__, BeautifulSoup)
        self.assertEqual(restored.decode(), self.tree.decode())

    def test_deepcopy_identity(self):
        # A deep copy renders the same as the tree it was copied from.
        duplicate = copy.deepcopy(self.tree)
        self.assertEqual(duplicate.decode(), self.tree.decode())

    def test_unicode_pickle(self):
        # Non-ASCII content survives a pickle round trip.
        soup = self.soup("<b>\N{SNOWMAN}</b>")
        pickled = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
        restored = pickle.loads(pickled)
        self.assertEqual(restored.decode(), soup.decode())

    def test_copy_navigablestring_is_not_attached_to_tree(self):
        soup = self.soup("<b>Foo<a></a></b><b>Bar</b>")
        original = soup.find(string="Foo")
        duplicate = copy.copy(original)

        # The copy compares equal to the original...
        self.assertEqual(original, duplicate)

        # ...but, unlike the original, has no links into the tree.
        self.assertEqual(None, duplicate.parent)
        self.assertEqual(None, duplicate.next_element)
        self.assertNotEqual(None, original.next_sibling)
        self.assertEqual(None, duplicate.next_sibling)
        self.assertEqual(None, duplicate.previous_element)

    def test_copy_navigablestring_subclass_has_same_type(self):
        # Copying a Comment gives a Comment, not a plain string.
        soup = self.soup("<b><!--Foo--></b>")
        original = soup.string
        duplicate = copy.copy(original)
        self.assertEqual(original, duplicate)
        self.assertTrue(isinstance(duplicate, Comment))

    def test_copy_entire_soup(self):
        # A shallow copy of the whole document compares equal to it.
        soup = self.soup("<div><b>Foo<a></a></b><b>Bar</b></div>end")
        duplicate = copy.copy(soup)
        self.assertEqual(soup, duplicate)

    def test_copy_tag_copies_contents(self):
        soup = self.soup("<div><b>Foo<a></a></b><b>Bar</b></div>end")
        div = soup.div
        div_copy = copy.copy(div)

        # Original and copy render identically and compare equal.
        self.assertEqual(str(div), str(div_copy))
        self.assertEqual(div, div_copy)

        # They are nevertheless two distinct objects.
        self.assertIsNot(div, div_copy)

        # The copy stands alone: it has no parent and no element links
        # leading out of it, whereas the original is still part of the
        # parse tree.
        self.assertEqual(None, div_copy.parent)
        self.assertEqual(None, div_copy.previous_element)
        self.assertEqual(None, div_copy.find(string='Bar').next_element)
        self.assertNotEqual(None, div.find(string='Bar').next_element)
1377
class TestSubstitutions(SoupTest):
    """Tests of the substitutions made when a tree is turned into markup.

    Covers the output formatters ("minimal", "html", None and custom
    callables), prettify(), and the rewriting of the charset in a
    <meta> tag when a document is encoded.
    """

    def test_default_formatter_is_minimal(self):
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        # No formatter is passed here on purpose: this test checks what
        # decode() does by default. The explicit formatter="minimal"
        # case is covered by test_formatter_minimal.
        decoded = soup.decode()
        # The < is converted back into &lt; but the e-with-acute is left alone.
        self.assertEqual(
            decoded,
            self.document_for(
                "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))

    def test_formatter_html(self):
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter="html")
        # Both the angle brackets and the e-with-acute become entities.
        self.assertEqual(
            decoded,
            self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))

    def test_formatter_minimal(self):
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter="minimal")
        # The < is converted back into &lt; but the e-with-acute is left alone.
        self.assertEqual(
            decoded,
            self.document_for(
                "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))

    def test_formatter_null(self):
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter=None)
        # Neither the angle brackets nor the e-with-acute are converted.
        # This is not valid HTML, but it's what the user wanted.
        self.assertEqual(
            decoded,
            self.document_for(
                "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))

    def test_formatter_custom(self):
        markup = "<b>&lt;foo&gt;</b><b>bar</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter=lambda x: x.upper())
        # Instead of normal entity conversion code, the custom
        # callable is called on every string.
        self.assertEqual(
            decoded,
            self.document_for("<b><FOO></b><b>BAR</b>"))

    def test_formatter_is_run_on_attribute_values(self):
        markup = '<a href="http://a.com?a=b&c=é">e</a>'
        soup = self.soup(markup)
        a = soup.a

        # "minimal" is also what decode() uses when no formatter is given.
        expect_minimal = '<a href="http://a.com?a=b&amp;c=é">e</a>'

        self.assertEqual(expect_minimal, a.decode())
        self.assertEqual(expect_minimal, a.decode(formatter="minimal"))

        expect_html = '<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
        self.assertEqual(expect_html, a.decode(formatter="html"))

        self.assertEqual(markup, a.decode(formatter=None))
        expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>'
        self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))

    def test_formatter_skips_script_tag_for_html_documents(self):
        # The contents of a <script> tag are output without entity
        # substitution.
        doc = """
  <script type="text/javascript">
   console.log("< < hey > > ");
  </script>
"""
        encoded = BeautifulSoup(doc, 'html.parser').encode()
        self.assertTrue(b"< < hey > >" in encoded)

    def test_formatter_skips_style_tag_for_html_documents(self):
        # The contents of a <style> tag are output without entity
        # substitution.
        doc = """
  <style type="text/css">
   console.log("< < hey > > ");
  </style>
"""
        encoded = BeautifulSoup(doc, 'html.parser').encode()
        self.assertTrue(b"< < hey > >" in encoded)

    def test_prettify_leaves_preformatted_text_alone(self):
        soup = self.soup("<div>  foo  <pre>  \tbar\n  \n  </pre>  baz  ")
        # Everything outside the <pre> tag is reformatted, but everything
        # inside is left alone.
        self.assertEqual(
            '<div>\n foo\n <pre>  \tbar\n  \n  </pre>\n baz\n</div>',
            soup.div.prettify())

    def test_prettify_accepts_formatter(self):
        soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
        pretty = soup.prettify(formatter=lambda x: x.upper())
        self.assertTrue("FOO" in pretty)

    def test_prettify_outputs_unicode_by_default(self):
        soup = self.soup("<a></a>")
        self.assertEqual(str, type(soup.prettify()))

    def test_prettify_can_encode_data(self):
        soup = self.soup("<a></a>")
        self.assertEqual(bytes, type(soup.prettify("utf-8")))

    def test_html_entity_substitution_off_by_default(self):
        markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
        soup = self.soup(markup)
        encoded = soup.b.encode("utf-8")
        self.assertEqual(encoded, markup.encode('utf-8'))

    def test_encoding_substitution(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')
        soup = self.soup(meta_tag)

        # Parse the document, and the charset appears unchanged.
        self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis')

        # Encode the document into some encoding, and the encoding is
        # substituted into the meta tag.
        utf_8 = soup.encode("utf-8")
        self.assertTrue(b"charset=utf-8" in utf_8)

        euc_jp = soup.encode("euc_jp")
        self.assertTrue(b"charset=euc_jp" in euc_jp)

        shift_jis = soup.encode("shift-jis")
        self.assertTrue(b"charset=shift-jis" in shift_jis)

        utf_16_u = soup.encode("utf-16").decode("utf-16")
        self.assertTrue("charset=utf-16" in utf_16_u)

    def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self):
        markup = ('<head><meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/></head><pre>foo</pre>')

        # Beautiful Soup used to try to rewrite the meta tag even if the
        # meta tag got filtered out by the strainer. This test makes
        # sure that doesn't happen.
        strainer = SoupStrainer('pre')
        soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(soup.contents[0].name, 'pre')
1523
class TestEncoding(SoupTest):
    """Check that strings and tags can be turned into encoded bytes."""

    def test_unicode_string_can_be_encoded(self):
        """A string pulled out of the tree encodes like a normal string."""
        text = self.soup("<b>\N{SNOWMAN}</b>").b.string
        expected = "\N{SNOWMAN}".encode("utf-8")
        self.assertEqual(text.encode("utf-8"), expected)

    def test_tag_containing_unicode_string_can_be_encoded(self):
        """A tag holding non-ASCII text encodes to the original markup."""
        markup = "<b>\N{SNOWMAN}</b>"
        bold = self.soup(markup).b
        self.assertEqual(bold.encode("utf-8"), markup.encode("utf-8"))

    def test_encoding_substitutes_unrecognized_characters_by_default(self):
        """Text that ASCII can't represent becomes a numeric reference."""
        bold = self.soup("<b>\N{SNOWMAN}</b>").b
        self.assertEqual(bold.encode("ascii"), b"<b>&#9731;</b>")

    def test_encoding_can_be_made_strict(self):
        """errors="strict" makes an unencodable character raise an error."""
        soup = self.soup("<b>\N{SNOWMAN}</b>")
        with self.assertRaises(UnicodeEncodeError):
            soup.encode("ascii", errors="strict")

    def test_decode_contents(self):
        """decode_contents() gives a tag's contents as a Unicode string."""
        bold = self.soup("<b>\N{SNOWMAN}</b>").b
        self.assertEqual("\N{SNOWMAN}", bold.decode_contents())

    def test_encode_contents(self):
        """encode_contents() gives a tag's contents as encoded bytes."""
        bold = self.soup("<b>\N{SNOWMAN}</b>").b
        expected = "\N{SNOWMAN}".encode("utf8")
        self.assertEqual(expected, bold.encode_contents(encoding="utf8"))

    def test_deprecated_renderContents(self):
        """renderContents() with no arguments gives UTF-8 encoded bytes."""
        bold = self.soup("<b>\N{SNOWMAN}</b>").b
        expected = "\N{SNOWMAN}".encode("utf8")
        self.assertEqual(expected, bold.renderContents())

    def test_repr(self):
        """repr() of a soup is its markup."""
        markup = "<b>\N{SNOWMAN}</b>"
        soup = self.soup(markup)
        # When PY3K is set the markup is expected unchanged; otherwise
        # the snowman is expected as an escaped code point in a
        # bytestring.
        expected = markup if PY3K else b'<b>\\u2603</b>'
        self.assertEqual(expected, repr(soup))
1575
class TestNavigableStringSubclasses(SoupTest):
    """Tests of the CData, Doctype and Declaration string classes."""

    def test_cdata(self):
        """A manually inserted CData object is output as a CDATA section."""
        # None of the current builders turn CDATA sections into CData
        # objects, but you can create them manually.
        tree = self.soup("")
        tree.insert(1, CData("foo"))
        self.assertEqual(str(tree), "<![CDATA[foo]]>")
        self.assertEqual(tree.find(text="foo"), "foo")
        self.assertEqual(tree.contents[0], "foo")

    def test_cdata_is_never_formatted(self):
        """Text inside a CData object is passed into the formatter.

        But the return value is ignored.
        """
        self.count = 0

        def counting_formatter(*args):
            # Record the call, then hand back text that must not show
            # up in the encoded output.
            self.count = self.count + 1
            return "BITTER FAILURE"

        tree = self.soup("")
        tree.insert(1, CData("<><><>"))
        encoded = tree.encode(formatter=counting_formatter)
        self.assertEqual(b"<![CDATA[<><><>]]>", encoded)
        self.assertEqual(1, self.count)

    def test_doctype_ends_in_newline(self):
        """An encoded Doctype is followed by a newline."""
        # Unlike other NavigableString subclasses, a DOCTYPE always ends
        # in a newline.
        tree = self.soup("")
        tree.insert(1, Doctype("foo"))
        self.assertEqual(tree.encode(), b"<!DOCTYPE foo>\n")

    def test_declaration(self):
        """output_ready() wraps a Declaration's text in <? and ?>."""
        self.assertEqual("<?foo?>", Declaration("foo").output_ready())
1617
class TestSoupSelector(TreeTest):
    """Tests of select() and select_one(), which find tags by CSS selector.

    Every test runs against a tree built from the HTML fixture below,
    and most of them identify the tags they expect to match by the
    value of the 'id' attribute.
    """

    HTML = """
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<title>The title</title>
<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
</head>
<body>
<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag>
<div id="main" class="fancy">
<div id="inner">
<h1 id="header1">An H1</h1>
<p>Some text</p>
<p class="onep" id="p1">Some more text</p>
<h2 id="header2">An H2</h2>
<p class="class1 class2 class3" id="pmulti">Another</p>
<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
<h2 id="header3">Another H2</h2>
<a id="me" href="http://simonwillison.net/" rel="me">me</a>
<span class="s1">
<a href="#" id="s1a1">span1a1</a>
<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
<span class="span2">
<a href="#" id="s2a1">span2a1</a>
</span>
<span class="span3"></span>
<custom-dashed-tag class="dashed" id="dash2"/>
<div data-tag="dashedvalue" id="data1"/>
</span>
</div>
<x id="xid">
<z id="zida"/>
<z id="zidab"/>
<z id="zidac"/>
</x>
<y id="yid">
<z id="zidb"/>
</y>
<p lang="en" id="lang-en">English</p>
<p lang="en-gb" id="lang-en-gb">English UK</p>
<p lang="en-us" id="lang-en-us">English US</p>
<p lang="fr" id="lang-fr">French</p>
</div>

<div id="footer">
</div>
"""

    def setUp(self):
        """Parse the HTML fixture with html.parser.

        In this class self.soup is the parsed tree itself, whereas the
        other test classes in this file call self.soup(markup) to
        parse a document.
        """
        self.soup = BeautifulSoup(self.HTML, 'html.parser')

    def assertSelects(self, selector, expected_ids):
        """Assert that `selector` matches exactly the tags with these ids.

        Both lists of ids are sorted before they are compared, so the
        order in which select() returns the tags is not checked.

        :param selector: A CSS selector, passed to self.soup.select().
        :param expected_ids: The 'id' attribute values of the tags the
            selector is expected to match, in any order.
        """
        # Sort copies instead of calling list.sort(), so that the list
        # the caller passed in is left untouched.
        expected = sorted(expected_ids)
        actual = sorted(el['id'] for el in self.soup.select(selector))
        self.assertEqual(expected, actual,
            "Selector %s, expected [%s], got [%s]" % (
                selector, ', '.join(expected), ', '.join(actual)
            )
        )

    # Alias used by assertSelectMultiple.
    assertSelect = assertSelects

    def assertSelectMultiple(self, *tests):
        """Run assertSelect on each (selector, expected_ids) pair.

        :param tests: Any number of 2-tuples, each holding a CSS
            selector and the list of ids it is expected to match.
        """
        for selector, expected_ids in tests:
            self.assertSelect(selector, expected_ids)

    def test_one_tag_one(self):
        """A tag name that occurs once selects that one tag."""
        els = self.soup.select('title')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].name, 'title')
        self.assertEqual(els[0].contents, ['The title'])

    def test_one_tag_many(self):
        """A tag name that occurs several times selects all of them."""
        els = self.soup.select('div')
        self.assertEqual(len(els), 4)
        for div in els:
            self.assertEqual(div.name, 'div')

        # select_one() gives back a single tag: here, the first <div>
        # in the document.
        el = self.soup.select_one('div')
        self.assertEqual('main', el['id'])

    def test_select_one_returns_none_if_no_match(self):
        """select_one() returns None when nothing matches."""
        match = self.soup.select_one('nonexistenttag')
        self.assertIsNone(match)

    def test_tag_in_tag_one(self):
        """'div div' selects only <div> tags nested inside another <div>."""
        self.assertSelects('div div', ['inner', 'data1'])

    def test_tag_in_tag_many(self):
        """Every <div> is a descendant of both <html> and <body>."""
        for selector in ('html div', 'html body div', 'body div'):
            self.assertSelects(selector, ['data1', 'main', 'inner', 'footer'])

    def test_tag_no_match(self):
        """A tag name that isn't in the document selects nothing."""
        self.assertEqual(len(self.soup.select('del')), 0)

    def test_invalid_tag(self):
        """A selector token that can't be parsed raises ValueError."""
        self.assertRaises(ValueError, self.soup.select, 'tag%t')

    def test_select_dashed_tag_ids(self):
        """A tag name containing dashes can be used as a selector."""
        self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])

    def test_select_dashed_by_id(self):
        """A dashed tag name can be combined with an attribute selector."""
        dashed = self.soup.select('custom-dashed-tag[id="dash2"]')
        self.assertEqual(dashed[0].name, 'custom-dashed-tag')
        self.assertEqual(dashed[0]['id'], 'dash2')

    def test_dashed_tag_text(self):
        """A dashed tag name can follow the child combinator."""
        dashed = self.soup.select('body > custom-dashed-tag')
        self.assertEqual(dashed[0].text, 'Hello there.')

    def test_select_dashed_matches_find_all(self):
        """select() and find_all() agree about a dashed tag name."""
        self.assertEqual(
            self.soup.select('custom-dashed-tag'),
            self.soup.find_all('custom-dashed-tag'))

    def test_header_tags(self):
        """'h1' and 'h2' each select only tags with exactly that name."""
        self.assertSelectMultiple(
            ('h1', ['header1']),
            ('h2', ['header2', 'header3']),
        )

    def test_class_one(self):
        """A class selector works alone, after a tag name, and nested."""
        for selector in ('.onep', 'p.onep', 'html p.onep'):
            els = self.soup.select(selector)
            self.assertEqual(len(els), 1)
            self.assertEqual(els[0].name, 'p')
            self.assertEqual(els[0]['class'], ['onep'])

    def test_class_mismatched_tag(self):
        """A class on the wrong kind of tag selects nothing."""
        els = self.soup.select('div.onep')
        self.assertEqual(len(els), 0)

    def test_one_id(self):
        """An id selector works alone, after a tag name, and nested."""
        for selector in ('div#inner', '#inner', 'div div#inner'):
            self.assertSelects(selector, ['inner'])

    def test_bad_id(self):
        """An id that isn't in the document selects nothing."""
        els = self.soup.select('#doesnotexist')
        self.assertEqual(len(els), 0)

    def test_items_in_id(self):
        """A tag name after an id selects descendants of that tag."""
        els = self.soup.select('div#inner p')
        self.assertEqual(len(els), 3)
        for el in els:
            self.assertEqual(el.name, 'p')
        self.assertEqual(els[1]['class'], ['onep'])
        self.assertFalse(els[0].has_attr('class'))

    def test_a_bunch_of_emptys(self):
        """Selectors with one part that can't match select nothing."""
        for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
            self.assertEqual(len(self.soup.select(selector)), 0)

    def test_multi_class_support(self):
        """A tag with several classes is matched by any one of them."""
        for selector in ('.class1', 'p.class1', '.class2', 'p.class2',
            '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'):
            self.assertSelects(selector, ['pmulti'])

    def test_multi_class_selection(self):
        """Chained class selectors match in any order."""
        for selector in ('.class1.class3', '.class3.class2',
                         '.class1.class2.class3'):
            self.assertSelects(selector, ['pmulti'])

    def test_child_selector(self):
        """'>' selects direct children only."""
        self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
        self.assertSelects('.s1 > a span', ['s1a2s1'])

    def test_child_selector_id(self):
        """The child combinator can be followed by a tag with an id."""
        self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1'])

    def test_attribute_equals(self):
        """[attr="value"] matches on the exact attribute value."""
        self.assertSelectMultiple(
            ('p[class="onep"]', ['p1']),
            ('p[id="p1"]', ['p1']),
            ('[class="onep"]', ['p1']),
            ('[id="p1"]', ['p1']),
            ('link[rel="stylesheet"]', ['l1']),
            ('link[type="text/css"]', ['l1']),
            ('link[href="blah.css"]', ['l1']),
            ('link[href="no-blah.css"]', []),
            ('[rel="stylesheet"]', ['l1']),
            ('[type="text/css"]', ['l1']),
            ('[href="blah.css"]', ['l1']),
            ('[href="no-blah.css"]', []),
            ('p[href="no-blah.css"]', []),
        )

    def test_attribute_tilde(self):
        """[attr~="value"] matches one word of a space-separated value."""
        self.assertSelectMultiple(
            ('p[class~="class1"]', ['pmulti']),
            ('p[class~="class2"]', ['pmulti']),
            ('p[class~="class3"]', ['pmulti']),
            ('[class~="class1"]', ['pmulti']),
            ('[class~="class2"]', ['pmulti']),
            ('[class~="class3"]', ['pmulti']),
            ('a[rel~="friend"]', ['bob']),
            ('a[rel~="met"]', ['bob']),
            ('[rel~="friend"]', ['bob']),
            ('[rel~="met"]', ['bob']),
        )

    def test_attribute_startswith(self):
        """[attr^="value"] matches on the start of the attribute value."""
        self.assertSelectMultiple(
            ('[rel^="style"]', ['l1']),
            ('link[rel^="style"]', ['l1']),
            ('notlink[rel^="notstyle"]', []),
            ('[rel^="notstyle"]', []),
            ('link[rel^="notstyle"]', []),
            ('link[href^="bla"]', ['l1']),
            ('a[href^="http://"]', ['bob', 'me']),
            ('[href^="http://"]', ['bob', 'me']),
            ('[id^="p"]', ['pmulti', 'p1']),
            ('[id^="m"]', ['me', 'main']),
            ('div[id^="m"]', ['main']),
            ('a[id^="m"]', ['me']),
            ('div[data-tag^="dashed"]', ['data1'])
        )

    def test_attribute_endswith(self):
        """[attr$="value"] matches on the end of the attribute value."""
        self.assertSelectMultiple(
            ('[href$=".css"]', ['l1']),
            ('link[href$=".css"]', ['l1']),
            ('link[id$="1"]', ['l1']),
            ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1',
                           's1a2s1', 'dash1']),
            ('div[id$="1"]', ['data1']),
            ('[id$="noending"]', []),
        )

    def test_attribute_contains(self):
        """[attr*="value"] matches anywhere inside the attribute value."""
        self.assertSelectMultiple(
            # From test_attribute_startswith
            ('[rel*="style"]', ['l1']),
            ('link[rel*="style"]', ['l1']),
            ('notlink[rel*="notstyle"]', []),
            ('[rel*="notstyle"]', []),
            ('link[rel*="notstyle"]', []),
            ('link[href*="bla"]', ['l1']),
            ('[href*="http://"]', ['bob', 'me']),
            ('[id*="p"]', ['pmulti', 'p1']),
            ('div[id*="m"]', ['main']),
            ('a[id*="m"]', ['me']),
            # From test_attribute_endswith
            ('[href*=".css"]', ['l1']),
            ('link[href*=".css"]', ['l1']),
            ('link[id*="1"]', ['l1']),
            ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2',
                           's2a1', 's1a2s1', 'dash1']),
            ('div[id*="1"]', ['data1']),
            ('[id*="noending"]', []),
            # New for this test
            ('[href*="."]', ['bob', 'me', 'l1']),
            ('a[href*="."]', ['bob', 'me']),
            ('link[href*="."]', ['l1']),
            ('div[id*="n"]', ['main', 'inner']),
            ('div[id*="nn"]', ['inner']),
            ('div[data-tag*="edval"]', ['data1'])
        )

    def test_attribute_exact_or_hypen(self):
        """[attr|="value"] matches the value itself or 'value-' prefixes."""
        self.assertSelectMultiple(
            ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
            ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
            ('p[lang|="fr"]', ['lang-fr']),
            ('p[lang|="gb"]', []),
        )

    def test_attribute_exists(self):
        """[attr] matches any tag that has the attribute at all."""
        self.assertSelectMultiple(
            ('[rel]', ['l1', 'bob', 'me']),
            ('link[rel]', ['l1']),
            ('a[rel]', ['bob', 'me']),
            ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']),
            ('p[class]', ['p1', 'pmulti']),
            ('[blah]', []),
            ('p[blah]', []),
            ('div[data-tag]', ['data1'])
        )

    def test_unsupported_pseudoclass(self):
        """These two pseudoclass selectors raise NotImplementedError."""
        self.assertRaises(
            NotImplementedError, self.soup.select, "a:no-such-pseudoclass")

        self.assertRaises(
            NotImplementedError, self.soup.select, "a:nth-of-type(a)")

    def test_nth_of_type(self):
        """:nth-of-type(n) selects the nth tag of its type, counting from 1."""
        # Try to select first paragraph
        els = self.soup.select('div#inner p:nth-of-type(1)')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].string, 'Some text')

        # Try to select third paragraph
        els = self.soup.select('div#inner p:nth-of-type(3)')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].string, 'Another')

        # Try to select (non-existent!) fourth paragraph
        els = self.soup.select('div#inner p:nth-of-type(4)')
        self.assertEqual(len(els), 0)

        # Pass in an invalid value.
        self.assertRaises(
            ValueError, self.soup.select, 'div p:nth-of-type(0)')

    def test_nth_of_type_direct_descendant(self):
        """:nth-of-type() works after the child combinator."""
        els = self.soup.select('div#inner > p:nth-of-type(1)')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].string, 'Some text')

    def test_id_child_selector_nth_of_type(self):
        """:nth-of-type() works after an id and the child combinator."""
        self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])

    def test_select_on_element(self):
        """select() can be called on a tag as well as on the whole tree."""
        # Other tests operate on the tree; this operates on an element
        # within the tree.
        main = self.soup.find("div", id="main")
        selected = main.select("div")
        # The <div> tags inside <div id="main"> were selected. The
        # <div id="footer"> tag, which is outside it, was not.
        self.assertSelectsIDs(selected, ['inner', 'data1'])

    def test_overspecified_child_id(self):
        """An id selector still depends on the ancestor named before it."""
        self.assertSelects(".fancy #inner", ['inner'])
        self.assertSelects(".normal #inner", [])

    def test_adjacent_sibling_selector(self):
        """'+' selects the sibling that comes directly after a tag."""
        self.assertSelects('#p1 + h2', ['header2'])
        self.assertSelects('#p1 + h2 + p', ['pmulti'])
        self.assertSelects('#p1 + #header2 + .class1', ['pmulti'])
        self.assertEqual([], self.soup.select('#p1 + p'))

    def test_general_sibling_selector(self):
        """'~' selects any later sibling of a tag."""
        self.assertSelects('#p1 ~ h2', ['header2', 'header3'])
        self.assertSelects('#p1 ~ #header2', ['header2'])
        self.assertSelects('#p1 ~ h2 + a', ['me'])
        self.assertSelects('#p1 ~ h2 + [rel="me"]', ['me'])
        self.assertEqual([], self.soup.select('#inner ~ h2'))

    def test_dangling_combinator(self):
        """A combinator with nothing after it raises ValueError."""
        self.assertRaises(ValueError, self.soup.select, 'h1 >')

    def test_sibling_combinator_wont_select_same_tag_twice(self):
        """A tag that follows several matching siblings is listed once."""
        self.assertSelects(
            'p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])

    # Test the selector grouping operator (the comma)
    def test_multiple_select(self):
        """A comma-separated group selects the matches of every member."""
        self.assertSelects('x, y', ['xid', 'yid'])

    def test_multiple_select_with_no_space(self):
        """The comma doesn't need a space after it."""
        self.assertSelects('x,y', ['xid', 'yid'])

    def test_multiple_select_with_more_space(self):
        """The comma may be followed by several spaces."""
        self.assertSelects('x,    y', ['xid', 'yid'])

    def test_multiple_select_duplicated(self):
        """Listing the same selector twice selects its tag only once."""
        self.assertSelects('x, x', ['xid'])

    def test_multiple_select_sibling(self):
        """A group member can use the general sibling combinator."""
        self.assertSelects('x, y ~ p[lang=fr]', ['xid', 'lang-fr'])

    def test_multiple_select_tag_and_direct_descendant(self):
        """A group member can use the child combinator."""
        self.assertSelects('x, y > z', ['xid', 'zidb'])

    def test_multiple_select_direct_descendant_and_tags(self):
        """A child combinator applies only to its own group member."""
        self.assertSelects(
            'div > x, y, z',
            ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])

    def test_multiple_select_indirect_descendant(self):
        """A descendant selector applies only to its own group member."""
        self.assertSelects(
            'div x,y,  z',
            ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])

    def test_invalid_multiple_select(self):
        """An empty member in a selector group raises ValueError."""
        self.assertRaises(ValueError, self.soup.select, ',x, y')
        self.assertRaises(ValueError, self.soup.select, 'x,,y')

    def test_multiple_select_attrs(self):
        """Group members can be attribute selectors."""
        self.assertSelects(
            'p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])

    def test_multiple_select_ids(self):
        """'y > z[id=zida]' adds nothing: that <z> is not a child of <y>."""
        self.assertSelects(
            'x, y > z[id=zida], z[id=zidab], z[id=zidb]',
            ['xid', 'zidb', 'zidab'])

    def test_multiple_select_nested(self):
        """Every group member can be a chain of child combinators."""
        self.assertSelects('body > div > x, y > z', ['xid', 'zidb'])
2002
2003
2004
2005