# -*- coding: utf-8 -*-
"""Tests for Beautiful Soup's tree traversal methods.

The tree traversal methods are the main advantage of using Beautiful
Soup over just using a parser.

Different parsers will build different Beautiful Soup trees given the
same markup, but all Beautiful Soup trees can be traversed with the
methods tested here.
"""

import copy
import pickle
import re
import warnings
from bs4 import BeautifulSoup
from bs4.builder import builder_registry
from bs4.element import (
    PY3K,
    CData,
    Comment,
    Declaration,
    Doctype,
    NavigableString,
    SoupStrainer,
    Tag,
)
from bs4.testing import SoupTest

XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None)
LXML_PRESENT = (builder_registry.lookup("lxml") is not None)


class TreeTest(SoupTest):
    """Base class adding selection assertions shared by the tree tests."""

    def assertSelects(self, tags, should_match):
        """Make sure that the given tags have the correct text.

        This is used in tests that define a bunch of tags, each
        containing a single string, and then select certain strings by
        some mechanism.
        """
        self.assertEqual([tag.string for tag in tags], should_match)

    def assertSelectsIDs(self, tags, should_match):
        """Make sure that the given tags have the correct IDs.

        This is used in tests that define a bunch of tags, each
        carrying an 'id' attribute, and then select certain tags by
        some mechanism.
        """
        self.assertEqual([tag['id'] for tag in tags], should_match)


class TestFind(TreeTest):
    """Basic tests of the find() method.

    find() just calls find_all() with limit=1, so it's not tested all
    that thoroughly here.
    """

    def test_find_tag(self):
        soup = self.soup("<a>1</a><b>2</b><a>3</a><b>4</b>")
        self.assertEqual(soup.find("b").string, "2")

    def test_unicode_text_find(self):
        soup = self.soup('<h1>Räksmörgås</h1>')
        self.assertEqual(soup.find(string='Räksmörgås'), 'Räksmörgås')

    def test_unicode_attribute_find(self):
        soup = self.soup('<h1 id="Räksmörgås">here it is</h1>')
        # The result is discarded on purpose.
        # NOTE(review): presumably a smoke check that stringifying a
        # document with a non-ASCII attribute value does not raise --
        # confirm before removing.
        str(soup)
        self.assertEqual("here it is", soup.find(id='Räksmörgås').text)

    def test_find_everything(self):
        """Test an optimization that finds all tags."""
        soup = self.soup("<a>foo</a><b>bar</b>")
        self.assertEqual(2, len(soup.find_all()))

    def test_find_everything_with_name(self):
        """Test an optimization that finds all tags with a given name."""
        soup = self.soup("<a>foo</a><b>bar</b><a>baz</a>")
        self.assertEqual(2, len(soup.find_all('a')))


class TestFindAll(TreeTest):
    """Basic tests of the find_all() method."""

    def test_find_all_text_nodes(self):
        """You can search the tree for text nodes."""
        soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
        # Exact match.
        self.assertEqual(soup.find_all(string="bar"), ["bar"])
        self.assertEqual(soup.find_all(text="bar"), ["bar"])
        # Match any of a number of strings.
        self.assertEqual(
            soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"])
        # Match a regular expression.
        self.assertEqual(soup.find_all(text=re.compile('.*')),
                         ["Foo", "bar", '\xbb'])
        # Match anything.
        self.assertEqual(soup.find_all(text=True),
                         ["Foo", "bar", '\xbb'])

    def test_find_all_limit(self):
        """You can limit the number of items returned by find_all."""
        soup = self.soup("<a>1</a><a>2</a><a>3</a><a>4</a><a>5</a>")
        self.assertSelects(soup.find_all('a', limit=3), ["1", "2", "3"])
        self.assertSelects(soup.find_all('a', limit=1), ["1"])
        self.assertSelects(
            soup.find_all('a', limit=10), ["1", "2", "3", "4", "5"])

        # A limit of 0 means no limit.
        self.assertSelects(
            soup.find_all('a', limit=0), ["1", "2", "3", "4", "5"])

    def test_calling_a_tag_is_calling_findall(self):
        soup = self.soup("<a>1</a><b>2<a id='foo'>3</a></b>")
        self.assertSelects(soup('a', limit=1), ["1"])
        self.assertSelects(soup.b(id="foo"), ["3"])

    def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self):
        soup = self.soup("<a></a>")
        # Create a self-referential list.
        self_referential = []
        self_referential.append(self_referential)

        # Without special code in _normalize_search_value, this would cause infinite
        # recursion.
        self.assertEqual([], soup.find_all(self_referential))

    def test_find_all_resultset(self):
        """All find_all calls return a ResultSet"""
        soup = self.soup("<a></a>")
        result = soup.find_all("a")
        self.assertTrue(hasattr(result, "source"))

        result = soup.find_all(True)
        self.assertTrue(hasattr(result, "source"))

        result = soup.find_all(text="foo")
        self.assertTrue(hasattr(result, "source"))


class TestFindAllBasicNamespaces(TreeTest):
    """Test finding tags and attributes whose names contain a prefix."""

    def test_find_by_namespaced_name(self):
        soup = self.soup('<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">')
        self.assertEqual("4", soup.find("mathml:msqrt").string)
        self.assertEqual("a", soup.find(attrs= { "svg:fill" : "red" }).name)


class TestFindAllByName(TreeTest):
    """Test ways of finding tags by tag name."""

    def setUp(self):
        # Name this class (not TreeTest) so the MRO lookup starts from
        # the right place, consistent with the other fixtures here.
        super(TestFindAllByName, self).setUp()
        self.tree = self.soup("""<a>First tag.</a>
                                  <b>Second tag.</b>
                                  <c>Third <a>Nested tag.</a> tag.</c>""")

    def test_find_all_by_tag_name(self):
        # Find all the <a> tags.
        self.assertSelects(
            self.tree.find_all('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_name_and_text(self):
        self.assertSelects(
            self.tree.find_all('a', text='First tag.'), ['First tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=re.compile("tag")),
            ['First tag.', 'Nested tag.'])

    def test_find_all_on_non_root_element(self):
        # You can call find_all on any node, not just the root.
        self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.'])

    def test_calling_element_invokes_find_all(self):
        self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_strainer(self):
        self.assertSelects(
            self.tree.find_all(SoupStrainer('a')),
            ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_names(self):
        self.assertSelects(
            self.tree.find_all(['a', 'b']),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_dict(self):
        self.assertSelects(
            self.tree.find_all({'a' : True, 'b' : True}),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_re(self):
        self.assertSelects(
            self.tree.find_all(re.compile('^[ab]$')),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_with_tags_matching_method(self):
        # You can define an oracle method that determines whether
        # a tag matches the search.
        def id_matches_name(tag):
            return tag.name == tag.get('id')

        # NOTE(review): the last element is closed with </a> rather
        # than </b>; the markup is kept exactly as written.
        tree = self.soup("""<a id="a">Match 1.</a>
                            <a id="1">Does not match.</a>
                            <b id="b">Match 2.</a>""")

        self.assertSelects(
            tree.find_all(id_matches_name), ["Match 1.", "Match 2."])


class TestFindAllByAttribute(TreeTest):
    """Test ways of finding tags by attribute value."""

    def test_find_all_by_attribute_name(self):
        # You can pass in keyword arguments to find_all to search by
        # attribute.
        tree = self.soup("""
                         <a id="first">Matching a.</a>
                         <a id="second">
                          Non-matching <b id="first">Matching b.</b>a.
                         </a>""")
        self.assertSelects(tree.find_all(id='first'),
                           ["Matching a.", "Matching b."])

    def test_find_all_by_utf8_attribute_value(self):
        peace = "םולש".encode("utf8")
        data = '<a title="םולש"></a>'.encode("utf8")
        soup = self.soup(data)
        self.assertEqual([soup.a], soup.find_all(title=peace))
        self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
        self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"]))

    def test_find_all_by_attribute_dict(self):
        # You can pass in a dictionary as the argument 'attrs'. This
        # lets you search for attributes like 'name' (a fixed argument
        # to find_all) and 'class' (a reserved word in Python.)
        tree = self.soup("""
                         <a name="name1" class="class1">Name match.</a>
                         <a name="name2" class="class2">Class match.</a>
                         <a name="name3" class="class3">Non-match.</a>
                         <name1>A tag called 'name1'.</name1>
                         """)

        # This doesn't do what you want.
        self.assertSelects(tree.find_all(name='name1'),
                           ["A tag called 'name1'."])
        # This does what you want.
        self.assertSelects(tree.find_all(attrs={'name' : 'name1'}),
                           ["Name match."])

        self.assertSelects(tree.find_all(attrs={'class' : 'class2'}),
                           ["Class match."])

    def test_find_all_by_class(self):
        tree = self.soup("""
                         <a class="1">Class 1.</a>
                         <a class="2">Class 2.</a>
                         <b class="1">Class 1.</b>
                         <c class="3 4">Class 3 and 4.</c>
                         """)

        # Passing in the class_ keyword argument will search against
        # the 'class' attribute.
        self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.'])
        self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.'])
        self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.'])

        # Passing in a string to 'attrs' will also search the CSS class.
        self.assertSelects(tree.find_all('a', '1'), ['Class 1.'])
        self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.'])
        self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.'])
        self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.'])

    def test_find_by_class_when_multiple_classes_present(self):
        tree = self.soup("<gar class='foo bar'>Found it</gar>")

        f = tree.find_all("gar", class_=re.compile("o"))
        self.assertSelects(f, ["Found it"])

        f = tree.find_all("gar", class_=re.compile("a"))
        self.assertSelects(f, ["Found it"])

        # Since the class is not the string "foo bar", but the two
        # strings "foo" and "bar", this will not find anything.
        f = tree.find_all("gar", class_=re.compile("o b"))
        self.assertSelects(f, [])

    def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
        soup = self.soup("<a class='bar'>Found it</a>")

        self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"])

        def big_attribute_value(value):
            return len(value) > 3

        self.assertSelects(soup.find_all("a", big_attribute_value), [])

        def small_attribute_value(value):
            return len(value) <= 3

        self.assertSelects(
            soup.find_all("a", small_attribute_value), ["Found it"])

    def test_find_all_with_string_for_attrs_finds_multiple_classes(self):
        soup = self.soup('<a class="foo bar"></a><a class="foo"></a>')
        a, a2 = soup.find_all("a")
        self.assertEqual([a, a2], soup.find_all("a", "foo"))
        self.assertEqual([a], soup.find_all("a", "bar"))

        # If you specify the class as a string that contains a
        # space, only that specific value will be found.
        self.assertEqual([a], soup.find_all("a", class_="foo bar"))
        self.assertEqual([a], soup.find_all("a", "foo bar"))
        self.assertEqual([], soup.find_all("a", "bar foo"))

    def test_find_all_by_attribute_soupstrainer(self):
        tree = self.soup("""
                         <a id="first">Match.</a>
                         <a id="second">Non-match.</a>""")

        strainer = SoupStrainer(attrs={'id' : 'first'})
        self.assertSelects(tree.find_all(strainer), ['Match.'])

    def test_find_all_with_missing_atribute(self):
        # You can pass in None as the value of an attribute to find_all.
        # This will match tags that do not have that attribute set.
        tree = self.soup("""<a id="1">ID present.</a>
                            <a>No ID present.</a>
                            <a id="">ID is empty.</a>""")
        self.assertSelects(tree.find_all('a', id=None), ["No ID present."])

    def test_find_all_with_defined_attribute(self):
        # You can pass in True as the value of an attribute to find_all.
        # This will match tags that have that attribute set to any value.
        tree = self.soup("""<a id="1">ID present.</a>
                            <a>No ID present.</a>
                            <a id="">ID is empty.</a>""")
        self.assertSelects(
            tree.find_all(id=True), ["ID present.", "ID is empty."])

    def test_find_all_with_numeric_attribute(self):
        # If you search for a number, it's treated as a string.
        tree = self.soup("""<a id=1>Unquoted attribute.</a>
                            <a id="1">Quoted attribute.</a>""")

        expected = ["Unquoted attribute.", "Quoted attribute."]
        self.assertSelects(tree.find_all(id=1), expected)
        self.assertSelects(tree.find_all(id="1"), expected)

    def test_find_all_with_list_attribute_values(self):
        # You can pass a list of attribute values instead of just one,
        # and you'll get tags that match any of the values.
        tree = self.soup("""<a id="1">1</a>
                            <a id="2">2</a>
                            <a id="3">3</a>
                            <a>No ID.</a>""")
        self.assertSelects(tree.find_all(id=["1", "3", "4"]),
                           ["1", "3"])

    def test_find_all_with_regular_expression_attribute_value(self):
        # You can pass a regular expression as an attribute value, and
        # you'll get tags whose values for that attribute match the
        # regular expression.
        tree = self.soup("""<a id="a">One a.</a>
                            <a id="aa">Two as.</a>
                            <a id="ab">Mixed as and bs.</a>
                            <a id="b">One b.</a>
                            <a>No ID.</a>""")

        self.assertSelects(tree.find_all(id=re.compile("^a+$")),
                           ["One a.", "Two as."])

    def test_find_by_name_and_containing_string(self):
        soup = self.soup("<b>foo</b><b>bar</b><a>foo</a>")
        a = soup.a

        self.assertEqual([a], soup.find_all("a", text="foo"))
        self.assertEqual([], soup.find_all("a", text="bar"))

    def test_find_by_name_and_containing_string_when_string_is_buried(self):
        soup = self.soup("<a>foo</a><a><b><c>foo</c></b></a>")
        self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo"))

    def test_find_by_attribute_and_containing_string(self):
        soup = self.soup('<b id="1">foo</b><a id="2">foo</a>')
        a = soup.a

        self.assertEqual([a], soup.find_all(id=2, text="foo"))
        self.assertEqual([], soup.find_all(id=1, text="bar"))


class TestIndex(TreeTest):
    """Test Tag.index"""

    def test_index(self):
        tree = self.soup("""<div>
                            <a>Identical</a>
                            <b>Not identical</b>
                            <a>Identical</a>

                            <c><d>Identical with child</d></c>
                            <b>Also not identical</b>
                            <c><d>Identical with child</d></c>
                            </div>""")
        div = tree.div
        for i, element in enumerate(div.contents):
            self.assertEqual(i, div.index(element))
        self.assertRaises(ValueError, tree.index, 1)


class TestParentOperations(TreeTest):
    """Test navigation and searching through an element's parents."""

    def setUp(self):
        super(TestParentOperations, self).setUp()
        self.tree = self.soup('''<ul id="empty"></ul>
                                 <ul id="top">
                                  <ul id="middle">
                                   <ul id="bottom">
                                    <b>Start here</b>
                                   </ul>
                                  </ul>''')
        self.start = self.tree.b

    def test_parent(self):
        self.assertEqual(self.start.parent['id'], 'bottom')
        self.assertEqual(self.start.parent.parent['id'], 'middle')
        self.assertEqual(self.start.parent.parent.parent['id'], 'top')

    def test_parent_of_top_tag_is_soup_object(self):
        top_tag = self.tree.contents[0]
        self.assertEqual(top_tag.parent, self.tree)

    def test_soup_object_has_no_parent(self):
        self.assertEqual(None, self.tree.parent)

    def test_find_parents(self):
        self.assertSelectsIDs(
            self.start.find_parents('ul'), ['bottom', 'middle', 'top'])
        self.assertSelectsIDs(
            self.start.find_parents('ul', id="middle"), ['middle'])

    def test_find_parent(self):
        self.assertEqual(self.start.find_parent('ul')['id'], 'bottom')
        self.assertEqual(self.start.find_parent('ul', id='top')['id'], 'top')

    def test_parent_of_text_element(self):
        text = self.tree.find(text="Start here")
        self.assertEqual(text.parent.name, 'b')

    def test_text_element_find_parent(self):
        text = self.tree.find(text="Start here")
        self.assertEqual(text.find_parent('ul')['id'], 'bottom')

    def test_parent_generator(self):
        parents = [parent['id'] for parent in self.start.parents
                   if parent is not None and 'id' in parent.attrs]
        self.assertEqual(parents, ['bottom', 'middle', 'top'])


class ProximityTest(TreeTest):
    """Shared fixture for the next/previous element tests.

    Builds a document whose body holds three <b> tags with ids 1-3.
    """

    def setUp(self):
        # Name this class (not TreeTest) so the MRO lookup starts from
        # the right place, consistent with the other fixtures here.
        super(ProximityTest, self).setUp()
        self.tree = self.soup(
            '<html id="start"><head></head><body><b id="1">One</b><b id="2">Two</b><b id="3">Three</b></body></html>')


class TestNextOperations(ProximityTest):
    """Test next_element navigation and the find_next family."""

    def setUp(self):
        super(TestNextOperations, self).setUp()
        self.start = self.tree.b

    def test_next(self):
        self.assertEqual(self.start.next_element, "One")
        self.assertEqual(self.start.next_element.next_element['id'], "2")

    def test_next_of_last_item_is_none(self):
        last = self.tree.find(text="Three")
        self.assertEqual(last.next_element, None)

    def test_next_of_root_is_none(self):
        # The document root is outside the next/previous chain.
        self.assertEqual(self.tree.next_element, None)

    def test_find_all_next(self):
        self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"])
        self.assertSelects(self.start.find_all_next(id=3), ["Three"])

    def test_find_next(self):
        self.assertEqual(self.start.find_next('b')['id'], '2')
        self.assertEqual(self.start.find_next(text="Three"), "Three")

    def test_find_next_for_text_element(self):
        text = self.tree.find(text="One")
        self.assertEqual(text.find_next("b").string, "Two")
        self.assertSelects(text.find_all_next("b"), ["Two", "Three"])

    def test_next_generator(self):
        start = self.tree.find(text="Two")
        successors = [node for node in start.next_elements]
        # There are two successors: the final <b> tag and its text contents.
        tag, contents = successors
        self.assertEqual(tag['id'], '3')
        self.assertEqual(contents, "Three")


class TestPreviousOperations(ProximityTest):
    """Test previous_element navigation and the find_previous family."""

    def setUp(self):
        super(TestPreviousOperations, self).setUp()
        self.end = self.tree.find(text="Three")

    def test_previous(self):
        self.assertEqual(self.end.previous_element['id'], "3")
        self.assertEqual(self.end.previous_element.previous_element, "Two")

    def test_previous_of_first_item_is_none(self):
        first = self.tree.find('html')
        self.assertEqual(first.previous_element, None)

    def test_previous_of_root_is_none(self):
        # The document root is outside the next/previous chain.
        # XXX This is broken!
        #self.assertEqual(self.tree.previous_element, None)
        pass

    def test_find_all_previous(self):
        # The <b> tag containing the "Three" node is the predecessor
        # of the "Three" node itself, which is why "Three" shows up
        # here.
        self.assertSelects(
            self.end.find_all_previous('b'), ["Three", "Two", "One"])
        self.assertSelects(self.end.find_all_previous(id=1), ["One"])

    def test_find_previous(self):
        self.assertEqual(self.end.find_previous('b')['id'], '3')
        self.assertEqual(self.end.find_previous(text="One"), "One")

    def test_find_previous_for_text_element(self):
        text = self.tree.find(text="Three")
        self.assertEqual(text.find_previous("b").string, "Three")
        self.assertSelects(
            text.find_all_previous("b"), ["Three", "Two", "One"])

    def test_previous_generator(self):
        start = self.tree.find(text="One")
        predecessors = [node for node in start.previous_elements]

        # There are four predecessors: the <b> tag containing "One"
        # the <body> tag, the <head> tag, and the <html> tag.
        b, body, head, html = predecessors
        self.assertEqual(b['id'], '1')
        self.assertEqual(body.name, "body")
        self.assertEqual(head.name, "head")
        self.assertEqual(html.name, "html")


class SiblingTest(TreeTest):
    """Shared fixture for the sibling tests.

    Builds four top-level <span> tags (ids 1-4); the first three each
    contain one nested <span>.
    """

    def setUp(self):
        super(SiblingTest, self).setUp()
        markup = '''<html>
                    <span id="1">
                     <span id="1.1"></span>
                    </span>
                    <span id="2">
                     <span id="2.1"></span>
                    </span>
                    <span id="3">
                     <span id="3.1"></span>
                    </span>
                    <span id="4"></span>
                    </html>'''
        # All that whitespace looks good but makes the tests more
        # difficult. Get rid of it.
        markup = re.compile(r"\n\s*").sub("", markup)
        self.tree = self.soup(markup)


class TestNextSibling(SiblingTest):
    """Test next_sibling navigation and the find_next_sibling family."""

    def setUp(self):
        super(TestNextSibling, self).setUp()
        self.start = self.tree.find(id="1")

    def test_next_sibling_of_root_is_none(self):
        self.assertEqual(self.tree.next_sibling, None)

    def test_next_sibling(self):
        self.assertEqual(self.start.next_sibling['id'], '2')
        self.assertEqual(self.start.next_sibling.next_sibling['id'], '3')

        # Note the difference between next_sibling and next_element.
        self.assertEqual(self.start.next_element['id'], '1.1')

    def test_next_sibling_may_not_exist(self):
        self.assertEqual(self.tree.html.next_sibling, None)

        nested_span = self.tree.find(id="1.1")
        self.assertEqual(nested_span.next_sibling, None)

        last_span = self.tree.find(id="4")
        self.assertEqual(last_span.next_sibling, None)

    def test_find_next_sibling(self):
        self.assertEqual(self.start.find_next_sibling('span')['id'], '2')

    def test_next_siblings(self):
        self.assertSelectsIDs(self.start.find_next_siblings("span"),
                              ['2', '3', '4'])

        self.assertSelectsIDs(self.start.find_next_siblings(id='3'), ['3'])

    def test_next_sibling_for_text_element(self):
        soup = self.soup("Foo<b>bar</b>baz")
        start = soup.find(text="Foo")
        self.assertEqual(start.next_sibling.name, 'b')
        self.assertEqual(start.next_sibling.next_sibling, 'baz')

        self.assertSelects(start.find_next_siblings('b'), ['bar'])
        self.assertEqual(start.find_next_sibling(text="baz"), "baz")
        self.assertEqual(start.find_next_sibling(text="nonesuch"), None)


class TestPreviousSibling(SiblingTest):
    """Test previous_sibling navigation and the find_previous_sibling family."""

    def setUp(self):
        super(TestPreviousSibling, self).setUp()
        self.end = self.tree.find(id="4")

    def test_previous_sibling_of_root_is_none(self):
        self.assertEqual(self.tree.previous_sibling, None)

    def test_previous_sibling(self):
        self.assertEqual(self.end.previous_sibling['id'], '3')
        self.assertEqual(self.end.previous_sibling.previous_sibling['id'], '2')

        # Note the difference between previous_sibling and previous_element.
        self.assertEqual(self.end.previous_element['id'], '3.1')

    def test_previous_sibling_may_not_exist(self):
        self.assertEqual(self.tree.html.previous_sibling, None)

        nested_span = self.tree.find(id="1.1")
        self.assertEqual(nested_span.previous_sibling, None)

        first_span = self.tree.find(id="1")
        self.assertEqual(first_span.previous_sibling, None)

    def test_find_previous_sibling(self):
        self.assertEqual(self.end.find_previous_sibling('span')['id'], '3')

    def test_previous_siblings(self):
        self.assertSelectsIDs(self.end.find_previous_siblings("span"),
                              ['3', '2', '1'])

        self.assertSelectsIDs(self.end.find_previous_siblings(id='1'), ['1'])

    def test_previous_sibling_for_text_element(self):
        soup = self.soup("Foo<b>bar</b>baz")
        start = soup.find(text="baz")
        self.assertEqual(start.previous_sibling.name, 'b')
        self.assertEqual(start.previous_sibling.previous_sibling, 'Foo')

        self.assertSelects(start.find_previous_siblings('b'), ['bar'])
        self.assertEqual(start.find_previous_sibling(text="Foo"), "Foo")
        self.assertEqual(start.find_previous_sibling(text="nonesuch"), None)


class TestTagCreation(SoupTest):
    """Test the ability to create new tags."""

    def test_new_tag(self):
        soup = self.soup("")
        new_tag = soup.new_tag("foo", bar="baz")
        self.assertIsInstance(new_tag, Tag)
        self.assertEqual("foo", new_tag.name)
        self.assertEqual(dict(bar="baz"), new_tag.attrs)
        self.assertEqual(None, new_tag.parent)

    def test_tag_inherits_self_closing_rules_from_builder(self):
        if XML_BUILDER_PRESENT:
            xml_soup = BeautifulSoup("", "lxml-xml")
            xml_br = xml_soup.new_tag("br")
            xml_p = xml_soup.new_tag("p")

            # Both the <br> and <p> tag are empty-element, just because
            # they have no contents.
            self.assertEqual(b"<br/>", xml_br.encode())
            self.assertEqual(b"<p/>", xml_p.encode())

        html_soup = BeautifulSoup("", "html.parser")
        html_br = html_soup.new_tag("br")
        html_p = html_soup.new_tag("p")

        # The HTML builder uses HTML's rules about which tags are
        # empty-element tags, and the new tags reflect these rules.
        self.assertEqual(b"<br/>", html_br.encode())
        self.assertEqual(b"<p></p>", html_p.encode())

    def test_new_string_creates_navigablestring(self):
        soup = self.soup("")
        s = soup.new_string("foo")
        self.assertEqual("foo", s)
        self.assertIsInstance(s, NavigableString)

    def test_new_string_can_create_navigablestring_subclass(self):
        soup = self.soup("")
        s = soup.new_string("foo", Comment)
        self.assertEqual("foo", s)
        self.assertIsInstance(s, Comment)


class TestTreeModification(SoupTest):
    """Test in-place changes to the tree.

    Covers attribute edits, insert/append, insert_before/insert_after,
    replace_with, wrap/unwrap and extract.
    """

    def test_attribute_modification(self):
        soup = self.soup('<a id="1"></a>')
        soup.a['id'] = 2
        self.assertEqual(soup.decode(), self.document_for('<a id="2"></a>'))
        del soup.a['id']
        self.assertEqual(soup.decode(), self.document_for('<a></a>'))
        soup.a['id2'] = 'foo'
        self.assertEqual(soup.decode(), self.document_for('<a id2="foo"></a>'))

    def test_new_tag_creation(self):
        builder = builder_registry.lookup('html')()
        soup = self.soup("<body></body>", builder=builder)
        a = Tag(soup, builder, 'a')
        ol = Tag(soup, builder, 'ol')
        a['href'] = 'http://foo.com/'
        soup.body.insert(0, a)
        soup.body.insert(1, ol)
        self.assertEqual(
            soup.body.encode(),
            b'<body><a href="http://foo.com/"></a><ol></ol></body>')

    def test_append_to_contents_moves_tag(self):
        # The two paragraphs are separated by a bare newline, matching
        # the expected output asserted below.
        doc = ('<p id="1">Don\'t leave me <b>here</b>.</p>\n'
               '<p id="2">Don\'t leave!</p>')
        soup = self.soup(doc)
        second_para = soup.find(id='2')
        bold = soup.b

        # Move the <b> tag to the end of the second paragraph.
        soup.find(id='2').append(soup.b)

        # The <b> tag is now a child of the second paragraph.
        self.assertEqual(bold.parent, second_para)

        self.assertEqual(
            soup.decode(), self.document_for(
                '<p id="1">Don\'t leave me .</p>\n'
                '<p id="2">Don\'t leave!<b>here</b></p>'))

    def test_replace_with_returns_thing_that_was_replaced(self):
        text = "<a></a><b><c></c></b>"
        soup = self.soup(text)
        a = soup.a
        new_a = a.replace_with(soup.c)
        self.assertEqual(a, new_a)

    def test_unwrap_returns_thing_that_was_replaced(self):
        text = "<a><b></b><c></c></a>"
        soup = self.soup(text)
        a = soup.a
        new_a = a.unwrap()
        self.assertEqual(a, new_a)

    def test_replace_with_and_unwrap_give_useful_exception_when_tag_has_no_parent(self):
        soup = self.soup("<a><b>Foo</b></a><c>Bar</c>")
        a = soup.a
        a.extract()
        self.assertEqual(None, a.parent)
        self.assertRaises(ValueError, a.unwrap)
        self.assertRaises(ValueError, a.replace_with, soup.c)

    def test_replace_tag_with_itself(self):
        text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
        soup = self.soup(text)
        c = soup.c
        soup.c.replace_with(c)
        self.assertEqual(soup.decode(), self.document_for(text))

    def test_replace_tag_with_its_parent_raises_exception(self):
        text = "<a><b></b></a>"
        soup = self.soup(text)
        self.assertRaises(ValueError, soup.b.replace_with, soup.a)

    def test_insert_tag_into_itself_raises_exception(self):
        text = "<a><b></b></a>"
        soup = self.soup(text)
        self.assertRaises(ValueError, soup.a.insert, 0, soup.a)

    def test_replace_with_maintains_next_element_throughout(self):
        soup = self.soup('<p><a>one</a><b>three</b></p>')
        a = soup.a
        # Make it so the <a> tag has two text children.
        a.insert(1, "two")

        # Now replace each one with the empty string.
        left, right = a.contents
        left.replace_with('')
        right.replace_with('')

        # The <b> tag is still connected to the tree.
        self.assertEqual("three", soup.b.string)

    def test_replace_final_node(self):
        soup = self.soup("<b>Argh!</b>")
        soup.find(text="Argh!").replace_with("Hooray!")
        new_text = soup.find(text="Hooray!")
        b = soup.b
        self.assertEqual(new_text.previous_element, b)
        self.assertEqual(new_text.parent, b)
        self.assertEqual(new_text.previous_element.next_element, new_text)
        self.assertEqual(new_text.next_element, None)

    def test_consecutive_text_nodes(self):
        # A builder should never create two consecutive text nodes,
        # but if you insert one next to another, Beautiful Soup will
        # handle it correctly.
        soup = self.soup("<a><b>Argh!</b><c></c></a>")
        soup.b.insert(1, "Hooray!")

        self.assertEqual(
            soup.decode(), self.document_for(
                "<a><b>Argh!Hooray!</b><c></c></a>"))

        new_text = soup.find(text="Hooray!")
        self.assertEqual(new_text.previous_element, "Argh!")
        self.assertEqual(new_text.previous_element.next_element, new_text)

        self.assertEqual(new_text.previous_sibling, "Argh!")
        self.assertEqual(new_text.previous_sibling.next_sibling, new_text)

        self.assertEqual(new_text.next_sibling, None)
        self.assertEqual(new_text.next_element, soup.c)

    def test_insert_string(self):
        soup = self.soup("<a></a>")
        soup.a.insert(0, "bar")
        soup.a.insert(0, "foo")
        # The strings were added to the tag.
        self.assertEqual(["foo", "bar"], soup.a.contents)
        # And they were converted to NavigableStrings.
        self.assertEqual(soup.a.contents[0].next_element, "bar")

    def test_insert_tag(self):
        builder = self.default_builder
        soup = self.soup(
            "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
        magic_tag = Tag(soup, builder, 'magictag')
        magic_tag.insert(0, "the")
        soup.a.insert(1, magic_tag)

        self.assertEqual(
            soup.decode(), self.document_for(
                "<a><b>Find</b><magictag>the</magictag><c>lady!</c><d></d></a>"))

        # Make sure all the relationships are hooked up correctly.
        b_tag = soup.b
        self.assertEqual(b_tag.next_sibling, magic_tag)
        self.assertEqual(magic_tag.previous_sibling, b_tag)

        find = b_tag.find(text="Find")
        self.assertEqual(find.next_element, magic_tag)
        self.assertEqual(magic_tag.previous_element, find)

        c_tag = soup.c
        self.assertEqual(magic_tag.next_sibling, c_tag)
        self.assertEqual(c_tag.previous_sibling, magic_tag)

        the = magic_tag.find(text="the")
        self.assertEqual(the.parent, magic_tag)
        self.assertEqual(the.next_element, c_tag)
        self.assertEqual(c_tag.previous_element, the)

    def test_append_child_thats_already_at_the_end(self):
        data = "<a><b></b></a>"
        soup = self.soup(data)
        soup.a.append(soup.b)
        self.assertEqual(data, soup.decode())

    def test_move_tag_to_beginning_of_parent(self):
        data = "<a><b></b><c></c><d></d></a>"
        soup = self.soup(data)
        soup.a.insert(0, soup.d)
        self.assertEqual("<a><d></d><b></b><c></c></a>", soup.decode())

    def test_insert_works_on_empty_element_tag(self):
        # This is a little strange, since most HTML parsers don't allow
        # markup like this to come through. But in general, we don't
        # know what the parser would or wouldn't have allowed, so
        # I'm letting this succeed for now.
        soup = self.soup("<br/>")
        soup.br.insert(1, "Contents")
        self.assertEqual(str(soup.br), "<br>Contents</br>")

    def test_insert_before(self):
        soup = self.soup("<a>foo</a><b>bar</b>")
        soup.b.insert_before("BAZ")
        soup.a.insert_before("QUUX")
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<a>foo</a>BAZ<b>bar</b>"))

        soup.a.insert_before(soup.b)
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))

    def test_insert_after(self):
        soup = self.soup("<a>foo</a><b>bar</b>")
        soup.b.insert_after("BAZ")
        soup.a.insert_after("QUUX")
        self.assertEqual(
            soup.decode(), self.document_for("<a>foo</a>QUUX<b>bar</b>BAZ"))
        soup.b.insert_after(soup.a)
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))

    def test_insert_after_raises_exception_if_after_has_no_meaning(self):
        soup = self.soup("")
        tag = soup.new_tag("a")
        string = soup.new_string("")
        self.assertRaises(ValueError, string.insert_after, tag)
        self.assertRaises(NotImplementedError, soup.insert_after, tag)
        self.assertRaises(ValueError, tag.insert_after, tag)

    def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self):
        soup = self.soup("")
        tag = soup.new_tag("a")
        string = soup.new_string("")
        self.assertRaises(ValueError, string.insert_before, tag)
        self.assertRaises(NotImplementedError, soup.insert_before, tag)
        self.assertRaises(ValueError, tag.insert_before, tag)

    def test_replace_with(self):
        soup = self.soup(
            "<p>There's <b>no</b> business like <b>show</b> business</p>")
        no, show = soup.find_all('b')
        show.replace_with(no)
        # Moving the first <b> away leaves the spaces on both sides of
        # it adjacent, hence the double space after "There's".
        self.assertEqual(
            soup.decode(),
            self.document_for(
                "<p>There's  business like <b>no</b> business</p>"))

        self.assertEqual(show.parent, None)
        self.assertEqual(no.parent, soup.p)
        self.assertEqual(no.next_element, "no")
        self.assertEqual(no.next_sibling, " business")

    def test_replace_first_child(self):
        data = "<a><b></b><c></c></a>"
        soup = self.soup(data)
        soup.b.replace_with(soup.c)
        self.assertEqual("<a><c></c></a>", soup.decode())

    def test_replace_last_child(self):
        data = "<a><b></b><c></c></a>"
        soup = self.soup(data)
        soup.c.replace_with(soup.b)
        self.assertEqual("<a><b></b></a>", soup.decode())

    def test_nested_tag_replace_with(self):
        soup = self.soup(
            """<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")

        # Replace the entire <b> tag and its contents ("reserve the
        # right") with the <f> tag ("refuse").
        remove_tag = soup.b
        move_tag = soup.f
        remove_tag.replace_with(move_tag)

        self.assertEqual(
            soup.decode(), self.document_for(
                "<a>We<f>refuse</f></a><e>to<g>service</g></e>"))

        # The <b> tag is now an orphan.
        self.assertEqual(remove_tag.parent, None)
        self.assertEqual(remove_tag.find(text="right").next_element, None)
        self.assertEqual(remove_tag.previous_element, None)
        self.assertEqual(remove_tag.next_sibling, None)
        self.assertEqual(remove_tag.previous_sibling, None)

        # The <f> tag is now connected to the <a> tag.
        self.assertEqual(move_tag.parent, soup.a)
        self.assertEqual(move_tag.previous_element, "We")
        self.assertEqual(move_tag.next_element.next_element, soup.e)
        self.assertEqual(move_tag.next_sibling, None)

        # The gap where the <f> tag used to be has been mended, and
        # the word "to" is now connected to the <g> tag.
        to_text = soup.find(text="to")
        g_tag = soup.g
        self.assertEqual(to_text.next_element, g_tag)
        self.assertEqual(to_text.next_sibling, g_tag)
        self.assertEqual(g_tag.previous_element, to_text)
        self.assertEqual(g_tag.previous_sibling, to_text)

    def test_unwrap(self):
        tree = self.soup("""
            <p>Unneeded <em>formatting</em> is unneeded</p>
            """)
        tree.em.unwrap()
        self.assertEqual(tree.em, None)
        self.assertEqual(tree.p.text, "Unneeded formatting is unneeded")

    def test_wrap(self):
        soup = self.soup("I wish I was bold.")
        value = soup.string.wrap(soup.new_tag("b"))
        self.assertEqual(value.decode(), "<b>I wish I was bold.</b>")
        self.assertEqual(
            soup.decode(), self.document_for("<b>I wish I was bold.</b>"))

    def test_wrap_extracts_tag_from_elsewhere(self):
        soup = self.soup("<b></b>I wish I was bold.")
        soup.b.next_sibling.wrap(soup.b)
        self.assertEqual(
            soup.decode(), self.document_for("<b>I wish I was bold.</b>"))

    def test_wrap_puts_new_contents_at_the_end(self):
        soup = self.soup("<b>I like being bold.</b>I wish I was bold.")
        soup.b.next_sibling.wrap(soup.b)
        self.assertEqual(2, len(soup.b.contents))
        self.assertEqual(
            soup.decode(), self.document_for(
                "<b>I like being bold.I wish I was bold.</b>"))

    def test_extract(self):
        soup = self.soup(
            '<html><body>Some content. <div id="nav">Nav crap</div> More content.</body></html>')

        self.assertEqual(len(soup.body.contents), 3)
        extracted = soup.find(id="nav").extract()

        # Removing the <div> leaves the spaces on both sides of it
        # adjacent, hence the double space.
        self.assertEqual(
            soup.decode(), "<html><body>Some content.  More content.</body></html>")
        self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')

        # The extracted tag is now an orphan.
1051 self.assertEqual(len(soup.body.contents), 2) 1052 self.assertEqual(extracted.parent, None) 1053 self.assertEqual(extracted.previous_element, None) 1054 self.assertEqual(extracted.next_element.next_element, None) 1055 1056 # The gap where the extracted tag used to be has been mended. 1057 content_1 = soup.find(text="Some content. ") 1058 content_2 = soup.find(text=" More content.") 1059 self.assertEqual(content_1.next_element, content_2) 1060 self.assertEqual(content_1.next_sibling, content_2) 1061 self.assertEqual(content_2.previous_element, content_1) 1062 self.assertEqual(content_2.previous_sibling, content_1) 1063 1064 def test_extract_distinguishes_between_identical_strings(self): 1065 soup = self.soup("<a>foo</a><b>bar</b>") 1066 foo_1 = soup.a.string 1067 bar_1 = soup.b.string 1068 foo_2 = soup.new_string("foo") 1069 bar_2 = soup.new_string("bar") 1070 soup.a.append(foo_2) 1071 soup.b.append(bar_2) 1072 1073 # Now there are two identical strings in the <a> tag, and two 1074 # in the <b> tag. Let's remove the first "foo" and the second 1075 # "bar". 
1076 foo_1.extract() 1077 bar_2.extract() 1078 self.assertEqual(foo_2, soup.a.string) 1079 self.assertEqual(bar_2, soup.b.string) 1080 1081 def test_extract_multiples_of_same_tag(self): 1082 soup = self.soup(""" 1083<html> 1084<head> 1085<script>foo</script> 1086</head> 1087<body> 1088 <script>bar</script> 1089 <a></a> 1090</body> 1091<script>baz</script> 1092</html>""") 1093 [soup.script.extract() for i in soup.find_all("script")] 1094 self.assertEqual("<body>\n\n<a></a>\n</body>", str(soup.body)) 1095 1096 1097 def test_extract_works_when_element_is_surrounded_by_identical_strings(self): 1098 soup = self.soup( 1099 '<html>\n' 1100 '<body>hi</body>\n' 1101 '</html>') 1102 soup.find('body').extract() 1103 self.assertEqual(None, soup.find('body')) 1104 1105 1106 def test_clear(self): 1107 """Tag.clear()""" 1108 soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>") 1109 # clear using extract() 1110 a = soup.a 1111 soup.p.clear() 1112 self.assertEqual(len(soup.p.contents), 0) 1113 self.assertTrue(hasattr(a, "contents")) 1114 1115 # clear using decompose() 1116 em = a.em 1117 a.clear(decompose=True) 1118 self.assertEqual(0, len(em.contents)) 1119 1120 def test_string_set(self): 1121 """Tag.string = 'string'""" 1122 soup = self.soup("<a></a> <b><c></c></b>") 1123 soup.a.string = "foo" 1124 self.assertEqual(soup.a.contents, ["foo"]) 1125 soup.b.string = "bar" 1126 self.assertEqual(soup.b.contents, ["bar"]) 1127 1128 def test_string_set_does_not_affect_original_string(self): 1129 soup = self.soup("<a><b>foo</b><c>bar</c>") 1130 soup.b.string = soup.c.string 1131 self.assertEqual(soup.a.encode(), b"<a><b>bar</b><c>bar</c></a>") 1132 1133 def test_set_string_preserves_class_of_string(self): 1134 soup = self.soup("<a></a>") 1135 cdata = CData("foo") 1136 soup.a.string = cdata 1137 self.assertTrue(isinstance(soup.a.string, CData)) 1138 1139class TestElementObjects(SoupTest): 1140 """Test various features of element objects.""" 1141 1142 def test_len(self): 
1143 """The length of an element is its number of children.""" 1144 soup = self.soup("<top>1<b>2</b>3</top>") 1145 1146 # The BeautifulSoup object itself contains one element: the 1147 # <top> tag. 1148 self.assertEqual(len(soup.contents), 1) 1149 self.assertEqual(len(soup), 1) 1150 1151 # The <top> tag contains three elements: the text node "1", the 1152 # <b> tag, and the text node "3". 1153 self.assertEqual(len(soup.top), 3) 1154 self.assertEqual(len(soup.top.contents), 3) 1155 1156 def test_member_access_invokes_find(self): 1157 """Accessing a Python member .foo invokes find('foo')""" 1158 soup = self.soup('<b><i></i></b>') 1159 self.assertEqual(soup.b, soup.find('b')) 1160 self.assertEqual(soup.b.i, soup.find('b').find('i')) 1161 self.assertEqual(soup.a, None) 1162 1163 def test_deprecated_member_access(self): 1164 soup = self.soup('<b><i></i></b>') 1165 with warnings.catch_warnings(record=True) as w: 1166 tag = soup.bTag 1167 self.assertEqual(soup.b, tag) 1168 self.assertEqual( 1169 '.bTag is deprecated, use .find("b") instead.', 1170 str(w[0].message)) 1171 1172 def test_has_attr(self): 1173 """has_attr() checks for the presence of an attribute. 1174 1175 Please note note: has_attr() is different from 1176 __in__. has_attr() checks the tag's attributes and __in__ 1177 checks the tag's chidlren. 1178 """ 1179 soup = self.soup("<foo attr='bar'>") 1180 self.assertTrue(soup.foo.has_attr('attr')) 1181 self.assertFalse(soup.foo.has_attr('attr2')) 1182 1183 1184 def test_attributes_come_out_in_alphabetical_order(self): 1185 markup = '<b a="1" z="5" m="3" f="2" y="4"></b>' 1186 self.assertSoupEquals(markup, '<b a="1" f="2" m="3" y="4" z="5"></b>') 1187 1188 def test_string(self): 1189 # A tag that contains only a text node makes that node 1190 # available as .string. 1191 soup = self.soup("<b>foo</b>") 1192 self.assertEqual(soup.b.string, 'foo') 1193 1194 def test_empty_tag_has_no_string(self): 1195 # A tag with no children has no .stirng. 
1196 soup = self.soup("<b></b>") 1197 self.assertEqual(soup.b.string, None) 1198 1199 def test_tag_with_multiple_children_has_no_string(self): 1200 # A tag with no children has no .string. 1201 soup = self.soup("<a>foo<b></b><b></b></b>") 1202 self.assertEqual(soup.b.string, None) 1203 1204 soup = self.soup("<a>foo<b></b>bar</b>") 1205 self.assertEqual(soup.b.string, None) 1206 1207 # Even if all the children are strings, due to trickery, 1208 # it won't work--but this would be a good optimization. 1209 soup = self.soup("<a>foo</b>") 1210 soup.a.insert(1, "bar") 1211 self.assertEqual(soup.a.string, None) 1212 1213 def test_tag_with_recursive_string_has_string(self): 1214 # A tag with a single child which has a .string inherits that 1215 # .string. 1216 soup = self.soup("<a><b>foo</b></a>") 1217 self.assertEqual(soup.a.string, "foo") 1218 self.assertEqual(soup.string, "foo") 1219 1220 def test_lack_of_string(self): 1221 """Only a tag containing a single text node has a .string.""" 1222 soup = self.soup("<b>f<i>e</i>o</b>") 1223 self.assertFalse(soup.b.string) 1224 1225 soup = self.soup("<b></b>") 1226 self.assertFalse(soup.b.string) 1227 1228 def test_all_text(self): 1229 """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated""" 1230 soup = self.soup("<a>a<b>r</b> <r> t </r></a>") 1231 self.assertEqual(soup.a.text, "ar t ") 1232 self.assertEqual(soup.a.get_text(strip=True), "art") 1233 self.assertEqual(soup.a.get_text(","), "a,r, , t ") 1234 self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t") 1235 1236 def test_get_text_ignores_comments(self): 1237 soup = self.soup("foo<!--IGNORE-->bar") 1238 self.assertEqual(soup.get_text(), "foobar") 1239 1240 self.assertEqual( 1241 soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar") 1242 self.assertEqual( 1243 soup.get_text(types=None), "fooIGNOREbar") 1244 1245 def test_all_strings_ignores_comments(self): 1246 soup = self.soup("foo<!--IGNORE-->bar") 1247 self.assertEqual(['foo', 'bar'], 
list(soup.strings)) 1248 1249class TestCDAtaListAttributes(SoupTest): 1250 1251 """Testing cdata-list attributes like 'class'. 1252 """ 1253 def test_single_value_becomes_list(self): 1254 soup = self.soup("<a class='foo'>") 1255 self.assertEqual(["foo"],soup.a['class']) 1256 1257 def test_multiple_values_becomes_list(self): 1258 soup = self.soup("<a class='foo bar'>") 1259 self.assertEqual(["foo", "bar"], soup.a['class']) 1260 1261 def test_multiple_values_separated_by_weird_whitespace(self): 1262 soup = self.soup("<a class='foo\tbar\nbaz'>") 1263 self.assertEqual(["foo", "bar", "baz"],soup.a['class']) 1264 1265 def test_attributes_joined_into_string_on_output(self): 1266 soup = self.soup("<a class='foo\tbar'>") 1267 self.assertEqual(b'<a class="foo bar"></a>', soup.a.encode()) 1268 1269 def test_accept_charset(self): 1270 soup = self.soup('<form accept-charset="ISO-8859-1 UTF-8">') 1271 self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset']) 1272 1273 def test_cdata_attribute_applying_only_to_one_tag(self): 1274 data = '<a accept-charset="ISO-8859-1 UTF-8"></a>' 1275 soup = self.soup(data) 1276 # We saw in another test that accept-charset is a cdata-list 1277 # attribute for the <form> tag. But it's not a cdata-list 1278 # attribute for any other tag. 1279 self.assertEqual('ISO-8859-1 UTF-8', soup.a['accept-charset']) 1280 1281 def test_string_has_immutable_name_property(self): 1282 string = self.soup("s").string 1283 self.assertEqual(None, string.name) 1284 def t(): 1285 string.name = 'foo' 1286 self.assertRaises(AttributeError, t) 1287 1288class TestPersistence(SoupTest): 1289 "Testing features like pickle and deepcopy." 
1290 1291 def setUp(self): 1292 super(TestPersistence, self).setUp() 1293 self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" 1294"http://www.w3.org/TR/REC-html40/transitional.dtd"> 1295<html> 1296<head> 1297<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> 1298<title>Beautiful Soup: We called him Tortoise because he taught us.</title> 1299<link rev="made" href="mailto:leonardr@segfault.org"> 1300<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping."> 1301<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)"> 1302<meta name="author" content="Leonard Richardson"> 1303</head> 1304<body> 1305<a href="foo">foo</a> 1306<a href="foo"><b>bar</b></a> 1307</body> 1308</html>""" 1309 self.tree = self.soup(self.page) 1310 1311 def test_pickle_and_unpickle_identity(self): 1312 # Pickling a tree, then unpickling it, yields a tree identical 1313 # to the original. 1314 dumped = pickle.dumps(self.tree, 2) 1315 loaded = pickle.loads(dumped) 1316 self.assertEqual(loaded.__class__, BeautifulSoup) 1317 self.assertEqual(loaded.decode(), self.tree.decode()) 1318 1319 def test_deepcopy_identity(self): 1320 # Making a deepcopy of a tree yields an identical tree. 1321 copied = copy.deepcopy(self.tree) 1322 self.assertEqual(copied.decode(), self.tree.decode()) 1323 1324 def test_unicode_pickle(self): 1325 # A tree containing Unicode characters can be pickled. 
1326 html = "<b>\N{SNOWMAN}</b>" 1327 soup = self.soup(html) 1328 dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) 1329 loaded = pickle.loads(dumped) 1330 self.assertEqual(loaded.decode(), soup.decode()) 1331 1332 def test_copy_navigablestring_is_not_attached_to_tree(self): 1333 html = "<b>Foo<a></a></b><b>Bar</b>" 1334 soup = self.soup(html) 1335 s1 = soup.find(string="Foo") 1336 s2 = copy.copy(s1) 1337 self.assertEqual(s1, s2) 1338 self.assertEqual(None, s2.parent) 1339 self.assertEqual(None, s2.next_element) 1340 self.assertNotEqual(None, s1.next_sibling) 1341 self.assertEqual(None, s2.next_sibling) 1342 self.assertEqual(None, s2.previous_element) 1343 1344 def test_copy_navigablestring_subclass_has_same_type(self): 1345 html = "<b><!--Foo--></b>" 1346 soup = self.soup(html) 1347 s1 = soup.string 1348 s2 = copy.copy(s1) 1349 self.assertEqual(s1, s2) 1350 self.assertTrue(isinstance(s2, Comment)) 1351 1352 def test_copy_entire_soup(self): 1353 html = "<div><b>Foo<a></a></b><b>Bar</b></div>end" 1354 soup = self.soup(html) 1355 soup_copy = copy.copy(soup) 1356 self.assertEqual(soup, soup_copy) 1357 1358 def test_copy_tag_copies_contents(self): 1359 html = "<div><b>Foo<a></a></b><b>Bar</b></div>end" 1360 soup = self.soup(html) 1361 div = soup.div 1362 div_copy = copy.copy(div) 1363 1364 # The two tags look the same, and evaluate to equal. 1365 self.assertEqual(str(div), str(div_copy)) 1366 self.assertEqual(div, div_copy) 1367 1368 # But they're not the same object. 1369 self.assertFalse(div is div_copy) 1370 1371 # And they don't have the same relation to the parse tree. The 1372 # copy is not associated with a parse tree at all. 
1373 self.assertEqual(None, div_copy.parent) 1374 self.assertEqual(None, div_copy.previous_element) 1375 self.assertEqual(None, div_copy.find(string='Bar').next_element) 1376 self.assertNotEqual(None, div.find(string='Bar').next_element) 1377 1378class TestSubstitutions(SoupTest): 1379 1380 def test_default_formatter_is_minimal(self): 1381 markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" 1382 soup = self.soup(markup) 1383 decoded = soup.decode(formatter="minimal") 1384 # The < is converted back into < but the e-with-acute is left alone. 1385 self.assertEqual( 1386 decoded, 1387 self.document_for( 1388 "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) 1389 1390 def test_formatter_html(self): 1391 markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" 1392 soup = self.soup(markup) 1393 decoded = soup.decode(formatter="html") 1394 self.assertEqual( 1395 decoded, 1396 self.document_for("<b><<Sacré bleu!>></b>")) 1397 1398 def test_formatter_minimal(self): 1399 markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" 1400 soup = self.soup(markup) 1401 decoded = soup.decode(formatter="minimal") 1402 # The < is converted back into < but the e-with-acute is left alone. 1403 self.assertEqual( 1404 decoded, 1405 self.document_for( 1406 "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) 1407 1408 def test_formatter_null(self): 1409 markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" 1410 soup = self.soup(markup) 1411 decoded = soup.decode(formatter=None) 1412 # Neither the angle brackets nor the e-with-acute are converted. 1413 # This is not valid HTML, but it's what the user wanted. 
1414 self.assertEqual(decoded, 1415 self.document_for("<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) 1416 1417 def test_formatter_custom(self): 1418 markup = "<b><foo></b><b>bar</b>" 1419 soup = self.soup(markup) 1420 decoded = soup.decode(formatter = lambda x: x.upper()) 1421 # Instead of normal entity conversion code, the custom 1422 # callable is called on every string. 1423 self.assertEqual( 1424 decoded, 1425 self.document_for("<b><FOO></b><b>BAR</b>")) 1426 1427 def test_formatter_is_run_on_attribute_values(self): 1428 markup = '<a href="http://a.com?a=b&c=é">e</a>' 1429 soup = self.soup(markup) 1430 a = soup.a 1431 1432 expect_minimal = '<a href="http://a.com?a=b&c=é">e</a>' 1433 1434 self.assertEqual(expect_minimal, a.decode()) 1435 self.assertEqual(expect_minimal, a.decode(formatter="minimal")) 1436 1437 expect_html = '<a href="http://a.com?a=b&c=é">e</a>' 1438 self.assertEqual(expect_html, a.decode(formatter="html")) 1439 1440 self.assertEqual(markup, a.decode(formatter=None)) 1441 expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>' 1442 self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) 1443 1444 def test_formatter_skips_script_tag_for_html_documents(self): 1445 doc = """ 1446 <script type="text/javascript"> 1447 console.log("< < hey > > "); 1448 </script> 1449""" 1450 encoded = BeautifulSoup(doc, 'html.parser').encode() 1451 self.assertTrue(b"< < hey > >" in encoded) 1452 1453 def test_formatter_skips_style_tag_for_html_documents(self): 1454 doc = """ 1455 <style type="text/css"> 1456 console.log("< < hey > > "); 1457 </style> 1458""" 1459 encoded = BeautifulSoup(doc, 'html.parser').encode() 1460 self.assertTrue(b"< < hey > >" in encoded) 1461 1462 def test_prettify_leaves_preformatted_text_alone(self): 1463 soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ") 1464 # Everything outside the <pre> tag is reformatted, but everything 1465 # inside is left alone. 
1466 self.assertEqual( 1467 '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>', 1468 soup.div.prettify()) 1469 1470 def test_prettify_accepts_formatter(self): 1471 soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser') 1472 pretty = soup.prettify(formatter = lambda x: x.upper()) 1473 self.assertTrue("FOO" in pretty) 1474 1475 def test_prettify_outputs_unicode_by_default(self): 1476 soup = self.soup("<a></a>") 1477 self.assertEqual(str, type(soup.prettify())) 1478 1479 def test_prettify_can_encode_data(self): 1480 soup = self.soup("<a></a>") 1481 self.assertEqual(bytes, type(soup.prettify("utf-8"))) 1482 1483 def test_html_entity_substitution_off_by_default(self): 1484 markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>" 1485 soup = self.soup(markup) 1486 encoded = soup.b.encode("utf-8") 1487 self.assertEqual(encoded, markup.encode('utf-8')) 1488 1489 def test_encoding_substitution(self): 1490 # Here's the <meta> tag saying that a document is 1491 # encoded in Shift-JIS. 1492 meta_tag = ('<meta content="text/html; charset=x-sjis" ' 1493 'http-equiv="Content-type"/>') 1494 soup = self.soup(meta_tag) 1495 1496 # Parse the document, and the charset apprears unchanged. 1497 self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis') 1498 1499 # Encode the document into some encoding, and the encoding is 1500 # substituted into the meta tag. 
1501 utf_8 = soup.encode("utf-8") 1502 self.assertTrue(b"charset=utf-8" in utf_8) 1503 1504 euc_jp = soup.encode("euc_jp") 1505 self.assertTrue(b"charset=euc_jp" in euc_jp) 1506 1507 shift_jis = soup.encode("shift-jis") 1508 self.assertTrue(b"charset=shift-jis" in shift_jis) 1509 1510 utf_16_u = soup.encode("utf-16").decode("utf-16") 1511 self.assertTrue("charset=utf-16" in utf_16_u) 1512 1513 def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self): 1514 markup = ('<head><meta content="text/html; charset=x-sjis" ' 1515 'http-equiv="Content-type"/></head><pre>foo</pre>') 1516 1517 # Beautiful Soup used to try to rewrite the meta tag even if the 1518 # meta tag got filtered out by the strainer. This test makes 1519 # sure that doesn't happen. 1520 strainer = SoupStrainer('pre') 1521 soup = self.soup(markup, parse_only=strainer) 1522 self.assertEqual(soup.contents[0].name, 'pre') 1523 1524class TestEncoding(SoupTest): 1525 """Test the ability to encode objects into strings.""" 1526 1527 def test_unicode_string_can_be_encoded(self): 1528 html = "<b>\N{SNOWMAN}</b>" 1529 soup = self.soup(html) 1530 self.assertEqual(soup.b.string.encode("utf-8"), 1531 "\N{SNOWMAN}".encode("utf-8")) 1532 1533 def test_tag_containing_unicode_string_can_be_encoded(self): 1534 html = "<b>\N{SNOWMAN}</b>" 1535 soup = self.soup(html) 1536 self.assertEqual( 1537 soup.b.encode("utf-8"), html.encode("utf-8")) 1538 1539 def test_encoding_substitutes_unrecognized_characters_by_default(self): 1540 html = "<b>\N{SNOWMAN}</b>" 1541 soup = self.soup(html) 1542 self.assertEqual(soup.b.encode("ascii"), b"<b>☃</b>") 1543 1544 def test_encoding_can_be_made_strict(self): 1545 html = "<b>\N{SNOWMAN}</b>" 1546 soup = self.soup(html) 1547 self.assertRaises( 1548 UnicodeEncodeError, soup.encode, "ascii", errors="strict") 1549 1550 def test_decode_contents(self): 1551 html = "<b>\N{SNOWMAN}</b>" 1552 soup = self.soup(html) 1553 self.assertEqual("\N{SNOWMAN}", soup.b.decode_contents()) 1554 1555 def 
test_encode_contents(self): 1556 html = "<b>\N{SNOWMAN}</b>" 1557 soup = self.soup(html) 1558 self.assertEqual( 1559 "\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents( 1560 encoding="utf8")) 1561 1562 def test_deprecated_renderContents(self): 1563 html = "<b>\N{SNOWMAN}</b>" 1564 soup = self.soup(html) 1565 self.assertEqual( 1566 "\N{SNOWMAN}".encode("utf8"), soup.b.renderContents()) 1567 1568 def test_repr(self): 1569 html = "<b>\N{SNOWMAN}</b>" 1570 soup = self.soup(html) 1571 if PY3K: 1572 self.assertEqual(html, repr(soup)) 1573 else: 1574 self.assertEqual(b'<b>\\u2603</b>', repr(soup)) 1575 1576class TestNavigableStringSubclasses(SoupTest): 1577 1578 def test_cdata(self): 1579 # None of the current builders turn CDATA sections into CData 1580 # objects, but you can create them manually. 1581 soup = self.soup("") 1582 cdata = CData("foo") 1583 soup.insert(1, cdata) 1584 self.assertEqual(str(soup), "<![CDATA[foo]]>") 1585 self.assertEqual(soup.find(text="foo"), "foo") 1586 self.assertEqual(soup.contents[0], "foo") 1587 1588 def test_cdata_is_never_formatted(self): 1589 """Text inside a CData object is passed into the formatter. 1590 1591 But the return value is ignored. 1592 """ 1593 1594 self.count = 0 1595 def increment(*args): 1596 self.count += 1 1597 return "BITTER FAILURE" 1598 1599 soup = self.soup("") 1600 cdata = CData("<><><>") 1601 soup.insert(1, cdata) 1602 self.assertEqual( 1603 b"<![CDATA[<><><>]]>", soup.encode(formatter=increment)) 1604 self.assertEqual(1, self.count) 1605 1606 def test_doctype_ends_in_newline(self): 1607 # Unlike other NavigableString subclasses, a DOCTYPE always ends 1608 # in a newline. 
1609 doctype = Doctype("foo") 1610 soup = self.soup("") 1611 soup.insert(1, doctype) 1612 self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n") 1613 1614 def test_declaration(self): 1615 d = Declaration("foo") 1616 self.assertEqual("<?foo?>", d.output_ready()) 1617 1618class TestSoupSelector(TreeTest): 1619 1620 HTML = """ 1621<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" 1622"http://www.w3.org/TR/html4/strict.dtd"> 1623<html> 1624<head> 1625<title>The title</title> 1626<link rel="stylesheet" href="blah.css" type="text/css" id="l1"> 1627</head> 1628<body> 1629<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag> 1630<div id="main" class="fancy"> 1631<div id="inner"> 1632<h1 id="header1">An H1</h1> 1633<p>Some text</p> 1634<p class="onep" id="p1">Some more text</p> 1635<h2 id="header2">An H2</h2> 1636<p class="class1 class2 class3" id="pmulti">Another</p> 1637<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a> 1638<h2 id="header3">Another H2</h2> 1639<a id="me" href="http://simonwillison.net/" rel="me">me</a> 1640<span class="s1"> 1641<a href="#" id="s1a1">span1a1</a> 1642<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a> 1643<span class="span2"> 1644<a href="#" id="s2a1">span2a1</a> 1645</span> 1646<span class="span3"></span> 1647<custom-dashed-tag class="dashed" id="dash2"/> 1648<div data-tag="dashedvalue" id="data1"/> 1649</span> 1650</div> 1651<x id="xid"> 1652<z id="zida"/> 1653<z id="zidab"/> 1654<z id="zidac"/> 1655</x> 1656<y id="yid"> 1657<z id="zidb"/> 1658</y> 1659<p lang="en" id="lang-en">English</p> 1660<p lang="en-gb" id="lang-en-gb">English UK</p> 1661<p lang="en-us" id="lang-en-us">English US</p> 1662<p lang="fr" id="lang-fr">French</p> 1663</div> 1664 1665<div id="footer"> 1666</div> 1667""" 1668 1669 def setUp(self): 1670 self.soup = BeautifulSoup(self.HTML, 'html.parser') 1671 1672 def assertSelects(self, selector, expected_ids): 1673 el_ids = [el['id'] for el in self.soup.select(selector)] 
1674 el_ids.sort() 1675 expected_ids.sort() 1676 self.assertEqual(expected_ids, el_ids, 1677 "Selector %s, expected [%s], got [%s]" % ( 1678 selector, ', '.join(expected_ids), ', '.join(el_ids) 1679 ) 1680 ) 1681 1682 assertSelect = assertSelects 1683 1684 def assertSelectMultiple(self, *tests): 1685 for selector, expected_ids in tests: 1686 self.assertSelect(selector, expected_ids) 1687 1688 def test_one_tag_one(self): 1689 els = self.soup.select('title') 1690 self.assertEqual(len(els), 1) 1691 self.assertEqual(els[0].name, 'title') 1692 self.assertEqual(els[0].contents, ['The title']) 1693 1694 def test_one_tag_many(self): 1695 els = self.soup.select('div') 1696 self.assertEqual(len(els), 4) 1697 for div in els: 1698 self.assertEqual(div.name, 'div') 1699 1700 el = self.soup.select_one('div') 1701 self.assertEqual('main', el['id']) 1702 1703 def test_select_one_returns_none_if_no_match(self): 1704 match = self.soup.select_one('nonexistenttag') 1705 self.assertEqual(None, match) 1706 1707 1708 def test_tag_in_tag_one(self): 1709 els = self.soup.select('div div') 1710 self.assertSelects('div div', ['inner', 'data1']) 1711 1712 def test_tag_in_tag_many(self): 1713 for selector in ('html div', 'html body div', 'body div'): 1714 self.assertSelects(selector, ['data1', 'main', 'inner', 'footer']) 1715 1716 def test_tag_no_match(self): 1717 self.assertEqual(len(self.soup.select('del')), 0) 1718 1719 def test_invalid_tag(self): 1720 self.assertRaises(ValueError, self.soup.select, 'tag%t') 1721 1722 def test_select_dashed_tag_ids(self): 1723 self.assertSelects('custom-dashed-tag', ['dash1', 'dash2']) 1724 1725 def test_select_dashed_by_id(self): 1726 dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]') 1727 self.assertEqual(dashed[0].name, 'custom-dashed-tag') 1728 self.assertEqual(dashed[0]['id'], 'dash2') 1729 1730 def test_dashed_tag_text(self): 1731 self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, 'Hello there.') 1732 1733 def 
test_select_dashed_matches_find_all(self): 1734 self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag')) 1735 1736 def test_header_tags(self): 1737 self.assertSelectMultiple( 1738 ('h1', ['header1']), 1739 ('h2', ['header2', 'header3']), 1740 ) 1741 1742 def test_class_one(self): 1743 for selector in ('.onep', 'p.onep', 'html p.onep'): 1744 els = self.soup.select(selector) 1745 self.assertEqual(len(els), 1) 1746 self.assertEqual(els[0].name, 'p') 1747 self.assertEqual(els[0]['class'], ['onep']) 1748 1749 def test_class_mismatched_tag(self): 1750 els = self.soup.select('div.onep') 1751 self.assertEqual(len(els), 0) 1752 1753 def test_one_id(self): 1754 for selector in ('div#inner', '#inner', 'div div#inner'): 1755 self.assertSelects(selector, ['inner']) 1756 1757 def test_bad_id(self): 1758 els = self.soup.select('#doesnotexist') 1759 self.assertEqual(len(els), 0) 1760 1761 def test_items_in_id(self): 1762 els = self.soup.select('div#inner p') 1763 self.assertEqual(len(els), 3) 1764 for el in els: 1765 self.assertEqual(el.name, 'p') 1766 self.assertEqual(els[1]['class'], ['onep']) 1767 self.assertFalse(els[0].has_attr('class')) 1768 1769 def test_a_bunch_of_emptys(self): 1770 for selector in ('div#main del', 'div#main div.oops', 'div div#main'): 1771 self.assertEqual(len(self.soup.select(selector)), 0) 1772 1773 def test_multi_class_support(self): 1774 for selector in ('.class1', 'p.class1', '.class2', 'p.class2', 1775 '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'): 1776 self.assertSelects(selector, ['pmulti']) 1777 1778 def test_multi_class_selection(self): 1779 for selector in ('.class1.class3', '.class3.class2', 1780 '.class1.class2.class3'): 1781 self.assertSelects(selector, ['pmulti']) 1782 1783 def test_child_selector(self): 1784 self.assertSelects('.s1 > a', ['s1a1', 's1a2']) 1785 self.assertSelects('.s1 > a span', ['s1a2s1']) 1786 1787 def test_child_selector_id(self): 1788 self.assertSelects('.s1 > 
a#s1a2 span', ['s1a2s1']) 1789 1790 def test_attribute_equals(self): 1791 self.assertSelectMultiple( 1792 ('p[class="onep"]', ['p1']), 1793 ('p[id="p1"]', ['p1']), 1794 ('[class="onep"]', ['p1']), 1795 ('[id="p1"]', ['p1']), 1796 ('link[rel="stylesheet"]', ['l1']), 1797 ('link[type="text/css"]', ['l1']), 1798 ('link[href="blah.css"]', ['l1']), 1799 ('link[href="no-blah.css"]', []), 1800 ('[rel="stylesheet"]', ['l1']), 1801 ('[type="text/css"]', ['l1']), 1802 ('[href="blah.css"]', ['l1']), 1803 ('[href="no-blah.css"]', []), 1804 ('p[href="no-blah.css"]', []), 1805 ('[href="no-blah.css"]', []), 1806 ) 1807 1808 def test_attribute_tilde(self): 1809 self.assertSelectMultiple( 1810 ('p[class~="class1"]', ['pmulti']), 1811 ('p[class~="class2"]', ['pmulti']), 1812 ('p[class~="class3"]', ['pmulti']), 1813 ('[class~="class1"]', ['pmulti']), 1814 ('[class~="class2"]', ['pmulti']), 1815 ('[class~="class3"]', ['pmulti']), 1816 ('a[rel~="friend"]', ['bob']), 1817 ('a[rel~="met"]', ['bob']), 1818 ('[rel~="friend"]', ['bob']), 1819 ('[rel~="met"]', ['bob']), 1820 ) 1821 1822 def test_attribute_startswith(self): 1823 self.assertSelectMultiple( 1824 ('[rel^="style"]', ['l1']), 1825 ('link[rel^="style"]', ['l1']), 1826 ('notlink[rel^="notstyle"]', []), 1827 ('[rel^="notstyle"]', []), 1828 ('link[rel^="notstyle"]', []), 1829 ('link[href^="bla"]', ['l1']), 1830 ('a[href^="http://"]', ['bob', 'me']), 1831 ('[href^="http://"]', ['bob', 'me']), 1832 ('[id^="p"]', ['pmulti', 'p1']), 1833 ('[id^="m"]', ['me', 'main']), 1834 ('div[id^="m"]', ['main']), 1835 ('a[id^="m"]', ['me']), 1836 ('div[data-tag^="dashed"]', ['data1']) 1837 ) 1838 1839 def test_attribute_endswith(self): 1840 self.assertSelectMultiple( 1841 ('[href$=".css"]', ['l1']), 1842 ('link[href$=".css"]', ['l1']), 1843 ('link[id$="1"]', ['l1']), 1844 ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']), 1845 ('div[id$="1"]', ['data1']), 1846 ('[id$="noending"]', []), 1847 ) 1848 1849 def 
test_attribute_contains(self): 1850 self.assertSelectMultiple( 1851 # From test_attribute_startswith 1852 ('[rel*="style"]', ['l1']), 1853 ('link[rel*="style"]', ['l1']), 1854 ('notlink[rel*="notstyle"]', []), 1855 ('[rel*="notstyle"]', []), 1856 ('link[rel*="notstyle"]', []), 1857 ('link[href*="bla"]', ['l1']), 1858 ('[href*="http://"]', ['bob', 'me']), 1859 ('[id*="p"]', ['pmulti', 'p1']), 1860 ('div[id*="m"]', ['main']), 1861 ('a[id*="m"]', ['me']), 1862 # From test_attribute_endswith 1863 ('[href*=".css"]', ['l1']), 1864 ('link[href*=".css"]', ['l1']), 1865 ('link[id*="1"]', ['l1']), 1866 ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']), 1867 ('div[id*="1"]', ['data1']), 1868 ('[id*="noending"]', []), 1869 # New for this test 1870 ('[href*="."]', ['bob', 'me', 'l1']), 1871 ('a[href*="."]', ['bob', 'me']), 1872 ('link[href*="."]', ['l1']), 1873 ('div[id*="n"]', ['main', 'inner']), 1874 ('div[id*="nn"]', ['inner']), 1875 ('div[data-tag*="edval"]', ['data1']) 1876 ) 1877 1878 def test_attribute_exact_or_hypen(self): 1879 self.assertSelectMultiple( 1880 ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), 1881 ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), 1882 ('p[lang|="fr"]', ['lang-fr']), 1883 ('p[lang|="gb"]', []), 1884 ) 1885 1886 def test_attribute_exists(self): 1887 self.assertSelectMultiple( 1888 ('[rel]', ['l1', 'bob', 'me']), 1889 ('link[rel]', ['l1']), 1890 ('a[rel]', ['bob', 'me']), 1891 ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']), 1892 ('p[class]', ['p1', 'pmulti']), 1893 ('[blah]', []), 1894 ('p[blah]', []), 1895 ('div[data-tag]', ['data1']) 1896 ) 1897 1898 def test_unsupported_pseudoclass(self): 1899 self.assertRaises( 1900 NotImplementedError, self.soup.select, "a:no-such-pseudoclass") 1901 1902 self.assertRaises( 1903 NotImplementedError, self.soup.select, "a:nth-of-type(a)") 1904 1905 1906 def test_nth_of_type(self): 1907 # Try to select first paragraph 1908 els = 
self.soup.select('div#inner p:nth-of-type(1)') 1909 self.assertEqual(len(els), 1) 1910 self.assertEqual(els[0].string, 'Some text') 1911 1912 # Try to select third paragraph 1913 els = self.soup.select('div#inner p:nth-of-type(3)') 1914 self.assertEqual(len(els), 1) 1915 self.assertEqual(els[0].string, 'Another') 1916 1917 # Try to select (non-existent!) fourth paragraph 1918 els = self.soup.select('div#inner p:nth-of-type(4)') 1919 self.assertEqual(len(els), 0) 1920 1921 # Pass in an invalid value. 1922 self.assertRaises( 1923 ValueError, self.soup.select, 'div p:nth-of-type(0)') 1924 1925 def test_nth_of_type_direct_descendant(self): 1926 els = self.soup.select('div#inner > p:nth-of-type(1)') 1927 self.assertEqual(len(els), 1) 1928 self.assertEqual(els[0].string, 'Some text') 1929 1930 def test_id_child_selector_nth_of_type(self): 1931 self.assertSelects('#inner > p:nth-of-type(2)', ['p1']) 1932 1933 def test_select_on_element(self): 1934 # Other tests operate on the tree; this operates on an element 1935 # within the tree. 1936 inner = self.soup.find("div", id="main") 1937 selected = inner.select("div") 1938 # The <div id="inner"> tag was selected. The <div id="footer"> 1939 # tag was not. 
1940 self.assertSelectsIDs(selected, ['inner', 'data1']) 1941 1942 def test_overspecified_child_id(self): 1943 self.assertSelects(".fancy #inner", ['inner']) 1944 self.assertSelects(".normal #inner", []) 1945 1946 def test_adjacent_sibling_selector(self): 1947 self.assertSelects('#p1 + h2', ['header2']) 1948 self.assertSelects('#p1 + h2 + p', ['pmulti']) 1949 self.assertSelects('#p1 + #header2 + .class1', ['pmulti']) 1950 self.assertEqual([], self.soup.select('#p1 + p')) 1951 1952 def test_general_sibling_selector(self): 1953 self.assertSelects('#p1 ~ h2', ['header2', 'header3']) 1954 self.assertSelects('#p1 ~ #header2', ['header2']) 1955 self.assertSelects('#p1 ~ h2 + a', ['me']) 1956 self.assertSelects('#p1 ~ h2 + [rel="me"]', ['me']) 1957 self.assertEqual([], self.soup.select('#inner ~ h2')) 1958 1959 def test_dangling_combinator(self): 1960 self.assertRaises(ValueError, self.soup.select, 'h1 >') 1961 1962 def test_sibling_combinator_wont_select_same_tag_twice(self): 1963 self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr']) 1964 1965 # Test the selector grouping operator (the comma) 1966 def test_multiple_select(self): 1967 self.assertSelects('x, y', ['xid', 'yid']) 1968 1969 def test_multiple_select_with_no_space(self): 1970 self.assertSelects('x,y', ['xid', 'yid']) 1971 1972 def test_multiple_select_with_more_space(self): 1973 self.assertSelects('x, y', ['xid', 'yid']) 1974 1975 def test_multiple_select_duplicated(self): 1976 self.assertSelects('x, x', ['xid']) 1977 1978 def test_multiple_select_sibling(self): 1979 self.assertSelects('x, y ~ p[lang=fr]', ['xid', 'lang-fr']) 1980 1981 def test_multiple_select_tag_and_direct_descendant(self): 1982 self.assertSelects('x, y > z', ['xid', 'zidb']) 1983 1984 def test_multiple_select_direct_descendant_and_tags(self): 1985 self.assertSelects('div > x, y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac']) 1986 1987 def test_multiple_select_indirect_descendant(self): 1988 self.assertSelects('div 
x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac']) 1989 1990 def test_invalid_multiple_select(self): 1991 self.assertRaises(ValueError, self.soup.select, ',x, y') 1992 self.assertRaises(ValueError, self.soup.select, 'x,,y') 1993 1994 def test_multiple_select_attrs(self): 1995 self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb']) 1996 1997 def test_multiple_select_ids(self): 1998 self.assertSelects('x, y > z[id=zida], z[id=zidab], z[id=zidb]', ['xid', 'zidb', 'zidab']) 1999 2000 def test_multiple_select_nested(self): 2001 self.assertSelects('body > div > x, y > z', ['xid', 'zidb']) 2002 2003 2004 2005