# Use of this source code is governed by the MIT license.
__license__ = "MIT"

__all__ = [
    'HTML5TreeBuilder',
    ]

import warnings
import re
from bs4.builder import (
    DetectsXMLParsedAsHTML,
    PERMISSIVE,
    HTML,
    HTML_5,
    HTMLTreeBuilder,
    )
from bs4.element import (
    NamespacedAttribute,
    nonwhitespace_re,
)
import html5lib
from html5lib.constants import (
    namespaces,
    prefixes,
    )
from bs4.element import (
    Comment,
    Doctype,
    NavigableString,
    Tag,
    )

try:
    # Pre-0.99999999
    from html5lib.treebuilders import _base as treebuilder_base
    new_html5lib = False
except ImportError:
    # 0.99999999 and up
    from html5lib.treebuilders import base as treebuilder_base
    new_html5lib = True


class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree.

    Note that this TreeBuilder does not support some features common
    to HTML TreeBuilders. Some of these features could theoretically
    be implemented, but at the very least it's quite difficult,
    because html5lib moves the parse tree around as it's being built.

    * This TreeBuilder doesn't use different subclasses of NavigableString
      based on the name of the tag in which the string was found.

    * You can't use a SoupStrainer to parse only part of a document.
    """

    NAME = "html5lib"

    features = [NAME, PERMISSIVE, HTML_5, HTML]

    # html5lib can tell us which line number and position in the
    # original file is the source of an element.
    TRACKS_LINE_NUMBERS = True

    def prepare_markup(self, markup, user_specified_encoding,
                       document_declared_encoding=None, exclude_encodings=None):
        """Yield the one way this builder will try to parse `markup`.

        The markup itself is passed through untouched; the only state
        kept is `user_specified_encoding`, which `feed` later hands to
        html5lib.

        :param markup: The document to parse (str or bytes).
        :param user_specified_encoding: Encoding named by the caller;
            stored on the builder for use in `feed`.
        :param document_declared_encoding: Ignored by this builder.
        :param exclude_encodings: Not supported; a warning is issued
            if a truthy value is given.
        :yield: A single 4-tuple ``(markup, None, None, False)``.
            NOTE(review): the meaning of the last three slots is set by
            the TreeBuilder contract in bs4.builder -- confirm there.
        """
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding

        # document_declared_encoding and exclude_encodings aren't used
        # ATM because the html5lib TreeBuilder doesn't use
        # UnicodeDammit.
        if exclude_encodings:
            warnings.warn(
                "You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.",
                stacklevel=3
            )

        # html5lib only parses HTML, so if it's given XML that's worth
        # noting.
        DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
            markup, stacklevel=3
        )

        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        """Run html5lib over `markup`, building the tree into self.soup.

        After parsing, the returned document's `original_encoding` is
        set to the encoding html5lib's tokenizer reported, or to None
        when the input was already a str.

        :param markup: The document to parse (str or bytes).
        """
        if self.soup.parse_only is not None:
            warnings.warn(
                "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
                stacklevel=4
            )
        # html5lib calls create_treebuilder, which sets
        # self.underlying_builder as a side effect.
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        self.underlying_builder.parser = parser
        extra_kwargs = dict()
        if not isinstance(markup, str):
            # The keyword used to force an encoding was renamed
            # between html5lib versions.
            if new_html5lib:
                extra_kwargs['override_encoding'] = self.user_specified_encoding
            else:
                extra_kwargs['encoding'] = self.user_specified_encoding
        doc = parser.parse(markup, **extra_kwargs)

        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, str):
            # We need to special-case this because html5lib sets
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
        else:
            original_encoding = parser.tokenizer.stream.charEncoding[0]
            if not isinstance(original_encoding, str):
                # In 0.99999999 and up, the encoding is an html5lib
                # Encoding object. We want to use a string for compatibility
                # with other tree builders.
                original_encoding = original_encoding.name
            doc.original_encoding = original_encoding
        # Drop the reference to the parser now that parsing is done.
        self.underlying_builder.parser = None

    def create_treebuilder(self, namespaceHTMLElements):
        """Create, remember and return the html5lib-facing tree builder.

        Passed to html5lib.HTMLParser as its `tree` factory.

        :param namespaceHTMLElements: Forwarded unchanged to
            TreeBuilderForHtml5lib.
        :return: The new TreeBuilderForHtml5lib, also stored as
            self.underlying_builder.
        """
        self.underlying_builder = TreeBuilderForHtml5lib(
            namespaceHTMLElements, self.soup,
            store_line_numbers=self.store_line_numbers
        )
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<html><head></head><body>%s</body></html>' % fragment


class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
    """An html5lib TreeBuilder whose nodes wrap Beautiful Soup objects."""

    def __init__(self, namespaceHTMLElements, soup=None,
                 store_line_numbers=True, **kwargs):
        """Set up the builder around an existing or a fresh soup.

        :param namespaceHTMLElements: Forwarded to the html5lib base
            TreeBuilder constructor.
        :param soup: The BeautifulSoup object to build into. If falsy,
            an empty one is created.
        :param store_line_numbers: Whether elementClass should record
            the source line and position on new tags.
        :param kwargs: Only used when a soup has to be created; passed
            to the BeautifulSoup constructor.
        """
        if soup:
            self.soup = soup
        else:
            from bs4 import BeautifulSoup
            # TODO: Why is the parser 'html.parser' here? To avoid an
            # infinite loop?
            self.soup = BeautifulSoup(
                "", "html.parser", store_line_numbers=store_line_numbers,
                **kwargs
            )
        # TODO: What are **kwargs exactly? Should they be passed in
        # here in addition to/instead of being passed to the BeautifulSoup
        # constructor?
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

        # This will be set later to an html5lib.html5parser.HTMLParser
        # object, which we can use to track the current line number.
        self.parser = None
        self.store_line_numbers = store_line_numbers

    def documentClass(self):
        """Reset the soup and return it wrapped as the document node."""
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        """Turn a doctype token into a Doctype and add it to the soup.

        :param token: A mapping with "name", "publicId" and "systemId"
            keys.
        """
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        """Create a new Tag and return it wrapped in an Element.

        :param name: Tag name.
        :param namespace: Namespace of the tag, or None.
        """
        kwargs = {}
        if self.parser and self.store_line_numbers:
            # This represents the point immediately after the end of the
            # tag. We don't know when the tag started, but we do know
            # where it ended -- the character just before this one.
            sourceline, sourcepos = self.parser.tokenizer.stream.position()
            kwargs['sourceline'] = sourceline
            kwargs['sourcepos'] = sourcepos-1
        tag = self.soup.new_tag(name, namespace, **kwargs)

        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        """Wrap `data` in a Comment and return it as a TextNode."""
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        """Replace self.soup with a fresh fragment soup and wrap it."""
        from bs4 import BeautifulSoup
        # TODO: Why is the parser 'html.parser' here? To avoid an
        # infinite loop?
        self.soup = BeautifulSoup("", "html.parser")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        """Append the object wrapped by `node` to the soup itself."""
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        """Return the soup being built."""
        return self.soup

    def getFragment(self):
        """Return the Beautiful Soup object behind the base fragment."""
        return treebuilder_base.TreeBuilder.getFragment(self).element

    def testSerializer(self, element):
        """Serialize `element` and its descendants to an indented dump.

        Each node becomes one line starting with "|" followed by
        indentation; attributes are listed, sorted, on their own lines
        two columns deeper than their tag.

        :param element: The Beautiful Soup object to serialize.
        :return: The lines joined with newlines.
        """
        from bs4 import BeautifulSoup
        rv = []
        doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')

        def serializeElement(element, indent=0):
            # A BeautifulSoup object gets no special treatment; it
            # falls through to the generic tag branch below.
            if isinstance(element, BeautifulSoup):
                pass
            if isinstance(element, Doctype):
                m = doctype_re.match(element)
                if m:
                    name = m.group(1)
                    # lastindex > 1 means a PUBLIC or SYSTEM id matched.
                    if m.lastindex > 1:
                        publicId = m.group(2) or ""
                        systemId = m.group(3) or m.group(4) or ""
                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
                                  (' ' * indent, name, publicId, systemId))
                    else:
                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
                else:
                    rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
            elif isinstance(element, Comment):
                rv.append("|%s<!-- %s -->" % (' ' * indent, element))
            elif isinstance(element, NavigableString):
                rv.append("|%s\"%s\"" % (' ' * indent, element))
            else:
                if element.namespace:
                    name = "%s %s" % (prefixes[element.namespace],
                                      element.name)
                else:
                    name = element.name
                rv.append("|%s<%s>" % (' ' * indent, name))
                if element.attrs:
                    attributes = []
                    for name, value in list(element.attrs.items()):
                        if isinstance(name, NamespacedAttribute):
                            name = "%s %s" % (prefixes[name.namespace], name.name)
                        # Multi-valued attributes are stored as lists.
                        if isinstance(value, list):
                            value = " ".join(value)
                        attributes.append((name, value))

                    for name, value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
                indent += 2
                for child in element.children:
                    serializeElement(child, indent)
        serializeElement(element, 0)

        return "\n".join(rv)


class AttrList(object):
    """A mapping-like view of one element's attributes.

    Reads are served from a snapshot (`self.attrs`) taken at
    construction time. Writes go straight to the underlying element
    and do not update that snapshot.
    """

    def __init__(self, element):
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        """Iterate over (name, value) pairs from the snapshot."""
        return iter(list(self.attrs.items()))

    def __setitem__(self, name, value):
        """Set attribute `name` on the underlying element."""
        # If this attribute is a multi-valued attribute for this element,
        # turn its value into a list.
        list_attr = self.element.cdata_list_attributes or {}
        if (name in list_attr.get('*', [])
            or (self.element.name in list_attr
                and name in list_attr.get(self.element.name, []))):
            # A node that is being cloned may have already undergone
            # this procedure.
            if not isinstance(value, list):
                value = nonwhitespace_re.findall(value)
        self.element[name] = value

    def items(self):
        """Return the snapshot's (name, value) pairs as a list."""
        return list(self.attrs.items())

    def keys(self):
        """Return the snapshot's attribute names as a list."""
        return list(self.attrs.keys())

    def __len__(self):
        return len(self.attrs)

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        # Membership test directly on the dict instead of building a
        # list of keys first.
        return name in self.attrs


class Element(treebuilder_base.Node):
    """An html5lib Node that wraps a Beautiful Soup object."""

    def __init__(self, element, soup, namespace):
        """
        :param element: The Beautiful Soup object being wrapped.
        :param soup: The BeautifulSoup object that owns the tree.
        :param namespace: Namespace of the element, or None.
        """
        treebuilder_base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        """Append `node` as the last child of this element.

        `node` may be an Element/TextNode wrapper, a bare Tag, or a
        plain str. A plain NavigableString appended directly after
        another plain NavigableString is merged into it.
        """
        string_child = child = None
        if isinstance(node, str):
            # Some other piece of code decided to pass in a string
            # instead of creating a TextElement object to contain the
            # string.
            string_child = child = node
        elif isinstance(node, Tag):
            # Some other piece of code decided to pass in a Tag
            # instead of creating an Element object to contain the
            # Tag.
            child = node
        elif node.element.__class__ == NavigableString:
            string_child = child = node.element
            node.parent = self
        else:
            child = node.element
            node.parent = self

        if not isinstance(child, str) and child.parent is not None:
            # Detach the object we are about to append. `child` is
            # node.element for wrappers and `node` itself for a bare
            # Tag, which has no wrapped `.element` to extract.
            child.extract()

        if (string_child is not None and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + string_child)
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
            if isinstance(node, str):
                # Create a brand new NavigableString from this string.
                child = self.soup.new_string(node)

            # Tell Beautiful Soup to act as if it parsed this element
            # immediately after the parent's last descendant. (Or
            # immediately after the parent, if it has no children.)
            if self.element.contents:
                most_recent_element = self.element._last_descendant(False)
            elif self.element.next_element is not None:
                # Something from further ahead in the parse tree is
                # being inserted into this earlier element. This is
                # very annoying because it means an expensive search
                # for the last element in the tree.
                most_recent_element = self.soup._last_descendant()
            else:
                most_recent_element = self.element

            self.soup.object_was_parsed(
                child, parent=self.element,
                most_recent_element=most_recent_element)

    def getAttributes(self):
        """Return this element's attributes.

        :return: An empty dict for a Comment, otherwise a new AttrList
            over the wrapped element.
        """
        if isinstance(self.element, Comment):
            return {}
        return AttrList(self.element)

    def setAttributes(self, attributes):
        """Copy `attributes` onto the wrapped element.

        Tuple keys are replaced in `attributes` (in place) by
        NamespacedAttribute objects before the values are copied.

        :param attributes: A mapping of attribute name to value, or
            None. None and empty mappings are ignored.
        """
        if attributes is not None and len(attributes) > 0:
            # Iterate over a copy because the mapping is modified.
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, attributes)
            for name, value in list(attributes.items()):
                self.element[name] = value

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        """Insert `data` as text, before `insertBefore` or at the end.

        :param data: The text to insert.
        :param insertBefore: The child node to insert in front of; if
            falsy the text is appended.
        """
        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        """Insert `node` immediately before the child `refNode`.

        A plain NavigableString is merged into the sibling directly
        preceding `refNode` when that sibling is also a plain
        NavigableString.
        """
        index = self.element.index(refNode.element)
        # Only look at the preceding sibling when there is one. With
        # index == 0, contents[index-1] would wrap around to the last
        # child and the text could be merged into the wrong node.
        if (node.element.__class__ == NavigableString and index > 0
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            old_node = self.element.contents[index-1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        """Detach the object wrapped by `node` from the tree."""
        node.element.extract()

    def reparentChildren(self, new_parent):
        """Move all of this tag's children into another tag."""
        # print("MOVE", self.element.contents)
        # print("FROM", self.element)
        # print("TO", new_parent.element)

        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
        # are removed.
        final_next_element = element.next_sibling

        new_parents_last_descendant = new_parent_element._last_descendant(False, False)
        if len(new_parent_element.contents) > 0:
            # The new parent already contains children. We will be
            # appending this tag's children to the end.
            new_parents_last_child = new_parent_element.contents[-1]
            new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
        else:
            # The new parent contains no children.
            new_parents_last_child = None
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
            if new_parents_last_descendant is not None:
                first_child.previous_element = new_parents_last_descendant
            else:
                first_child.previous_element = new_parent_element
            first_child.previous_sibling = new_parents_last_child
            if new_parents_last_descendant is not None:
                new_parents_last_descendant.next_element = first_child
            else:
                new_parent_element.next_element = first_child
            if new_parents_last_child is not None:
                new_parents_last_child.next_sibling = first_child

            # Find the very last element being moved. It is now the
            # parent's last descendant. It has no .next_sibling and
            # its .next_element is whatever the previous last
            # descendant had.
            last_childs_last_descendant = to_append[-1]._last_descendant(False, True)

            last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
            if new_parents_last_descendant_next_element is not None:
                # TODO: This code has no test coverage and I'm not sure
                # how to get html5lib to go through this path, but it's
                # just the other side of the previous line.
                new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
            last_childs_last_descendant.next_sibling = None

        for child in to_append:
            child.parent = new_parent_element
            new_parent_element.contents.append(child)

        # Now that this element has no children, change its .next_element.
        element.contents = []
        element.next_element = final_next_element

        # print("DONE WITH MOVE")
        # print("FROM", self.element)
        # print("TO", new_parent_element)

    def cloneNode(self):
        """Return a new childless Element with the same name,
        namespace and attributes as this one.
        """
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        """Return the wrapped element's list of children.

        The list itself is returned, so the result is truthy exactly
        when the element has at least one child.
        """
        return self.element.contents

    def getNameTuple(self):
        """Return (namespace, name), defaulting to the HTML namespace
        when this element has none.
        """
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)


class TextNode(Element):
    """An Element wrapper around a string-like Beautiful Soup object."""

    def __init__(self, element, soup):
        """
        :param element: The string-like object being wrapped.
        :param soup: The BeautifulSoup object that owns the tree.
        """
        # The node is registered with the html5lib base class under
        # the name None; no namespace attribute is set.
        treebuilder_base.Node.__init__(self, None)
        self.element = element
        self.soup = soup

    def cloneNode(self):
        """Cloning is not supported for text nodes."""
        raise NotImplementedError