# Use of this source code is governed by the MIT license.
__license__ = "MIT"

__all__ = [
    'LXMLTreeBuilderForXML',
    'LXMLTreeBuilder',
    ]

try:
    from collections.abc import Callable # Python 3.6
except ImportError:
    # Fallback for very old Pythons. (collections.Callable was removed
    # in Python 3.10, so the collections.abc import must come first.)
    from collections import Callable

from io import BytesIO
from io import StringIO
from lxml import etree
from bs4.element import (
    Comment,
    Doctype,
    NamespacedAttribute,
    ProcessingInstruction,
    XMLProcessingInstruction,
)
from bs4.builder import (
    DetectsXMLParsedAsHTML,
    FAST,
    HTML,
    HTMLTreeBuilder,
    PERMISSIVE,
    ParserRejectedMarkup,
    TreeBuilder,
    XML)
from bs4.dammit import EncodingDetector

LXML = 'lxml'

def _invert(d):
    """Invert a dictionary, mapping each value to its key.

    If two keys share a value, the last one iterated wins.
    """
    return {v: k for k, v in d.items()}

class LXMLTreeBuilderForXML(TreeBuilder):
    """A TreeBuilder that uses lxml's event-driven XML parser,
    with this object serving as the parser's target.
    """

    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True
    processing_instruction_class = XMLProcessingInstruction

    NAME = "lxml-xml"
    ALTERNATE_NAMES = ["xml"]

    # Well, it's permissive by XML parser standards.
    features = [NAME, LXML, XML, FAST, PERMISSIVE]

    # Markup is fed to lxml in chunks of this size (see feed()).
    CHUNK_SIZE = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')

    DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)

    # NOTE: If we parsed Element objects and looked at .sourceline,
    # we'd be able to see the line numbers from the original document.
    # But instead we build an XMLParser or HTMLParser object to serve
    # as the target of parse messages, and those messages don't include
    # line numbers.
    # See: https://bugs.launchpad.net/lxml/+bug/1846906

    def initialize_soup(self, soup):
        """Let the BeautifulSoup object know about the standard namespace
        mapping.

        :param soup: A `BeautifulSoup`.
        """
        super().initialize_soup(soup)
        self._register_namespaces(self.DEFAULT_NSMAPS)

    def _register_namespaces(self, mapping):
        """Let the BeautifulSoup object know about namespaces encountered
        while parsing the document.

        This might be useful later on when creating CSS selectors.

        This will track (almost) all namespaces, even ones that were
        only in scope for part of the document. If two namespaces have
        the same prefix, only the first one encountered will be
        tracked. Un-prefixed namespaces are not tracked.

        :param mapping: A dictionary mapping namespace prefixes to URIs.
        """
        for key, value in mapping.items():
            # This is 'if key' and not 'if key is not None' because we
            # don't track un-prefixed namespaces. Soupselect will
            # treat an un-prefixed namespace as the default, which
            # causes confusion in some cases.
            if key and key not in self.soup._namespaces:
                # Let the BeautifulSoup object know about a new namespace.
                # If there are multiple namespaces defined with the same
                # prefix, the first one in the document takes precedence.
                self.soup._namespaces[key] = value

    def default_parser(self, encoding):
        """Find the default parser for the given encoding.

        :param encoding: A string.
        :return: Either a parser object or a class, which
          will be instantiated with default arguments.
        """
        if self._default_parser is not None:
            return self._default_parser
        return etree.XMLParser(
            target=self, strip_cdata=False, recover=True, encoding=encoding)

    def parser_for(self, encoding):
        """Instantiate an appropriate parser for the given encoding.

        :param encoding: A string.
        :return: A parser object such as an `etree.XMLParser`.
        """
        # Use the default parser.
        parser = self.default_parser(encoding)

        if isinstance(parser, Callable):
            # A class (or other callable) was given rather than an
            # instance; instantiate it with default arguments.
            parser = parser(
                target=self, strip_cdata=False, recover=True, encoding=encoding
            )
        return parser

    def __init__(self, parser=None, empty_element_tags=None, **kwargs):
        """Constructor.

        :param parser: An lxml parser object, or a callable (such as a
            parser class) that creates one when given default arguments.
        :param empty_element_tags: Tag names to treat as empty-element
            tags.
        """
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
        self._default_parser = parser
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        self.soup = None
        # Stack of inverted (URI -> prefix) namespace mappings. A None
        # entry marks a tag that introduced no new namespaces while
        # non-default namespaces were in scope (see start()).
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
        # Stack of (prefix -> URI) mappings currently in scope.
        self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
        super().__init__(**kwargs)

    def _getNsTag(self, tag):
        """Split the namespace URL out of a fully-qualified lxml tag
        name. Copied from lxml's src/lxml/sax.py.

        :return: A 2-tuple (namespace URI or None, bare tag name).
        """
        if tag[0] == '{':
            return tuple(tag[1:].split('}', 1))
        else:
            return (None, tag)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       exclude_encodings=None,
                       document_declared_encoding=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        lxml really wants to get a bytestring and convert it to
        Unicode itself. So instead of using UnicodeDammit to convert
        the bytestring to Unicode using different encodings, this
        implementation uses EncodingDetector to iterate over the
        encodings, and tell lxml to try to parse the document as each
        one in turn.

        :param markup: Some markup -- hopefully a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.
        """
        is_html = not self.is_xml
        if is_html:
            self.processing_instruction_class = ProcessingInstruction
            # We're in HTML mode, so if we're given XML, that's worth
            # noting.
            DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
                markup, stacklevel=3
            )
        else:
            self.processing_instruction_class = XMLProcessingInstruction

        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?

            # TODO: This is a workaround for
            # https://bugs.launchpad.net/lxml/+bug/1948551.
            # We can remove it once the upstream issue is fixed.
            if len(markup) > 0 and markup[0] == '\N{BYTE ORDER MARK}':
                markup = markup[1:]
            yield markup, None, document_declared_encoding, False

        if isinstance(markup, str):
            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8",
                   document_declared_encoding, False)

        # This was provided by the end-user; treat it as a known
        # definite encoding per the algorithm laid out in the HTML5
        # spec. (See the EncodingDetector class for details.)
        known_definite_encodings = [user_specified_encoding]

        # This was found in the document; treat it as a slightly lower-priority
        # user encoding.
        user_encodings = [document_declared_encoding]
        detector = EncodingDetector(
            markup, known_definite_encodings=known_definite_encodings,
            user_encodings=user_encodings, is_html=is_html,
            exclude_encodings=exclude_encodings
        )
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

    def feed(self, markup):
        """Feed some markup into the parser, chunk by chunk, building
        the tree via the handler methods below.

        :param markup: A bytestring, string, or open filehandle.
        :raise ParserRejectedMarkup: If lxml cannot make sense of the
            markup under the chosen encoding.
        """
        if isinstance(markup, bytes):
            markup = BytesIO(markup)
        elif isinstance(markup, str):
            markup = StringIO(markup)

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = markup.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            while len(data) != 0:
                # Now call feed() on the rest of the data, chunk by chunk.
                data = markup.read(self.CHUNK_SIZE)
                if len(data) != 0:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            # Normalize lxml's various failure modes into the exception
            # BeautifulSoup expects from a builder that rejects markup.
            raise ParserRejectedMarkup(e)

    def close(self):
        """Called by lxml when parsing finishes: reset namespace state."""
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]

    def start(self, name, attrs, nsmap=None):
        """Called by lxml when a new tag is opened.

        :param name: The (possibly '{URI}'-qualified) tag name.
        :param attrs: A mapping of the tag's attributes.
        :param nsmap: Namespace prefix mappings newly introduced by
            this tag, if any.
        """
        # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
        attrs = dict(attrs)
        # Avoid a mutable default argument; treat a missing nsmap as empty.
        if nsmap is None:
            nsmap = {}
        nsprefix = None
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.

            # First, let the BeautifulSoup object know about it.
            self._register_namespaces(nsmap)

            # Then, add it to our running list of inverted namespace
            # mappings.
            self.nsmaps.append(_invert(nsmap))

            # The currently active namespace prefixes have
            # changed. Calculate the new mapping so it can be stored
            # with all Tag objects created while these prefixes are in
            # scope.
            current_mapping = dict(self.active_namespace_prefixes[-1])
            current_mapping.update(nsmap)

            # We should not track un-prefixed namespaces as we can only hold one
            # and it will be recognized as the default namespace by soupsieve,
            # which may be confusing in some situations.
            if '' in current_mapping:
                del current_mapping['']
            self.active_namespace_prefixes.append(current_mapping)

            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later. (attrs is already a private
            # copy, made at the top of this method, so it's safe to modify.)
            for prefix, namespace in nsmap.items():
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn them into NamespacedAttribute objects.
        new_attrs = {}
        for attr, value in attrs.items():
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                new_attrs[attr] = value
            else:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
                new_attrs[attr] = value
        attrs = new_attrs

        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(
            name, namespace, nsprefix, attrs,
            namespaces=self.active_namespace_prefixes[-1]
        )

    def _prefix_for_namespace(self, namespace):
        """Find the currently active prefix for the given namespace.

        :return: A prefix string, or None if the namespace is None or
            has no in-scope prefix.
        """
        if namespace is None:
            return None
        # Search innermost scope first.
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def end(self, name):
        """Called by lxml when a tag is closed."""
        self.soup.endData()
        namespace, name = self._getNsTag(name)
        # Reuse the helper rather than duplicating the reverse search
        # through self.nsmaps.
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_endtag(name, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            out_of_scope_nsmap = self.nsmaps.pop()

            if out_of_scope_nsmap is not None:
                # This tag introduced a namespace mapping which is no
                # longer in scope. Recalculate the currently active
                # namespace prefixes.
                self.active_namespace_prefixes.pop()

    def pi(self, target, data):
        """Handle a processing instruction reported by lxml."""
        self.soup.endData()
        data = target + ' ' + data
        self.soup.handle_data(data)
        self.soup.endData(self.processing_instruction_class)

    def data(self, content):
        """Handle character data reported by lxml."""
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        """Handle a DOCTYPE declaration reported by lxml."""
        self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment


class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """A TreeBuilder that uses lxml's (non-XML) HTML parser."""

    NAME = LXML
    ALTERNATE_NAMES = ["lxml-html"]

    features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
    is_xml = False
    processing_instruction_class = ProcessingInstruction

    def default_parser(self, encoding):
        """Find the default parser for the given encoding.

        :return: The etree.HTMLParser class; parser_for() will
            instantiate it with default arguments.
        """
        return etree.HTMLParser

    def feed(self, markup):
        """Feed the markup to an lxml HTML parser, all at once.

        :raise ParserRejectedMarkup: If lxml cannot make sense of
            the markup.
        """
        encoding = self.soup.original_encoding
        try:
            self.parser = self.parser_for(encoding)
            self.parser.feed(markup)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(e)


    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<html><body>%s</body></html>' % fragment