from collections import defaultdict
import itertools
import sys

from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    whitespace_re
    )

__all__ = [
    'HTMLTreeBuilder',
    'SAXTreeBuilder',
    'TreeBuilder',
    'TreeBuilderRegistry',
    ]

# Some useful features for a TreeBuilder to have.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'


class TreeBuilderRegistry(object):
    """Keep track of tree builder classes and the features they advertise.

    ``builders`` lists every registered class, most recently registered
    first. ``builders_for_feature`` maps a feature string to the classes
    advertising it, in the same most-recent-first order.
    """

    def __init__(self):
        self.builders_for_feature = defaultdict(list)
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features."""
        # Insert at the front so that later registrations take priority
        # over earlier ones in lookup().
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Find the highest-priority builder that has the given features.

        With no features, the most recently registered builder is
        returned. A feature that no registered builder advertises is
        ignored rather than ruling out every candidate; if none of the
        requested features is advertised by any builder, the result is
        None.

        :param features: Feature strings, most important first.
        :return: A registered builder class, or None if the registry is
            empty or no builder matches.
        """
        if not self.builders:
            # There are no builders at all.
            return None

        if not features:
            # They didn't ask for any features. Give them the most
            # recently registered builder.
            return self.builders[0]

        # Go down the list of features in order, and eliminate any
        # builders that don't match every (known) feature.
        candidates = None
        candidate_set = None
        for feature in features:
            we_have_the_feature = self.builders_for_feature.get(feature, [])
            if not we_have_the_feature:
                # Nobody advertises this feature; skip it.
                continue
            if candidates is None:
                # The first known feature fixes the priority order of
                # the candidates.
                candidates = we_have_the_feature
                candidate_set = set(candidates)
            else:
                # Eliminate any candidates that don't have this feature.
                candidate_set = candidate_set.intersection(
                    we_have_the_feature)

        # The only valid candidates are the ones in candidate_set.
        # Go through the original list of candidates and pick the first
        # one that's in candidate_set.
        if candidate_set is None:
            return None
        for candidate in candidates:
            if candidate in candidate_set:
                return candidate
        return None


# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()


class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree."""

    NAME = "[Unknown tree builder]"
    ALTERNATE_NAMES = []
    features = []

    is_xml = False
    picklable = False
    preserve_whitespace_tags = set()
    empty_element_tags = None  # A tag will be considered an empty-element
                               # tag when and only when it has no contents.

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    cdata_list_attributes = {}

    def __init__(self):
        self.soup = None

    def reset(self):
        """Hook with no behavior in the base class."""
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p />".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no contents.
        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
        be left alone.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        """Parse markup. Not implemented in the base class.

        :raises NotImplementedError: always.
        """
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Return the markup unchanged as the first item of a 4-tuple.

        The base implementation ignores both encoding arguments and
        returns ``(markup, None, None, False)``.
        """
        return markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Base implementation: does nothing to the tag, returns False."""
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """Replaces class="foo bar" with class=["foo", "bar"]

        Modifies its input in place.

        :param tag_name: Name of the tag the attributes belong to; it is
            lowercased before being looked up in cdata_list_attributes.
        :param attrs: Mapping of attribute name to value.
        :return: The same ``attrs`` object that was passed in.
        """
        if not attrs or not self.cdata_list_attributes:
            return attrs

        universal = self.cdata_list_attributes.get('*', [])
        tag_specific = self.cdata_list_attributes.get(
            tag_name.lower(), None)
        for attr in list(attrs.keys()):
            if attr in universal or (tag_specific and attr in tag_specific):
                # We have a "class"-type attribute whose string
                # value is a whitespace-separated list of
                # values. Split it into a list.
                value = attrs[attr]
                if isinstance(value, str):
                    values = whitespace_re.split(value)
                else:
                    # html5lib sometimes calls setAttributes twice
                    # for the same tag when rearranging the parse
                    # tree. On the second call the attribute value
                    # here is already a list. If this happens,
                    # leave the value alone rather than trying to
                    # split it again.
                    values = value
                attrs[attr] = values
        return attrs


class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events."""

    def feed(self, markup):
        """Not implemented here.

        :raises NotImplementedError: always.
        """
        raise NotImplementedError()

    def close(self):
        pass

    def startElement(self, name, attrs):
        """Forward a start-element event to the soup object.

        Each attribute key is replaced by its element at index 1.
        NOTE(review): presumably the keys are (namespace, local name)
        pairs as supplied by a namespace-aware SAX parser -- confirm.
        """
        attrs = {key[1]: value for key, value in attrs.items()}
        self.soup.handle_starttag(name, attrs)

    def endElement(self, name):
        """Forward an end-element event to the soup object."""
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        # Ignore the prefix for now.
        pass

    def characters(self, content):
        """Forward character data to the soup object."""
        self.soup.handle_data(content)

    def startDocument(self):
        pass

    def endDocument(self):
        pass


class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    preserve_whitespace_tags = {'pre', 'textarea'}
    empty_element_tags = {'br', 'hr', 'input', 'img', 'meta',
                          'spacer', 'link', 'frame', 'base'}

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'.  When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    cdata_list_attributes = {
        "*": ['class', 'accesskey', 'dropzone'],
        "a": ['rel', 'rev'],
        "link": ['rel', 'rev'],
        "td": ["headers"],
        "th": ["headers"],
        "form": ["accept-charset"],
        "object": ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area": ["rel"],
        "icon": ["sizes"],
        "iframe": ["sandbox"],
        "output": ["for"],
        }

    def set_up_substitutions(self, tag):
        """Replace the encoding-bearing attribute of a <meta> tag.

        :param tag: The tag to inspect; only tags named 'meta' are
            touched.
        :return: True if the tag had a "charset" attribute (HTML 5
            style), False otherwise.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        meta_encoding = None
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            meta_encoding = charset
            tag['charset'] = CharsetMetaAttributeValue(charset)

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            #
            # NOTE(review): this branch installs a substitution but
            # leaves meta_encoding as None, so the method returns False
            # here. Behavior kept as-is; confirm whether any caller
            # relies on the return value.
            tag['content'] = ContentMetaAttributeValue(content)

        return (meta_encoding is not None)


def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module.

    Every name in ``module.__all__`` that refers to a TreeBuilder
    subclass is set as an attribute of this module, appended to this
    module's ``__all__`` and registered with ``builder_registry``.
    Names that refer to anything else are skipped.
    """
    # Look this module up under the name it was actually imported as,
    # rather than a hard-coded 'bs4.builder', so the package keeps
    # working when it is vendored under a different dotted path.
    this_module = sys.modules[__name__]
    for name in module.__all__:
        obj = getattr(module, name)

        # issubclass() raises TypeError for non-classes, so make sure we
        # are looking at a class before asking.
        if isinstance(obj, type) and issubclass(obj, TreeBuilder):
            setattr(this_module, name, obj)
            this_module.__all__.append(name)
            # Register the builder while we're at it.
            this_module.builder_registry.register(obj)


class ParserRejectedMarkup(Exception):
    """Exception named for a parser rejecting the markup it was given.

    Nothing in this module raises it.
    """
    pass


# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass