from bs4.dammit import EntitySubstitution


class Formatter(EntitySubstitution):
    """Describes a strategy to use when outputting a parse tree to a string.

    Some parts of this strategy come from the distinction between
    HTML4, HTML5, and XML. Others are configurable by the user.

    Formatters are passed in as the `formatter` argument to methods
    like `PageElement.encode`. Most people won't need to think about
    formatters, and most people who need to think about them can pass
    in one of these predefined strings as `formatter` rather than
    making a new Formatter object:

    For HTML documents:
     * 'html' - HTML entity substitution for generic HTML documents. (default)
     * 'html5' - HTML entity substitution for HTML5 documents, as
                 well as some optimizations in the way tags are rendered.
     * 'minimal' - Only make the substitutions necessary to guarantee
                   valid HTML.
     * None - Do not perform any substitution. This will be faster
              but may result in invalid markup.

    For XML documents:
     * 'html' - Entity substitution for XHTML documents.
     * 'minimal' - Only make the substitutions necessary to guarantee
                   valid XML. (default)
     * None - Do not perform any substitution. This will be faster
              but may result in invalid markup.
    """
    # Registries of XML and HTML formatters.
    # NOTE(review): nothing in this module reads these two dicts; the
    # subclasses below keep their own `REGISTRY`. Presumably retained for
    # backward compatibility -- confirm before removing.
    XML_FORMATTERS = {}
    HTML_FORMATTERS = {}

    HTML = 'html'
    XML = 'xml'

    # Per-keyword defaults applied when the language is not XML.
    # NOTE: `_default` hands out this very set object, so every formatter
    # that falls back to the default shares it. Treat it as read-only.
    HTML_DEFAULTS = dict(
        cdata_containing_tags={"script", "style"},
    )

    def _default(self, language, value, kwarg):
        """Resolve the effective value of a constructor argument.

        :param language: The language the formatter is being built for.
        :param value: The value the caller supplied, or None.
        :param kwarg: The name of the argument, used as a key into
            HTML_DEFAULTS.
        :return: `value` if it is not None; otherwise an empty set for
            XML, or the HTML_DEFAULTS entry for any other language.
        """
        if value is not None:
            return value
        if language == self.XML:
            return set()
        return self.HTML_DEFAULTS[kwarg]

    def __init__(
            self, language=None, entity_substitution=None,
            void_element_close_prefix='/', cdata_containing_tags=None,
            empty_attributes_are_booleans=False, indent=1,
    ):
        r"""Constructor.

        :param language: This should be Formatter.XML if you are formatting
           XML markup and Formatter.HTML if you are formatting HTML markup.

        :param entity_substitution: A function to call to replace special
           characters with XML/HTML entities. For examples, see
           bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
        :param void_element_close_prefix: By default, void elements
           are represented as <tag/> (XML rules) rather than <tag>
           (HTML rules). To get <tag>, pass in the empty string.
        :param cdata_containing_tags: The list of tags that are defined
           as containing CDATA in this dialect. For example, in HTML,
           <script> and <style> tags are defined as containing CDATA,
           and their contents should not be formatted.
        :param empty_attributes_are_booleans: Render attributes whose value
            is the empty string as HTML-style boolean attributes.
            (Attributes whose value is None are always rendered this way.)

        :param indent: If indent is a non-negative integer or string,
            then the contents of elements will be indented
            appropriately when pretty-printing. An indent level of 0,
            negative, or "" will only insert newlines. Using a
            positive integer indent indents that many spaces per
            level. If indent is a string (such as "\t"), that string
            is used to indent each level. The default behavior is to
            indent one space per level. None is treated as 0; any
            other type falls back to a single space.
        """
        self.language = language
        self.entity_substitution = entity_substitution
        self.void_element_close_prefix = void_element_close_prefix
        self.cdata_containing_tags = self._default(
            language, cdata_containing_tags, 'cdata_containing_tags'
        )
        self.empty_attributes_are_booleans = empty_attributes_are_booleans

        # Normalize `indent` to the literal string emitted per level.
        if indent is None:
            indent = 0
        if isinstance(indent, int):
            # Negative counts are clamped to zero (no indentation).
            indent = ' ' * max(indent, 0)
        elif not isinstance(indent, str):
            # Unsupported type: fall back to one space per level.
            indent = ' '
        self.indent = indent

    def substitute(self, ns):
        """Process a string that needs to undergo entity substitution.
        This may be a string encountered in an attribute value or as
        text.

        :param ns: A string.
        :return: A string with certain characters replaced by named
           or numeric entities. The string is returned unchanged if
           this formatter has no entity substitution function, or if
           it is a NavigableString whose parent tag is one of
           `cdata_containing_tags`.
        """
        if not self.entity_substitution:
            return ns
        # Imported here rather than at module level -- presumably to
        # avoid a circular import with bs4.element (TODO confirm).
        from .element import NavigableString
        if (isinstance(ns, NavigableString)
            and ns.parent is not None
            and ns.parent.name in self.cdata_containing_tags):
            # Do nothing.
            return ns
        # Substitute.
        return self.entity_substitution(ns)

    def attribute_value(self, value):
        """Process the value of an attribute.

        :param value: A string.
        :return: A string with certain characters replaced by named
           or numeric entities.
        """
        return self.substitute(value)

    def attributes(self, tag):
        """Reorder a tag's attributes however you want.

        By default, attributes are sorted alphabetically. This makes
        behavior consistent between Python 2 and Python 3, and preserves
        backwards compatibility with older versions of Beautiful Soup.

        If `empty_attributes_are_booleans` is True, then attributes whose
        values are set to the empty string will be treated as boolean
        attributes.

        :param tag: An object with an `attrs` mapping (or None).
        :return: A sorted list of (name, value) 2-tuples; an empty list
           if `tag.attrs` is None. Empty-string values are replaced
           with None when `empty_attributes_are_booleans` is set.
        """
        if tag.attrs is None:
            return []
        return sorted(
            (k, (None if self.empty_attributes_are_booleans and v == '' else v))
            for k, v in tag.attrs.items()
        )


class HTMLFormatter(Formatter):
    """A generic Formatter for HTML."""
    # Maps a formatter name (or None) to a predefined HTMLFormatter.
    REGISTRY = {}

    def __init__(self, *args, **kwargs):
        """Build a Formatter whose language is fixed to Formatter.HTML.

        All other arguments are forwarded to Formatter.__init__.
        """
        super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)


class XMLFormatter(Formatter):
    """A generic Formatter for XML."""
    # Maps a formatter name (or None) to a predefined XMLFormatter.
    REGISTRY = {}

    def __init__(self, *args, **kwargs):
        """Build a Formatter whose language is fixed to Formatter.XML.

        All other arguments are forwarded to Formatter.__init__.
        """
        super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)


# Set up aliases for the default formatters.
HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html
)
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html,
    void_element_close_prefix=None,
    empty_attributes_are_booleans=True,
)
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_xml
)
HTMLFormatter.REGISTRY[None] = HTMLFormatter(
    entity_substitution=None
)
XMLFormatter.REGISTRY["html"] = XMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html
)
XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
    entity_substitution=EntitySubstitution.substitute_xml
)
# Previously this was Formatter(Formatter(Formatter.XML, ...)), which passed
# a Formatter instance as `language` and so picked up the HTML CDATA
# defaults. Build a real XMLFormatter like the other XML registry entries.
XMLFormatter.REGISTRY[None] = XMLFormatter(
    entity_substitution=None
)