## HTMLTidy.py, v0.3
## Copyright (C) 2002, 2003, 2004 Gary Benson
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.

"""Ultra-liberal HTML to XHTML convertor.

Use it like this:

    import HTMLTidy
    xhtml = HTMLTidy.htmlTidy(html)

It is designed to always produce some kind of output even for the most
bizarrely malformed input, and so that multiple passes do not change
the data (ie that htmlTidy(x) == htmlTidy(htmlTidy(x)) for all x).  If
it fails on either of these counts then I'd like to know about it.

In addition to the normal HTML conventions blank lines are considered
as new paragraphs except within <pre> blocks, and the characters '<',
'>' and '&' will be treated as text unless in context.  The SGML
convention of using '</>' to close the last open tag is also
supported.

Its response to transposed tags (like '<em>foo<strong>bar</em></strong>')
leaves something to be desired.
"""

import os
import re
import sgmllib
import string

##############################################################################

# Inline tags are those that may be freely mixed with text (well,
# CDATA. Whatever...).  In this file we refer to a mixture of inline
# tags and text as inline data.
inline_tags = ("a", "code", "em", "strong", "img", "br", "span",
               "sub", "sup", "input")

# Flow tags may contain inline data and only inline data.
flow_tags = ("p", "pre")

# List tags contain only block tags.  (Actually, they should contain
# only <li> tags but we don't check for this.)
list_tags = ("ol", "ul")

# Block tags may contain other block tags, list tags, and inline data
# (which will have <p> tags inserted around it (TODO don't insert <p>
# tags when not necessary; perhaps another callback to remove <p> tags
# when a block tag contains only one child)).
block_tags = ("blockquote", "li", "div", "form", "table", "tr", "td")

# The following tags are always empty, and the trailing slash may be
# omitted.
empty_tags = ("img", "br", "input")

# All entities not in this list will be converted to character references.
allowed_entities = ("quot", "amp", "lt", "gt", "apos")

# Files containing entity definitions.  These should be downloaded
# from http://www.w3.org/TR/xhtml1/DTD/ and placed in the same
# directory as this file.
entity_def_files = ("xhtml-lat1.ent", "xhtml-special.ent", "xhtml-symbol.ent")

##############################################################################

class MungeError(Exception):
    """Something went seriously wrong."""
    pass

# Part one of the HTML tidying process mirrors the function of
# mod_virgule's nice_htext().  It is basically a very liberal HTML
# parser.

format_tags = block_tags + list_tags + flow_tags
allowed_tags = format_tags + inline_tags

# All three patterns are applied to the text that FOLLOWS the character
# which triggered them ('&' or '<'), and capture the unparsed remainder
# in their last group.
entity_or_ref_re = re.compile(r"^(#(\d+|x[\dA-Fa-f]+)|[A-Za-z0-9]+);(.*)", re.S)
tag_re = re.compile(r"""^(/)?(%s)((\s+\w+\s*=(("[^"]*")|('[^']*')))*\s*)(/)?>(.*)$""" % \
                    "|".join(allowed_tags), re.S) # ' doh emacs!
attr_re = re.compile(r"""^\s+(\w+)\s*=(("[^"]*")|('[^']*'))(.*)$""", re.S) # '

# Characters (other than '&', which needs entity handling) that must not
# appear literally inside the double-quoted attributes we serialize.
_attr_escapes = {"<": "&lt;", ">": "&gt;", '"': "&quot;"}

def _escape_attr_value(value, convert_hex_refs):
    """Return VALUE (an attribute value without its quotes) escaped so
    that it can be written between double quotes.

    Entities and character references are passed through
    rewriteEntityOrRef(); a bare '&' becomes '&amp;'; '<', '>' and '"'
    become entities, since the serializer always uses double quotes
    even when the input used single ones."""
    pieces = []
    while value:
        c = value[0]
        value = value[1:]
        if c == "&":
            m = entity_or_ref_re.search(value)
            if m:
                ent, value = m.group(1, 3)
                pieces.append(rewriteEntityOrRef(ent, convert_hex_refs))
            else:
                pieces.append("&amp;")
        else:
            pieces.append(_attr_escapes.get(c, c))
    return "".join(pieces)

def _parse_attrs(attrs, convert_hex_refs):
    """Turn the attribute text captured by tag_re into a dictionary
    mapping attribute names to escaped values.

    Raises MungeError if the text cannot be parsed, which should not
    happen for text that tag_re accepted."""
    parsed = {}
    # tag_re's attribute group also swallows whitespace before '>' or
    # '/>' (as in '<br />'), so stop once only whitespace is left
    # rather than when the string is empty.
    while attrs.strip():
        m = attr_re.search(attrs)
        if not m:
            raise MungeError("attrs='%s'" % attrs)
        name, value, attrs = m.group(1, 2, 5)
        parsed[name] = _escape_attr_value(value[1:-1], convert_hex_refs)
    return parsed

def saxParseHtml(cb, text, convert_hex_refs = 0):
    """Turn some html-ish text into SAX events, dealing with unquoted
    special characters and inserting paragraph tags wherever there
    is a blank line (except within a pre).

    It does basically what mod_virgule's nice_htext() function does,
    albeit with more regular expressions and less fuss.

    CB receives startDocument(), endDocument(), startElement(tag,
    attrs), endElement(tag) and characters(data) calls.  The strings
    passed to characters() and the attribute values are already
    escaped.  CONVERT_HEX_REFS is handed to rewriteEntityOrRef().
    Only tags in allowed_tags are recognised; anything else that looks
    like markup is escaped and treated as text."""
    buf = ""     # character data not yet passed to cb
    stack = []   # tags currently open, innermost last
    cb.startDocument()
    while text:
        c = text[0]
        text = text[1:]
        if c == "\n":
            if buf and buf[-1] == "\n" and "pre" not in stack:
                # A blank line outside <pre>: mark the paragraph break
                # with an empty <p> element.
                cb.characters(buf)
                cb.startElement("p", {})
                cb.endElement("p")
                cb.characters("\n")
                buf = ""
            else:
                buf = buf + "\n"
        elif c == "&":
            m = entity_or_ref_re.search(text)
            if m:
                ent, text = m.group(1, 3)
                buf = buf + rewriteEntityOrRef(ent, convert_hex_refs)
            else:
                buf = buf + "&amp;"
        elif c == "<":
            m = tag_re.search(text)
            if m:
                preslash, tag, attrs, postslash, rest = m.group(1, 2, 3, 8, 9)
                if preslash and (attrs or postslash):
                    # This end tag is malformed
                    buf = buf + "&lt;"
                else:
                    # Clear any unterminated <p> and <li> tags
                    if preslash:
                        if tag in block_tags:
                            # The stack may be empty here if the end
                            # tag has no matching start tag at all.
                            while stack and stack[-1] == "p":
                                cb.endElement("p")
                                del stack[-1]
                        if tag in ("ol", "ul") and stack and stack[-1] == "li":
                            if buf:
                                cb.characters(buf)
                                buf = ""
                            cb.endElement("li")
                            del stack[-1]
                    elif tag == "li":
                        # Look for an open <li> with nothing but <p>
                        # elements above it; a new <li> closes it.
                        li_index = None
                        for idx in range(len(stack) - 1, -1, -1):
                            if stack[idx] == "li":
                                li_index = idx
                                break
                            if stack[idx] != "p":
                                break
                        if li_index is not None:
                            if buf:
                                cb.characters(buf)
                                buf = ""
                            while len(stack) > li_index:
                                cb.endElement(stack[-1])
                                del stack[-1]

                    if preslash and (not stack or tag != stack[-1]):
                        # This end tag is misplaced
                        buf = buf + "&lt;"
                    else:
                        if buf:
                            cb.characters(buf)
                            buf = ""
                        if not preslash:
                            cb.startElement(
                                tag, _parse_attrs(attrs, convert_hex_refs))
                            if postslash or tag in empty_tags:
                                cb.endElement(tag)
                            else:
                                stack.append(tag)
                        else:
                            cb.endElement(tag)
                            del stack[-1] # checked earlier
                        # Only a tag that was actually consumed moves
                        # the input on; rejected ones are re-read as
                        # text after the escaped '<'.
                        text = rest
            elif text[:2] == "/>" and stack:
                # SGML-style closing tag
                text = text[2:]
                if buf:
                    cb.characters(buf)
                    buf = ""
                cb.endElement(stack[-1])
                del stack[-1]
            else:
                buf = buf + "&lt;"
        elif c == ">":
            buf = buf + "&gt;"
        elif c != "\r":
            buf = buf + c
    if buf:
        cb.characters(buf)
    # End tags for anything still open were omitted
    while stack:
        cb.endElement(stack[-1])
        del stack[-1]
    cb.endDocument()

# Part two of the HTML tidying process is what was the HTMLDigester
# class from my old diary munger.  It converts a stream of SAX events
# with misplaced <p> elements into a stream of SAX elements with
# correctly placed <p> elements.

class ParaTidyCallback:
    """SAX callback to correct the placement of <p> elements.

    Events are forwarded to the callback given to the constructor.
    Incoming <p> start and end events only ever close a paragraph;
    paragraphs are opened when text or an inline tag arrives somewhere
    that needs one.  MungeError is raised for content that cannot be
    placed, such as text directly inside a list."""

    def __init__(self, cb):
        self.cb = cb

    def startDocument(self):
        # The None at the bottom of the stack stands for the document.
        self.stack = [None]
        self.buf = ""
        self.cb.startDocument()

    def endDocument(self):
        self.flush_text_buffer()
        if self.stack == [None, "p"]:
            self.endElement("p")
        if self.stack != [None]:
            raise MungeError("stack=%s" % (self.stack,))
        del self.stack
        del self.buf
        self.cb.endDocument()

    def startElement(self, tag, attrs):
        if tag in format_tags:
            self.push_format_tag(tag, attrs)
        else:
            self.push_inline_tag(tag, attrs)

    def endElement(self, tag):
        if tag in format_tags:
            self.pop_format_tag(tag)
        else:
            self.pop_inline_tag(tag)

    def characters(self, data):
        self.push_text(data)

    def push_text(self, text):
        """Buffer TEXT, opening a <p> first if non-blank text arrives
        directly inside the document or a block tag."""
        current = self.stack[-1]
        if current in (None,) + block_tags:
            if text.strip():
                self.real_push_format_tag("p", {})
        elif current in list_tags:
            if text.strip():
                raise MungeError("'%s' in %s" % (text, current))
        elif current in flow_tags:
            pass
        else:
            raise MungeError("unknown element %s in stack" % current)
        self.buf = self.buf + text

    def flush_text_buffer(self):
        """Emit the buffered text.  Whitespace directly inside a block
        or list tag is dropped; anything else there is an error."""
        if self.buf:
            current = self.stack[-1]
            if current in block_tags + list_tags:
                if self.buf.strip():
                    raise MungeError("'%s' in %s" % (self.buf, current))
            else:
                self.cb.characters(self.buf)
            self.buf = ""

    def push_inline_tag(self, tag, attrs):
        """Start an inline element, opening a <p> around it if needed.
        Inline tags are not recorded on the stack."""
        self.flush_text_buffer()
        current = self.stack[-1]
        if current in (None,) + block_tags:
            self.real_push_format_tag("p", {})
        elif current in list_tags:
            raise MungeError("'<%s>' in %s" % (tag, current))
        elif current in flow_tags:
            pass
        else:
            raise MungeError("unknown element %s in stack" % current)
        self.cb.startElement(tag, attrs)

    def pop_inline_tag(self, tag):
        self.flush_text_buffer()
        current = self.stack[-1]
        if current in block_tags + list_tags:
            raise MungeError("'</%s>' in %s" % (tag, current))
        self.cb.endElement(tag)

    def push_format_tag(self, tag, attrs):
        """Start a format element, closing any open <p> first.  An
        incoming <p> is never opened here (see push_text)."""
        current = self.stack[-1]
        if current == "p":
            if tag in format_tags:
                self.pop_format_tag("p")
            else:
                raise MungeError("unknown tag %s in stack" % tag)
        if tag != "p":
            self.real_push_format_tag(tag, attrs)

    def pop_format_tag(self, tag):
        current = self.stack[-1]
        if current == "p" and tag != "p":
            # Close the implicit paragraph, then the element itself.
            self.real_pop_format_tag("p")
            self.real_pop_format_tag(tag)
        elif current != "p" and tag == "p":
            # No paragraph is open, so there is nothing to close.
            pass
        elif current != tag:
            raise MungeError("mismatched tags (<%s> and </%s>)"
                             % (current, tag))
        else:
            self.real_pop_format_tag(tag)

    def real_push_format_tag(self, tag, attrs):
        self.flush_text_buffer()
        self.cb.startElement(tag, attrs)
        self.stack.append(tag)

    def real_pop_format_tag(self, tag):
        self.flush_text_buffer()
        self.cb.endElement(tag)
        del self.stack[-1]

# Finally, the SAX events are serialized into XML

class SerializeCallback:
    """SAX callback to serialize a stream of events.  Empty tags are
    collapsed in an XML-stylee.

    The result accumulates in the 'xml' attribute.  Character data and
    attribute values are written as received, so they must already be
    escaped."""

    def startDocument(self):
        self.xml = ""
        # (tag, attrs) of a start tag that has not been written yet;
        # it is held back so an immediate end tag can collapse it.
        self.waiting = None

    def endDocument(self):
        self.flushWait()
        del self.waiting

    def flushWait(self, empty = 0):
        """Write out the pending start tag, if any, as an empty
        element when EMPTY is true."""
        if self.waiting:
            tag, attrs = self.waiting
            self.xml = self.xml + "<" + tag
            # Sorted so that the output does not depend on dictionary
            # ordering and stays the same on a second pass.
            for attr in sorted(attrs.keys()):
                self.xml = self.xml + ' %s="%s"' % (attr, attrs[attr])
            if empty:
                self.xml = self.xml + "/"
            self.xml = self.xml + ">"
            self.waiting = None

    def startElement(self, tag, attrs):
        self.flushWait()
        self.waiting = (tag, attrs)

    def endElement(self, tag):
        if self.waiting and self.waiting[0] == tag:
            self.flushWait(1)
        else:
            self.flushWait()
            self.xml = self.xml + "</%s>" % tag

    def characters(self, data):
        self.flushWait()
        self.xml = self.xml + data

# Wrapper function for the two stages above

def htmlTidy(html):
    """Convert some messy HTML into XHTML 1.0, inserting new paragraphs
    wherever there are blank lines along the way.

    Multiple invocations should not change the text at all, though
    this isn't the case if tags are in the wrong order.

    Returns the XHTML as a string.  May raise MungeError, and an I/O
    error if an entity has to be looked up and the entity definition
    files are missing."""
    serializer = SerializeCallback()
    saxParseHtml(ParaTidyCallback(serializer), html)
    return serializer.xml

# Code to convert entities to character references, and optionally to
# convert hexadecimal character references to decimal ones

class EntitySetParser(sgmllib.SGMLParser):
    """A class which extracts the entity declarations from a DTD.

    After feed() and close() the 'entities' attribute maps each entity
    name to the integer from its decimal character reference."""

    entity_decl_re = re.compile(r'^ENTITY\s+([A-Za-z0-9]+)\s+"&#(\d+);"$')

    def __init__(self):
        self.entities = {}
        sgmllib.SGMLParser.__init__(self)

    def unknown_decl(self, data):
        m = self.entity_decl_re.search(data)
        if m:
            self.entities[m.group(1)] = int(m.group(2))

def _load_entity_table():
    """Parse every file in entity_def_files (looked for next to this
    module) and return the combined name -> number dictionary."""
    try:
        path = os.path.split(__file__)[0]
    except NameError:
        path = ""
    table = {}
    for filename in entity_def_files:
        handle = open(os.path.join(path, filename), "r")
        try:
            data = handle.read()
        finally:
            handle.close()
        parser = EntitySetParser()
        parser.feed(data)
        parser.close()
        table.update(parser.entities)
    return table

def rewriteEntityOrRef(ent, convert_hex = 0):
    """Convert entities to character references, and optionally
    convert hexadecimal character references to decimal ones.

    ENT is the text between '&' and ';'.  Entities in allowed_entities
    and character references are returned unchanged (hexadecimal ones
    are made decimal if CONVERT_HEX is true).  Other entities become
    decimal character references if the entity definition files know
    them, and otherwise have their '&' escaped so that they show up
    as text.  The definition files are read the first time they are
    needed; if that fails the error propagates and the next call
    tries again."""
    global entities
    if ent[:1] != "#" and ent not in allowed_entities:
        if entities is None:
            # Assign only once everything has loaded, so a failure
            # cannot leave an empty or partial table cached.
            entities = _load_entity_table()
        if ent in entities:
            ent = "#%d" % entities[ent]
        else:
            ent = "amp;" + ent
    elif convert_hex and ent[:2] == "#x":
        ent = "#%d" % int(ent[2:], 16)
    return "&" + ent + ";"

# Cache for the entity table; None until the first lookup needs it.
entities = None

if __name__ == "__main__":
    # Test code: tidy stdin, then tidy the result again and show the
    # second result only if it differs from the first.
    import sys
    before = sys.stdin.read()
    print(repr(before))
    print("=====")
    after = htmlTidy(before)
    print(repr(after))
    after2 = htmlTidy(after)
    if after != after2:
        print("=====")
        print(repr(after2))