blocks, and the characters '<', '>' and '&' will be treated as text unless in context. The SGML convention of using '>' to close the last open tag is also supported. Its response to transposed tags (like 'foobar') leaves something to be desired. """ import os import re import sgmllib import string ############################################################################## # Inline tags are those that may be freely mixed with text (well, # CDATA. Whatever...). In this file we refer to a mixture of inline # tags and text as inline data. inline_tags = ("a", "code", "em", "strong", "img", "br", "span", "sub", "sup", "input") # Flow tags may contain inline data and only inline data. flow_tags = ("p", "pre") # List tags contain only block tags. (Actually, they should contain # only
# [...] <p> tags inserted around it (TODO don't insert <p>
# tags when not necessary; perhaps another callback to remove <p>
tags # when a block tag contains only one child)). block_tags = ("blockquote", "li", "div", "form", "table", "tr", "td") # The following tags are always empty, and the trailing slash may be # omitted. empty_tags = ("img", "br", "input") # All entities not in this list will be converted to character references. allowed_entities = ("quot", "amp", "lt", "gt", "apos") # Files containing entity definitions. These should be downloaded # from http://www.w3.org/TR/xhtml1/DTD/ and placed in the same # directory as this file. entity_def_files = ("xhtml-lat1.ent", "xhtml-special.ent", "xhtml-symbol.ent") ############################################################################## class MungeError(Exception): """Something went seriously wrong.""" pass # Part one of the HTML tidying process mirrors the function of # mod_virgule's nice_htext(). It is basically a very liberal HTML # parser. format_tags = block_tags + list_tags + flow_tags allowed_tags = format_tags + inline_tags entity_or_ref_re = re.compile(r"^(#(\d+|x[\dA-Fa-f]+)|[A-Za-z0-9]+);(.*)", re.S) tag_re = re.compile(r"""^(/)?(%s)((\s+\w+\s*=(("[^"]*")|('[^']*')))*\s*)(/)?>(.*)$""" % \ string.join(allowed_tags, "|"), re.S) # ' doh emacs! attr_re = re.compile(r"""^\s+(\w+)\s*=(("[^"]*")|('[^']*'))(.*)$""", re.S) # ' def saxParseHtml(cb, text, convert_hex_refs = 0): """Turn some html-ish text into SAX events, dealing with unquoted special characters and inserting paragraph tags wherever there is a blank line (except within a pre). 
It does basically what mod_virgule's nice_htext() function does, albeit with more regular expressions and less fuss.""" buf = "" stack = [] cb.startDocument() while text: c = text[0] text = text[1:] if c == "\n": if buf and buf[-1] == "\n" and not "pre" in stack: cb.characters(buf) cb.startElement("p", {}) cb.endElement("p") cb.characters("\n") buf = "" else: buf = buf + "\n" elif c == "&": m = entity_or_ref_re.search(text) if m: ent, text = m.group(1, 3) buf = buf + rewriteEntityOrRef(ent, convert_hex_refs) else: buf = buf + "&" elif c == "<": m = tag_re.search(text) if m: preslash, tag, attrs, postslash, rest = m.group(1, 2, 3, 8, 9) if preslash and (attrs or postslash): # This end tag is malformed buf = buf + "<" else: # Clear any unterminated
and
elements into a stream of SAX elements with # correctly placed
# <p> elements.

class ParaTidyCallback:
    """SAX callback to correct the placement of <p> elements.

    Wraps another callback ``cb`` and forwards events to it.  It keeps a
    stack of the format tags that are currently open (with None as the
    bottom-of-stack marker) and a buffer of text not yet forwarded.

    * Text or an inline tag arriving at the top level or directly inside
      a block tag causes a <p> to be opened first.
    * An open <p> is closed before any other format tag is opened or
      closed.
    * An explicit <p> start event never opens a paragraph by itself, and
      an explicit <p> end event is ignored when no <p> is open.
    * Whitespace-only text directly inside a block or list tag is
      dropped.

    Raises MungeError when non-blank text or an inline tag appears
    directly inside a list tag, when end tags do not match the stack, or
    when the stack holds an element that is none of the known kinds.
    """

    def __init__(self, cb):
        """Remember ``cb``, the callback that receives the tidied events."""
        self.cb = cb

    def startDocument(self):
        """Reset the tag stack and text buffer, then forward the event."""
        self.stack = [None]
        self.buf = ""
        self.cb.startDocument()

    def endDocument(self):
        """Flush pending text, close a trailing <p>, and forward the event.

        Raises MungeError if any other element is still open.
        """
        self.flush_text_buffer()
        if self.stack == [None, "p"]:
            self.endElement("p")
        if self.stack != [None]:
            raise MungeError("stack=%s" % (self.stack,))
        del self.stack
        del self.buf
        self.cb.endDocument()

    def startElement(self, tag, attrs):
        """Dispatch a start tag to the format or inline handler."""
        if tag in format_tags:
            self.push_format_tag(tag, attrs)
        else:
            self.push_inline_tag(tag, attrs)

    def endElement(self, tag):
        """Dispatch an end tag to the format or inline handler."""
        if tag in format_tags:
            self.pop_format_tag(tag)
        else:
            self.pop_inline_tag(tag)

    def characters(self, data):
        """Buffer character data (see push_text)."""
        self.push_text(data)

    def push_text(self, text):
        """Append ``text`` to the buffer, opening a <p> first if needed."""
        current = self.stack[-1]
        if current in (None,) + block_tags:
            # Only real text opens a paragraph; blank text is just buffered.
            if text.strip():
                self.real_push_format_tag("p", {})
        elif current in list_tags:
            if text.strip():
                raise MungeError("'%s' in %s" % (text, current))
        elif current in flow_tags:
            pass
        else:
            raise MungeError("unknown element %s in stack" % current)
        self.buf = self.buf + text

    def flush_text_buffer(self):
        """Forward the buffered text to ``cb`` and empty the buffer.

        Directly inside a block or list tag the buffer is discarded
        instead; it is an error for it to hold non-blank text there.
        """
        if self.buf:
            current = self.stack[-1]
            if current in block_tags + list_tags:
                if self.buf.strip():
                    raise MungeError("'%s' in %s" % (self.buf, current))
            else:
                self.cb.characters(self.buf)
            self.buf = ""

    def push_inline_tag(self, tag, attrs):
        """Forward an inline start tag, opening a <p> first if needed."""
        self.flush_text_buffer()
        current = self.stack[-1]
        if current in (None,) + block_tags:
            self.real_push_format_tag("p", {})
        elif current in list_tags:
            raise MungeError("'<%s>' in %s" % (tag, current))
        elif current in flow_tags:
            pass
        else:
            raise MungeError("unknown element %s in stack" % current)
        self.cb.startElement(tag, attrs)

    def pop_inline_tag(self, tag):
        """Forward an inline end tag; it must not sit in a block or list."""
        self.flush_text_buffer()
        current = self.stack[-1]
        if current in block_tags + list_tags:
            raise MungeError("'</%s>' in %s" % (tag, current))
        self.cb.endElement(tag)

    def push_format_tag(self, tag, attrs):
        """Handle a format start tag, closing any open <p> beforehand."""
        current = self.stack[-1]
        if current == "p":
            if tag in format_tags:
                self.pop_format_tag("p")
            else:
                raise MungeError("unknown tag %s in stack" % tag)
        if tag != "p":
            # An explicit <p> is not opened here; push_text and
            # push_inline_tag open one once there is content for it.
            self.real_push_format_tag(tag, attrs)

    def pop_format_tag(self, tag):
        """Handle a format end tag, closing an implicit <p> if one is open."""
        current = self.stack[-1]
        if current == "p" and tag != "p":
            # NOTE(review): the element underneath the <p> is popped
            # without checking that it really is ``tag``.
            self.real_pop_format_tag("p")
            self.real_pop_format_tag(tag)
        elif current != "p" and tag == "p":
            # No paragraph is open, so there is nothing to close.
            pass
        elif current != tag:
            raise MungeError("mismatched tags (<%s> and </%s>)"
                             % (current, tag))
        else:
            self.real_pop_format_tag(tag)

    def real_push_format_tag(self, tag, attrs):
        """Flush text, forward the start tag and record it on the stack."""
        self.flush_text_buffer()
        self.cb.startElement(tag, attrs)
        self.stack.append(tag)

    def real_pop_format_tag(self, tag):
        """Flush text, forward the end tag and drop the top of the stack."""
        self.flush_text_buffer()
        self.cb.endElement(tag)
        del self.stack[-1]


# Finally, the SAX events are serialized into XML

class SerializeCallback:
    """SAX callback to serialize a stream of events.

    The markup accumulates in ``self.xml``.  A start tag is held back in
    ``self.waiting`` until the next event; if that event is the matching
    end tag the element is written in the collapsed form ``<tag/>``.
    Attribute values and character data are written exactly as received.
    """

    def startDocument(self):
        """Start with empty output and no held-back start tag."""
        self.xml = ""
        self.waiting = None

    def endDocument(self):
        """Write out any held-back start tag."""
        self.flushWait()
        del self.waiting

    def flushWait(self, empty=0):
        """Write the held-back start tag, if any.

        If ``empty`` is true the tag is written as ``<tag .../>``.
        """
        if self.waiting:
            tag, attrs = self.waiting
            self.xml = self.xml + "<" + tag
            for attr in attrs.keys():
                self.xml = self.xml + ' %s="%s"' % (attr, attrs[attr])
            if empty:
                self.xml = self.xml + "/"
            self.xml = self.xml + ">"
            self.waiting = None

    def startElement(self, tag, attrs):
        """Hold the start tag back until the next event is known."""
        self.flushWait()
        self.waiting = (tag, attrs)

    def endElement(self, tag):
        """Write an end tag, or collapse the held-back start tag."""
        if self.waiting and self.waiting[0] == tag:
            self.flushWait(1)
        else:
            self.flushWait()
            self.xml = self.xml + "</%s>" % tag

    def characters(self, data):
        """Append character data to the output unchanged."""
        self.flushWait()
        self.xml = self.xml + data


# Wrapper function for the two stages above

def htmlTidy(html):
    """Convert some messy HTML into XHTML 1.0, inserting new paragraphs
    wherever there are blank lines along the way.

    Multiple invocations should not change the text at all, though this
    isn't the case if tags are in the wrong order.

    ``html`` is passed to saxParseHtml(), whose events go through a
    ParaTidyCallback into a SerializeCallback; the serialized string is
    returned.
    """
    serializer = SerializeCallback()
    saxParseHtml(ParaTidyCallback(serializer), html)
    return serializer.xml


# Code to convert entities to character references, and optionally to
# convert hexadecimal character references to decimal ones

class EntitySetParser(sgmllib.SGMLParser):
    """A class which extracts the entity declarations from a DTD.

    After feeding a DTD, ``self.entities`` maps each entity name that was
    declared as a decimal character reference to its integer code.
    """

    # Matches the text of a declaration such as: ENTITY nbsp "&#160;"
    entity_decl_re = re.compile(r'^ENTITY\s+([A-Za-z0-9]+)\s+"&#(\d+);"$')

    def __init__(self):
        self.entities = {}
        sgmllib.SGMLParser.__init__(self)

    def unknown_decl(self, data):
        """Record ``data`` if it is an entity declaration we understand."""
        m = self.entity_decl_re.search(data)
        if m:
            self.entities[m.group(1)] = int(m.group(2))


def rewriteEntityOrRef(ent, convert_hex=0):
    """Convert entities to character references, and optionally convert
    hexadecimal character references to decimal ones.

    ``ent`` is the text between the '&' and the ';', for example "nbsp",
    "#160" or "#xA0".  The return value includes the '&' and ';'.

    * Names in allowed_entities and character references are returned
      unchanged, except that a hexadecimal reference becomes decimal
      when ``convert_hex`` is true.
    * Any other name found in the entity definition files becomes a
      decimal character reference.
    * An unknown name has its ampersand escaped ("&amp;name;").

    The entity definition files are read the first time they are needed;
    whatever open() raises for a missing file is propagated.
    """
    global entities
    if ent[0] != "#" and ent not in allowed_entities:
        if entities is None:
            # NOTE(review): the table is marked as loaded before the files
            # are read, so a failure part-way leaves a partial table cached.
            entities = {}
            try:
                path = os.path.split(__file__)[0]
            except NameError:
                path = ""
            for def_file in entity_def_files:
                parser = EntitySetParser()
                stream = open(os.path.join(path, def_file), "r")
                try:
                    parser.feed(stream.read())
                finally:
                    stream.close()
                parser.close()
                entities.update(parser.entities)
        if ent in entities:
            ent = "#%d" % entities[ent]
        else:
            ent = "amp;" + ent
    elif convert_hex and ent.startswith("#x"):
        ent = "#%d" % int(ent[2:], 16)
    return "&" + ent + ";"


# Cache of entity name -> character code; filled by rewriteEntityOrRef().
entities = None


if __name__ == "__main__":
    # Test code
    import sys
    before = sys.stdin.read()
    print(repr(before))
    print("=====")
    after = htmlTidy(before)
    print(repr(after))
    after2 = htmlTidy(after)
    if after != after2:
        print("=====")
        print(repr(after2))