## HTMLTidy.py, v0.3
## Copyright (C) 2002, 2003, 2004 Gary Benson <gary@inauspicious.org>
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.

"""Ultra-liberal HTML to XHTML convertor.

Use it like this:

  import HTMLTidy
  xhtml = HTMLTidy.htmlTidy(html)

It is designed to always produce some kind of output, even for the most
bizarrely malformed input, and to be idempotent: multiple passes should
not change the data (i.e. htmlTidy(x) == htmlTidy(htmlTidy(x)) for all
x).  If it fails on either of these counts then I'd like to know about it.

In addition to the normal HTML conventions blank lines are considered
as new paragraphs except within <pre> blocks, and the characters '<',
'>' and '&' will be treated as text unless in context.  The SGML
convention of using '</>' to close the last open tag is also
supported.

Its response to transposed tags (like '<a>foo<b>bar</a></b>') leaves
something to be desired.
"""

import os
import re
import sgmllib
import string

##############################################################################

# Inline tags are those that may be freely mixed with text (strictly
# speaking, character data).  In this file a mixture of inline tags
# and text is referred to as "inline data".
inline_tags = ("a", "code", "em", "strong", "img", "br", "span",
               "sub", "sup", "input")

# Flow tags may contain inline data and only inline data.
flow_tags = ("p", "pre")

# List tags contain only block tags.  (Actually, they should contain
# only <li> tags but we don't check for this.)
list_tags = ("ol", "ul")

# Block tags may contain other block tags, list tags, and inline data.
# Inline data found directly inside a block tag has <p> tags inserted
# around it.
# TODO: don't insert <p> tags when not necessary; perhaps another
# callback to remove <p> tags when a block tag contains only one child.
block_tags = ("blockquote", "li", "div", "form", "table", "tr", "td")

# The following tags are always empty, and the trailing slash may be
# omitted: the parser closes them immediately after opening them.
empty_tags = ("img", "br", "input")

# Entities in this list are passed through by name.  All other
# entities will be converted to character references.
allowed_entities = ("quot", "amp", "lt", "gt", "apos")

# Files containing entity definitions.  These should be downloaded
# from http://www.w3.org/TR/xhtml1/DTD/ and placed in the same
# directory as this file.
entity_def_files = ("xhtml-lat1.ent", "xhtml-special.ent", "xhtml-symbol.ent")

##############################################################################

class MungeError(Exception):
    """Raised when the tidying code reaches a state it cannot handle."""

# Part one of the HTML tidying process mirrors the function of
# mod_virgule's nice_htext().  It is basically a very liberal HTML
# parser.

# Every tag that is not an inline tag.
format_tags = block_tags + list_tags + flow_tags
# The only tag names tag_re (below) will recognise as markup.
allowed_tags = format_tags + inline_tags

# The three patterns below are all anchored at the start of the string
# and end with a catch-all group holding the unconsumed remainder, so
# the parser can match and then carry on from that remainder.

# Matched against the text following an "&".  Group 1 is the entity
# name or the "#..." character reference (without the ";"); group 3
# is the remaining text.
entity_or_ref_re = re.compile(r"^(#(\d+|x[\dA-Fa-f]+)|[A-Za-z0-9]+);(.*)", re.S)
# Matched against the text following a "<".  Group 1 is the "/" of an
# end tag, group 2 the tag name, group 3 the raw attribute text
# (including any trailing whitespace), group 8 the "/" of an empty
# element tag and group 9 the remaining text.
tag_re = re.compile(r"""^(/)?(%s)((\s+\w+\s*=(("[^"]*")|('[^']*')))*\s*)(/)?>(.*)$""" % \
                    string.join(allowed_tags, "|"), re.S) # ' doh emacs!
# Matches one attribute at the start of some attribute text.  Group 1
# is the attribute name, group 2 the value including its surrounding
# quotes and group 5 the remaining attribute text.
attr_re = re.compile(r"""^\s+(\w+)\s*=(("[^"]*")|('[^']*'))(.*)$""", re.S) # '

def _escapeAttrValue(value, convert_hex_refs):
    """Return an attribute value (quotes already removed) escaped so
    that it can be written between double quotes.

    Entities and character references are passed through
    rewriteEntityOrRef(); a bare '&' becomes '&amp;'; '<', '>' and '"'
    become '&lt;', '&gt;' and '&quot;'."""
    escaped = ""
    while value:
        c = value[0]
        value = value[1:]
        if c == "&":
            m = entity_or_ref_re.search(value)
            if m:
                ent, value = m.group(1, 3)
                escaped = escaped + rewriteEntityOrRef(ent, convert_hex_refs)
            else:
                escaped = escaped + "&amp;"
        elif c == "<":
            escaped = escaped + "&lt;"
        elif c == ">":
            escaped = escaped + "&gt;"
        elif c == '"':
            # The value may have been single-quoted in the input
            escaped = escaped + "&quot;"
        else:
            escaped = escaped + c
    return escaped


def _parseAttrs(attrs, convert_hex_refs):
    """Turn the raw attribute text captured by tag_re into a dictionary
    mapping attribute names to escaped values.

    Raises MungeError if the text cannot be parsed as attributes."""
    parsed = {}
    # Whitespace-only text (as in '<br />') holds no more attributes
    while attrs.strip():
        m = attr_re.search(attrs)
        if not m:
            raise MungeError("attrs='%s'" % attrs)
        name, value, attrs = m.group(1, 2, 5)
        parsed[name] = _escapeAttrValue(value[1:-1], convert_hex_refs)
    return parsed


def saxParseHtml(cb, text, convert_hex_refs = 0):
    """Turn some html-ish text into SAX events, dealing with unquoted
    special characters and inserting paragraph tags wherever there is
    a blank line (except within a pre).  It does basically what
    mod_virgule's nice_htext() function does, albeit with more regular
    expressions and less fuss.

    cb is the event sink; it must provide startDocument(),
    endDocument(), startElement(tag, attrs), endElement(tag) and
    characters(data).  The data passed to characters() and the
    attribute values passed to startElement() are already escaped.

    text is the input to parse.  Carriage returns are dropped.

    convert_hex_refs is handed to rewriteEntityOrRef() for every
    entity or character reference found.

    Only the tags in allowed_tags are treated as markup; any other
    '<', and any end tag that is malformed or does not match the
    innermost open element, is emitted as the text '&lt;'.  Elements
    still open at the end of the input are closed."""

    buf = ""        # escaped text not yet passed to cb.characters()
    stack = []      # names of the currently open elements
    cb.startDocument()
    while text:
        c = text[0]
        text = text[1:]

        if c == "\n":
            if buf and buf[-1] == "\n" and "pre" not in stack:
                # A blank line outside <pre>: mark a paragraph break
                # with an empty <p> element
                cb.characters(buf)
                cb.startElement("p", {})
                cb.endElement("p")
                cb.characters("\n")
                buf = ""
            else:
                buf = buf + "\n"
        elif c == "&":
            m = entity_or_ref_re.search(text)
            if m:
                ent, text = m.group(1, 3)
                buf = buf + rewriteEntityOrRef(ent, convert_hex_refs)
            else:
                buf = buf + "&amp;"
        elif c == "<":
            m = tag_re.search(text)
            if m:
                preslash, tag, attrs, postslash, rest = m.group(1, 2, 3, 8, 9)
                if preslash and (attrs.strip() or postslash):
                    # This end tag is malformed
                    buf = buf + "&lt;"
                else:
                    # Clear any unterminated <p> and <li> tags
                    if preslash:
                        if tag in block_tags and stack and stack[-1] == "p":
                            # Pending text belongs inside the <p>
                            # that is about to be closed
                            if buf:
                                cb.characters(buf)
                                buf = ""
                            while stack and stack[-1] == "p":
                                cb.endElement("p")
                                del stack[-1]
                        if tag in ("ol", "ul") and stack and stack[-1] == "li":
                            if buf:
                                cb.characters(buf)
                                buf = ""
                            cb.endElement("li")
                            del stack[-1]
                    elif tag == "li":
                        # Look for an open <li> with nothing but <p>
                        # elements open inside it; a new <li> closes it
                        li_pos = None
                        for i in range(len(stack) - 1, -1, -1):
                            if stack[i] == "li":
                                li_pos = i
                                break
                            if stack[i] != "p":
                                break
                        if li_pos is not None:
                            if buf:
                                cb.characters(buf)
                                buf = ""
                            while len(stack) > li_pos:
                                cb.endElement(stack[-1])
                                del stack[-1]

                    if preslash and (stack == [] or tag != stack[-1]):
                        # This end tag is misplaced
                        buf = buf + "&lt;"
                    else:
                        if buf:
                            cb.characters(buf)
                            buf = ""
                        if not preslash:
                            cb.startElement(
                                tag, _parseAttrs(attrs, convert_hex_refs))
                            if postslash or tag in empty_tags:
                                cb.endElement(tag)
                            else:
                                stack.append(tag)
                        else:
                            cb.endElement(tag)
                            del stack[-1] # checked earlier
                        text = rest
            elif text[:2] == "/>" and stack:
                # SGML-style </> closing tag
                text = text[2:]
                if buf:
                    cb.characters(buf)
                    buf = ""
                cb.endElement(stack[-1])
                del stack[-1]
            else:
                buf = buf + "&lt;"
        elif c == ">":
            buf = buf + "&gt;"
        elif c != "\r":
            buf = buf + c
    if buf:
        cb.characters(buf)
    if stack:
        # End tags for these were omitted
        stack.reverse()
        for tag in stack:
            cb.endElement(tag)
    cb.endDocument()


# Part two of the HTML tidying process is what was the HTMLDigester
# class from my old diary munger.  It converts a stream of SAX events
# with misplaced <p> elements into a stream of SAX elements with
# correctly placed <p> elements.

class ParaTidyCallback:
    """SAX callback to correct the placement of <p> elements.

    Events are received from an upstream producer, and a corrected
    stream of events is forwarded to the callback given to the
    constructor.  Incoming <p> start tags are never forwarded
    directly; instead a <p> is opened whenever inline data turns up
    somewhere that needs one, and closed when a format tag starts or
    the enclosing element ends.

    MungeError is raised for input this class cannot place, such as
    text or inline tags directly inside a list tag."""

    def __init__(self, cb):
        # cb receives the corrected event stream
        self.cb = cb

    def startDocument(self):
        """Reset the state and forward the event."""
        # Names of the open format elements; the None at the bottom
        # stands for the document itself
        self.stack = [None]
        # Text received but not yet forwarded
        self.buf = ""
        self.cb.startDocument()

    def endDocument(self):
        """Flush pending text, close a trailing <p>, check that
        nothing else is left open and forward the event."""
        self.flush_text_buffer()
        if self.stack == [None, "p"]:
            self.endElement("p")
        if self.stack != [None]:
            raise MungeError("stack=%s" % self.stack)
        del self.stack
        del self.buf
        self.cb.endDocument()

    def startElement(self, tag, attrs):
        """Dispatch on whether tag is a format tag or an inline tag."""
        if tag in format_tags:
            self.push_format_tag(tag, attrs)
        else:
            self.push_inline_tag(tag, attrs)

    def endElement(self, tag):
        """Dispatch on whether tag is a format tag or an inline tag."""
        if tag in format_tags:
            self.pop_format_tag(tag)
        else:
            self.pop_inline_tag(tag)

    def characters(self, data):
        """Buffer some text, opening a <p> first if it needs one."""
        self.push_text(data)

    def push_text(self, text):
        """Append text to the buffer.  Non-blank text at the top level
        or directly inside a block tag opens a <p>; non-blank text
        directly inside a list tag raises MungeError."""
        current = self.stack[-1]
        if current in (None,) + block_tags:
            if text.strip():
                self.real_push_format_tag("p", {})
        elif current in list_tags:
            if text.strip():
                raise MungeError("'%s' in %s" % (text, current))
        elif current in flow_tags:
            pass
        else:
            raise MungeError("unknown element %s in stack" % current)

        self.buf = self.buf + text

    def flush_text_buffer(self):
        """Forward the buffered text and empty the buffer.  Blank text
        directly inside a block or list tag is discarded; non-blank
        text there raises MungeError."""
        if self.buf:
            current = self.stack[-1]
            if current in block_tags + list_tags:
                if self.buf.strip():
                    raise MungeError("'%s' in %s" % (self.buf, current))
            else:
                self.cb.characters(self.buf)
            self.buf = ""

    def push_inline_tag(self, tag, attrs):
        """Forward the start of an inline element, opening a <p>
        first if it is at the top level or directly inside a block
        tag.  Raises MungeError directly inside a list tag."""
        self.flush_text_buffer()
        current = self.stack[-1]
        if current in (None,) + block_tags:
            self.real_push_format_tag("p", {})
        elif current in list_tags:
            raise MungeError("'<%s>' in %s" % (tag, current))
        elif current in flow_tags:
            pass
        else:
            raise MungeError("unknown element %s in stack" % current)
        self.cb.startElement(tag, attrs)

    def pop_inline_tag(self, tag):
        """Forward the end of an inline element.  Raises MungeError
        if the innermost open format element is a block or list tag."""
        self.flush_text_buffer()
        current = self.stack[-1]
        if current in block_tags + list_tags:
            raise MungeError("'</%s>' in %s" % (tag, current))
        self.cb.endElement(tag)

    def push_format_tag(self, tag, attrs):
        """Handle the start of a format element: close any open <p>,
        then open the element unless it is itself a <p>."""
        current = self.stack[-1]
        if current == "p":
            if tag in format_tags:
                self.pop_format_tag("p")
            else:
                raise MungeError("unknown tag %s in stack" % tag)

        # A <p> is only ever opened on demand, when inline data arrives
        if tag != "p":
            self.real_push_format_tag(tag, attrs)

    def pop_format_tag(self, tag):
        """Handle the end of a format element, closing an open <p>
        inside it first.  A </p> with no <p> open is ignored; any
        other mismatch raises MungeError."""
        current = self.stack[-1]
        if current == "p" and tag != "p":
            self.real_pop_format_tag("p")
            self.real_pop_format_tag(tag)
        elif current != "p" and tag == "p":
            pass
        elif current != tag:
            raise MungeError(
                "mismatched tags (<%s> and </%s>)" % (current, tag))
        else:
            self.real_pop_format_tag(tag)

    def real_push_format_tag(self, tag, attrs):
        """Flush pending text, forward the start tag and record the
        element as open."""
        self.flush_text_buffer()
        self.cb.startElement(tag, attrs)
        self.stack.append(tag)

    def real_pop_format_tag(self, tag):
        """Flush pending text, forward the end tag and drop the
        innermost open element."""
        self.flush_text_buffer()
        self.cb.endElement(tag)
        del self.stack[-1]


# Finally, the SAX events are serialized into XML

class SerializeCallback:
    """SAX callback that writes the events it receives out as markup,
    accumulated in the xml attribute.  An element that is closed
    straight after it is opened is written as a single '<tag/>'."""

    def startDocument(self):
        # Markup written so far
        self.xml = ""
        # (tag, attrs) of a start tag not yet written out, or None
        self.waiting = None

    def endDocument(self):
        self.flushWait()
        del self.waiting

    def flushWait(self, empty = 0):
        """Write out the pending start tag, if there is one.  With
        empty set it is written in the self-closing form."""
        pending = self.waiting
        if not pending:
            return
        name, attributes = pending
        pieces = ["<", name]
        for key in attributes.keys():
            pieces.append(' %s="%s"' % (key, attributes[key]))
        if empty:
            pieces.append("/")
        pieces.append(">")
        self.xml = self.xml + "".join(pieces)
        self.waiting = None

    def startElement(self, tag, attrs):
        # Hold the new tag back until we know whether it is empty
        self.flushWait()
        self.waiting = (tag, attrs)

    def endElement(self, tag):
        pending = self.waiting
        if pending and pending[0] == tag:
            # Nothing came between the start and end tags
            self.flushWait(1)
            return
        self.flushWait()
        self.xml = self.xml + "</%s>" % tag

    def characters(self, data):
        self.flushWait()
        self.xml = self.xml + data


# Wrapper function for the two stages above

def htmlTidy(html):
    """Return html converted to XHTML 1.0, with a new paragraph
    started wherever the input has a blank line.  Running the result
    through again should leave it unchanged, though this isn't the
    case if tags are in the wrong order."""
    output = SerializeCallback()
    tidier = ParaTidyCallback(output)
    saxParseHtml(tidier, html)
    return output.xml


# Code to convert entities to character references, and optionally to
# convert hexadecimal character references to decimal ones

class EntitySetParser(sgmllib.SGMLParser):
    """A class which extracts the entity declarations from a DTD."""

    entity_decl_re = re.compile(r'^ENTITY\s+([A-Za-z0-9]+)\s+"&#(\d+);"$')

    def __init__(self):
        self.entities = {}
        sgmllib.SGMLParser.__init__(self)

    def unknown_decl(self, data):
        m = self.entity_decl_re.search(data)
        if m:
            self.entities[m.group(1)] = int(m.group(2))

# Cache mapping entity names to character numbers.  It is filled from
# entity_def_files the first time rewriteEntityOrRef() needs it.
entities = None

def _loadEntityDefs():
    """Read the files named in entity_def_files and return a
    dictionary mapping entity names to character numbers.

    The files are looked for in the directory containing this file.
    A file that cannot be opened is skipped, so the entities it would
    have defined are simply treated as unknown."""
    try:
        path = os.path.split(__file__)[0]
    except NameError:
        path = ""
    table = {}
    for filename in entity_def_files:
        try:
            f = open(os.path.join(path, filename), "r")
        except IOError:
            continue
        try:
            data = f.read()
        finally:
            f.close()
        parser = EntitySetParser()
        parser.feed(data)
        parser.close()
        table.update(parser.entities)
    return table

def rewriteEntityOrRef(ent, convert_hex = 0):
    """Convert entities to character references, and optionally
    convert hexadecimal character references to decimal ones.

    ent is an entity name ('nbsp') or a character reference ('#160',
    '#xA0') without the surrounding '&' and ';'.  The return value
    includes them.

    Entities listed in allowed_entities are returned unchanged.  Other
    known entities become decimal character references.  Unknown
    entities have their ampersand escaped ('&amp;name;').  Character
    references are returned unchanged unless they are hexadecimal and
    convert_hex is set, in which case they are rewritten in decimal."""
    global entities
    if ent[:1] == "#":
        if convert_hex and ent[1:2] == "x":
            ent = "#%d" % int(ent[2:], 16)
    elif ent not in allowed_entities:
        if entities is None:
            # Assigned only once complete, so a failure part way
            # through loading cannot leave a partial table behind
            entities = _loadEntityDefs()
        if ent in entities:
            ent = "#%d" % entities[ent]
        else:
            ent = "amp;" + ent
    return "&" + ent + ";"


if __name__ == "__main__":
    # Test code: tidy standard input and show the text before and
    # after.  The result is tidied a second time, and shown as well
    # if that second pass changed it.
    import sys
    before = sys.stdin.read()
    print(repr(before))
    print("=====")
    after = htmlTidy(before)
    print(repr(after))
    after2 = htmlTidy(after)
    if after != after2:
        print("=====")
        print(repr(after2))