## jpegsax.py, v0.1
## Copyright (C) 2004 Gary Benson
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.

"""SAX-like parsing of JPEG files.

Use it like this:

    import jpegsax

    parser = jpegsax.JPEGParser()
    parser.setHandler(handler)
    parser.parse(open('foo.jpg', 'rb'))

The file object must be opened in binary mode.  The handler is any
callable taking two arguments, ``handler(marker, data)``:

  * ``(marker, None)``  -- a marker that carries no payload
  * ``(marker, data)``  -- a marker followed by a length-prefixed payload;
                           ``data`` excludes the two length bytes
  * ``(None, data)``    -- the raw bytes between the Start of Scan segment
                           and the trailing End of Image marker

This contains one handler, JPEGSerializer, which is the inverse of
JPEGParser: it serializes an event stream to a file object.
"""

import struct


class JPEGParser:
    """Event-driven reader that splits a JPEG byte stream into segments."""

    class error(Exception):
        """Raised when the input is truncated or not a well-formed JPEG."""

    def setHandler(self, handler):
        """Register the callable that receives ``(marker, data)`` events."""
        self.handle = handler

    def parse(self, fp):
        """Read a JPEG stream from *fp* and feed its segments to the handler.

        *fp* must be a binary file object.  Parsing stops after the End of
        Image event has been delivered.

        Raises JPEGParser.error on an out-of-range marker, an invalid
        segment length, or when the data ends prematurely.
        """
        while True:
            marker = self.__int16(fp.read(2))
            if marker < 0xFFC0 or marker > 0xFFFE:
                raise self.error("Bad marker: 0x%04X" % marker)

            # Markers that are followed by a 16-bit length and a payload:
            # 0xFFC0-0xFFCF except 0xFFC8, 0xFFDA-0xFFDF, 0xFFE0-0xFFEF
            # and 0xFFFE.  Every other marker stands alone.
            group = marker & 0xF0
            if (group == 0xC0 and marker != 0xFFC8
                    or group == 0xD0 and marker >= 0xFFDA
                    or group == 0xE0
                    or marker == 0xFFFE):
                # The stored length counts its own two bytes.
                size = self.__int16(fp.read(2)) - 2
                if size < 0:
                    # A negative size would make read() consume the whole
                    # rest of the file instead of failing.
                    raise self.error("Bad segment length: %d" % (size + 2))
                data = fp.read(size)
                if len(data) != size:
                    raise self.error("Unexpected end of file")
            else:
                data = None

            self.handle(marker, data)

            if marker == 0xFFDA:  # Start of Scan
                # Everything up to the final two bytes is treated as scan
                # data; those final two bytes must be End of Image.
                data = fp.read()
                marker = self.__int16(data[-2:])
                if marker != 0xFFD9:  # End of Image
                    raise self.error("Unexpected end of file")
                self.handle(None, data[:-2])
                self.handle(marker, None)
                break

    def __int16(self, s):
        """Decode the two-byte big-endian unsigned integer in *s*.

        Raises JPEGParser.error if *s* is not exactly two bytes long, which
        happens when a read hit the end of the input.
        """
        if len(s) != 2:
            raise self.error("Unexpected end of file")
        return struct.unpack(">H", s)[0]


class JPEGSerializer:
    """Handler that writes the events emitted by JPEGParser back to a file."""

    def __init__(self, fp):
        """Remember *fp*, a binary file object that receives the output."""
        self.fp = fp

    def __call__(self, marker, data):
        """Write one parser event.

        With a marker, the marker is written, followed by the length field
        and payload when *data* is not None.  With ``marker`` None, *data*
        is written verbatim.

        Raises ValueError if both arguments are None or if a marker or
        segment length does not fit in 16 bits.
        """
        if marker is not None:
            self.fp.write(self.__int16(marker))
            if data is not None:
                # The length field includes its own two bytes.
                self.fp.write(self.__int16(len(data) + 2))
                self.fp.write(data)
        else:
            if data is None:
                raise ValueError("an event needs a marker, data, or both")
            self.fp.write(data)

    def __int16(self, i):
        """Encode *i* as a two-byte big-endian unsigned integer."""
        if i < 0x0000 or i > 0xFFFF:
            raise ValueError("value out of 16-bit range: %r" % (i,))
        return struct.pack(">H", i)


def _main():
    """Round-trip every JPEG under the home directory and report mismatches.

    Each *.jpg / *.jpeg file is parsed and re-serialized in memory; the
    result is compared byte-for-byte with the original.
    """
    import io
    import os
    import sys

    parser = JPEGParser()
    for dirpath, _dirnames, filenames in os.walk(os.path.expanduser("~")):
        for name in filenames:
            if not name.endswith((".jpg", ".jpeg")):
                continue
            path = os.path.join(dirpath, name)
            try:
                with open(path, "rb") as f:
                    src = f.read()
            except EnvironmentError:
                sys.stderr.write("skipping unreadable file %s\n" % path)
                continue

            # Empty files and files not starting with 0xFF are not JPEGs.
            if src[:1] != b"\xff":
                sys.stderr.write("skipping non-JPEG file %s\n" % path)
                continue

            dst = io.BytesIO()
            parser.setHandler(JPEGSerializer(dst))
            try:
                parser.parse(io.BytesIO(src))
            except JPEGParser.error as e:
                # Compare the message text, not the exception object.
                if str(e) == "Unexpected end of file":
                    sys.stderr.write(
                        "skipping truncated JPEG file %s\n" % path)
                else:
                    sys.stderr.write(
                        "skipping malformed JPEG file %s\n" % path)
                continue

            verdict = "ok" if dst.getvalue() == src else "NOT OK"
            sys.stdout.write("%s: %s\n" % (path, verdict))


if __name__ == "__main__":
    _main()