1. Victor Stinner
  2. hachoir


hachoir / hachoir-tools / hachoir-strip

"""
Binary stripper: remove metadata, producer information, useless padding, etc.
from binary files.

Author: Victor Stinner
Creation: 17 September 2006
"""

from hachoir_core.field import MissingField
from hachoir_core.cmd_line import unicodeFilename
from hachoir_editor import (createEditor,
    NewFieldSet, EditableInteger, EditableBytes)
from hachoir_core.stream import FileOutputStream, StringOutputStream
from hachoir_core.tools import humanFilesize, makePrintable
from hachoir_core.i18n import _, getTerminalCharset
from hachoir_parser import createParser
from optparse import OptionGroup, OptionParser
import hachoir_core
import sys

# File formats
from hachoir_parser.archive import TarFile
from hachoir_parser.audio import AuFile
from hachoir_parser.image import PngFile, JpegFile
from hachoir_parser.container import RiffFile
from hachoir_parser.audio import MpegAudioFile

# Version of this tool and address of its wiki page
__version__ = "0.1"
WEBSITE = "%s/wiki/hachoir-strip" % hachoir_core.WEBSITE

# Bit flags selecting what has to be stripped (combined with "|")
STRIP_USELESS  = 1 << 0   # Useless padding, useless duplicate information, etc.
STRIP_METADATA = 1 << 1   # Timestamp, author, producer, comment, etc.
STRIP_INDEX    = 1 << 2   # Index in video

class BasicStripper:
    """
    Base class of the strippers.

    A stripper wraps an editor and a bit mask of STRIP_* flags. Subclasses
    implement strip(); calling the instance runs it and tells whether the
    editor has been modified.
    """

    def __init__(self, editor, level, verbose=False):
        """
        editor: object to strip (fields are removed with "del editor[name]").
        level: bit mask of STRIP_* flags.
        verbose: if true, info() prints its messages.
        """
        self.editor = editor
        self.level = level
        self.verbose = verbose
        self._is_dirty = False
        self._removed_bytes = 0
        # Value returned by the last strip() call. Defined here so the
        # attribute exists even before the stripper has been called.
        self.stripped_bytes = 0

    def __call__(self):
        """
        Run strip() and store its result in self.stripped_bytes.

        Returns True if the editor has been modified: strip() returned a
        non-zero size, or self._is_dirty was already set.
        """
        self.stripped_bytes = self.strip()
        if self.stripped_bytes:
            self._is_dirty = True
        return self._is_dirty

    def info(self, message):
        """
        Print message if the stripper is verbose.
        """
        if self.verbose:
            # Single argument in parentheses: same output with the print
            # statement (Python 2) and the print function (Python 3)
            print(message)

    def strip(self):
        """
        Strip input editor. Returns number of removed bits.
        """
        raise NotImplementedError()

    def removeField(self, field, editor=None):
        """
        Delete field from editor (self.editor if editor is None).

        Returns the size of the field, read before the deletion.
        """
        # Compare with None: an explicitly given editor has to be used even
        # if it evaluates to false
        if editor is None:
            editor = self.editor
        self.info("Remove field %s" % field.path)
        size = field.size
        del editor[field.name]
        return size

    def removeFields(self, fields, editor=None):
        """
        Delete all fields from editor (self.editor if editor is None).

        Returns the sum of the sizes of the removed fields.
        """
        if editor is None:
            editor = self.editor
        # Take a snapshot first: fields may be a lazy iterator over the
        # editor itself, which must not be modified while it is walked
        fields = list(fields)
        return sum(self.removeField(field, editor) for field in fields)

class CheckStripper(BasicStripper):
    """
    Stripper removing every field of the root accepted by checkField().
    """

    def checkField(self, field):
        """
        Check if a field of the root has to be removed or not.
        Returns boolean.
        """
        raise NotImplementedError()

    def strip(self):
        """
        Remove the fields of the root for which checkField() is true.
        Returns the value of removeFields() for these fields.
        """
        # Build the complete list before anything is deleted: a lazy
        # generator would still be walking the editor while removeFields()
        # deletes fields from it.
        fields = [field for field in self.editor if self.checkField(field)]
        return self.removeFields(fields)

class PngStripper(CheckStripper):
    """
    Stripper for PNG pictures.
    """

    def checkField(self, field):
        """
        Returns True for the "text[...]" fields and for the "time" field
        when STRIP_METADATA is set in the level, False otherwise.
        """
        if not (self.level & STRIP_METADATA):
            return False
        field_name = field.name
        return field_name.startswith("text[") or field_name == "time"

class JpegStripper(CheckStripper):
    """
    Stripper for JPEG pictures.
    """

    # Field names removed as a whole
    _METADATA_FIELDS = ("photoshop", "exif", "adobe")

    def checkField(self, field):
        """
        Returns True for fields whose name starts with "comment" and for
        the "photoshop", "exif" and "adobe" fields when STRIP_METADATA is
        set in the level, False otherwise.
        """
        if not (self.level & STRIP_METADATA):
            return False
        field_name = field.name
        if field_name.startswith("comment"):
            return True
        return field_name in self._METADATA_FIELDS

class MpegAudioStripper(CheckStripper):
    """
    Stripper for MPEG audio files.
    """

    def checkField(self, field):
        """
        Returns True for the "id3v1" and "id3v2" fields when STRIP_METADATA
        is set in the level, False otherwise.
        """
        if not (self.level & STRIP_METADATA):
            return False
        return field.name in ("id3v1", "id3v2")

class AuStripper(BasicStripper):
    """
    Stripper for AU audio files: removes the "info" field.
    """

    def strip(self):
        """
        Remove the "info" field and decrease "data_ofs" by its size.

        Returns the size of the removed field, or 0 if nothing is removed
        (STRIP_METADATA not requested, or no "info" field).
        """
        if not (self.level & STRIP_METADATA and "info" in self.editor):
            # This return used to be unreachable (placed after the return
            # of the other branch), so the method returned None here
            return 0
        size = self.editor["info"].size
        # The size is divided by 8 before being subtracted from data_ofs
        # (presumably bits to bytes -- strip() is documented to return bits)
        self.editor["data_ofs"].value -= (size // 8)
        return self.removeField(self.editor["info"])

class RiffStripper(BasicStripper):
    """
    Stripper for RIFF files: removes chunks and updates the size fields of
    the enclosing chunk lists and of the file.

    NOTE(review): several statements of this class were missing (a "try:"
    line and the bodies of several branches). They have been restored from
    what the surrounding code requires; each restored statement is marked
    below and has to be checked against the upstream hachoir-strip source.
    """

    def stripSub(self, editor, names):
        """
        Strip the direct children of editor, recursing into "LIST" chunks.

        editor: field set to walk.
        names: names of the fields to remove.

        Returns the total size of what has been removed under editor.
        """
        # Fields are only collected during the walk and deleted afterwards,
        # so that editor is not modified while it is iterated
        remove = []
        total = 0
        for field in editor:
            if field.name in names:
                # Restored: "remove" was never filled otherwise
                remove.append(field)
                continue
            try:
                tag = field["tag"].value
            except MissingField:
                # Restored: a field without "tag" is skipped
                continue
            # "==" instead of the original 'tag in "JUNK"', which was a
            # substring test and also matched "", "J", "UN", ...
            if self.level & STRIP_USELESS and tag == "JUNK":
                # Restored: schedule the junk chunk for removal
                remove.append(field)
                continue
            if tag == "LIST" and field.name != "movie":
                # Strip a chunk list
                size = self.stripSub(field, names)
                if size:
                    # Update chunk list size
                    field["size"].value -= (size // 8)
                total += size
        total += self.removeFields(remove, editor)
        return total

    def strip(self):
        """
        Strip the whole file according to self.level.
        Returns the total size of the removed chunks.
        """
        names = []
        if self.level & STRIP_USELESS:
            # NOTE(review): the statement of this branch is missing and
            # cannot be deduced from this file. JUNK chunks are already
            # selected by their tag in stripSub().
            pass
        if self.level & STRIP_METADATA:
            # NOTE(review): restored statement. "info" is assumed to be the
            # name given by the RIFF parser to the INFO list -- confirm.
            names.append("info")
        if self.level & STRIP_INDEX:
            # NOTE(review): restored statement. "index" is assumed to be the
            # name given by the RIFF parser to the index chunk -- confirm.
            names.append("index")
            if "/headers/avi_hdr/has_index" in self.editor:
                self.editor["/headers/avi_hdr/has_index"].value = False
        size = self.stripSub(self.editor, names)
        if size:
            # Update file size field
            self.editor["filesize"].value -= (size // 8)
        return size

# NOTE(review): second definition of MpegAudioStripper in this file, with the
# same behaviour as the first one. Being the later one, it is the definition
# bound to the name once the module is loaded; one of the two can be dropped.
class MpegAudioStripper(CheckStripper):
    """
    Stripper for MPEG audio files.
    """

    def checkField(self, field):
        """
        Tell if field is an ID3 tag ("id3v1" or "id3v2") to remove.
        Always False when STRIP_METADATA is not set in the level.
        """
        strip_metadata = bool(self.level & STRIP_METADATA)
        return strip_metadata and field.name in ("id3v1", "id3v2")

def usage():
    """
    Print the command line usage, using sys.argv[0] as program name.
    """
    # Single argument in parentheses: same output with the print statement
    # (Python 2) and the print function (Python 3)
    print("Usage: %s filename" % sys.argv[0])

class TarStripper(BasicStripper):
    """
    Stripper for tar archives: overwrites the owner and the modification
    time of each "file" entry. Fields are rewritten, none is removed.

    NOTE(review): some statements of this class were missing. The restored
    ones are marked below and have to be checked against the upstream
    hachoir-strip source.
    """

    def strip(self):
        """
        Strip every "file" entry of the archive.

        Returns 0: no field is removed. The modification is reported by
        stripFile() through self._is_dirty.
        """
        for entry in self.editor.array("file"):
            # Restored: the body of this loop was missing
            self.stripFile(entry)
        return 0

    def fixChecksum(self, file):
        """
        Compute the checksum of a file entry and store it in "check_sum".

        The checksum is the sum of the first 512 bytes of the entry,
        computed while "check_sum" is filled with spaces, written in octal.
        """
        file["check_sum"].value = " " * 8
        stream = StringOutputStream()
        # NOTE(review): restored statement. Without it, data was read from
        # a stream nothing had been written to. writeInto() is assumed to
        # be the hachoir_editor method serializing a field set -- confirm.
        file.writeInto(stream)
        data = stream.readBytes(0, 512)
        checksum = sum( ord(character) for character in data )
        file["check_sum"].value = ("0%o\0" % checksum).ljust(8, " ")

    def stripFile(self, file):
        """
        Reset uid, gid, mtime, uname and gname of a file entry and mark the
        editor as modified.
        """
        empty32 = "\0" * 32
        uid = "0000000\0"
        file["uid"].value = uid
        file["gid"].value = uid
        file["mtime"].value = "00000000000\0"
        file["uname"].value = empty32
        file["gname"].value = empty32
        # NOTE(review): restored statement. fixChecksum() was never called,
        # although the fields changed above are among the bytes it sums.
        self.fixChecksum(file)
        self._is_dirty = True

# Stripper class to use for each parser class
# (stripEditor() looks up the class of editor.input in this table)
strippers = {
    AuFile: AuStripper,
    RiffFile: RiffStripper,
    PngFile: PngStripper,
    JpegFile: JpegStripper,
    MpegAudioFile: MpegAudioStripper,
    TarFile: TarStripper,
}

def stripEditor(editor, filename, level, verbose):
    """
    Strip editor and, if it has been modified, save it into filename.

    editor: editor to strip; the stripper is chosen from the class of
        editor.input.
    filename: name of the output file.
    level: bit mask of STRIP_* flags.
    verbose: verbose flag given to the stripper.

    Returns False if there is no stripper for the file type, True
    otherwise (even if the stripper did not change anything).
    """
    cls = editor.input.__class__
    try:
        stripper_cls = strippers[cls]
    except KeyError:
        print("Don't have stripper for file type: %s" % editor.description)
        return False
    stripper = stripper_cls(editor, level, verbose)

    if stripper():
        filename, realname = unicodeFilename(filename), filename
        output = FileOutputStream(filename, real_filename=realname)
        # NOTE(review): restored statement. The output stream was created
        # but nothing was written into it. writeInto() is assumed to be the
        # hachoir_editor method serializing the editor -- confirm.
        editor.writeInto(output)
        size = stripper.stripped_bytes
        if size:
            # size is a number of bits
            percent = "%.1f%%" % (float(size) * 100 / editor.input.size)
            if size % 8 and size < 128:
                # Small size which is not a whole number of bytes
                print(_("Remove %u.%u bytes (%s)")
                    % (size // 8, size % 8, percent))
            else:
                print(_("Remove %s (%s)") % (humanFilesize(size // 8), percent))
        print(_("Save new file into %s") %
            makePrintable(filename, getTerminalCharset(), to_unicode=True))
    else:
        print(_("Stripper doesn't touch the file"))
    return True

def displayVersion(*args):
    """
    Callback of the --version option: display the version of the program
    and of the Hachoir library, the website, and exit.

    The arguments given by optparse to the callback are ignored.
    """
    print(_("Hachoir strip version %s") % __version__)
    print(_("Hachoir library version %s") % hachoir_core.__version__)
    print(_("Website: %s") % WEBSITE)
    # The help of the option says "Display version and exit"; without this
    # exit the program went on parsing the command line
    sys.exit(0)

def parseOptions():
    """
    Parse the command line.

    Returns a (values, arguments) tuple: the option values and the list of
    positional arguments (filenames). Prints the help and exits if no
    filename is given.
    """
    parser = OptionParser(usage="%prog [options] filename")

    common = OptionGroup(parser, "Hachoir strip")
    common.add_option("--strip", help=_("Data types to remove: useless, metadata, index (default: all). Use comma to specify two or more."),
        type="str", action="store", default="metadata, useless, index")
    common.add_option("--quiet", help=_("Be quiet"),
        action="store_true", default=False)
    common.add_option("--version", help=_("Display version and exit"),
        action="callback", callback=displayVersion)
    # An option group has to be attached to the parser, otherwise its
    # options are unknown to parse_args() and values has neither "strip"
    # nor "quiet"
    parser.add_option_group(common)

    values, arguments = parser.parse_args()
    if not arguments:
        # NOTE(review): the body of this branch was missing; printing the
        # help and exiting with an error status is assumed -- confirm.
        parser.print_help()
        sys.exit(1)
    return values, arguments

def main():
    """
    Entry point: strip each file of the command line into "<name>.new".

    Exits with status 0 if all files have been processed, 1 otherwise.
    """
    # Parse arguments and read filenames
    values, filenames = parseOptions()

    # Convert the --strip value (names separated by commas) to a bit mask
    level = 0
    for item in values.strip.split(","):
        item = item.strip()
        if item == "useless":
            level |= STRIP_USELESS
        elif item == "metadata":
            level |= STRIP_METADATA
        elif item == "index":
            level |= STRIP_INDEX
    if not level:
        print(_("Nothing to do, exit"))
        # The message says "exit" but the program went on with level 0
        # NOTE(review): exit status 1 is assumed -- confirm.
        sys.exit(1)

    ok = True
    for filename in filenames:
        filename, real_filename = unicodeFilename(filename), filename
        print("[+] %s" % (_("Process file %s") % filename))
        parser = createParser(filename, real_filename=real_filename)
        if parser:
            editor = createEditor(parser)
            ok &= stripEditor(editor, real_filename+".new", level, not(values.quiet))
        else:
            # Only a file which cannot be parsed is an error: without this
            # "else", ok was reset to False after each successful file
            ok = False

    # NOTE(review): the body of the final "if ok:" was missing; reporting
    # the result as the exit status is assumed -- confirm.
    if ok:
        sys.exit(0)
    sys.exit(1)
if __name__ == "__main__":