Commits

Anonymous committed e759353

hachoir-subfile: Merge all magic strings to one big regex,
so stream is only read once and one regex is faster than many strings

Comments (0)

Files changed (2)

hachoir-tools/hachoir-subfile

 from hachoir_core.tools import humanFilesize, humanDuration
 from hachoir_parser.guess import HachoirParserList, parseStream
 from optparse import OptionGroup, OptionParser
+from hachoir_subfile_regex import createRegex
 import sys
 from time import time
 from errno import EEXIST
 import os
+import re
 
 __version__ = "0.0.0"
 
 PROGRESS_UPDATE = 1.5   # Minimum number of second between two progress messages
 DATARATE_UPDATE = 1.0   # Time slice (in second) for datarate computation
 
def inputStreamSearchRegex(stream, regex, start, end):
    """
    Run a compiled regex over the stream bytes of the bit interval
    start..end (end excluded).

    Return a list of (matched string, bit offset) tuples; offsets are
    absolute (relative to the stream beginning) and expressed in bits.
    Note: regex.finditer() only yields non-overlapping matches.
    """
    haystack = stream.readBytes(start, (end - start) // 8)
    results = []
    for found in regex.finditer(haystack):
        results.append((found.group(0), start + found.start(0) * 8))
    return results
+
 def getTotalMemory():
     try:
         line = open('/proc/meminfo').readlines()[0].split()
             self.size = self.stream.size
         self.slice_size = SLICE_SIZE*8   # 64 KB (in bits)
         self.start_offset = offset*8
-        self.guess_stat = {}
         self.datarate = DataRate(self.start_offset)
         if directory:
             self.output = Output(directory)
             parser_list = hachoir_parser
 
         # Load parser magics
-        self.magics = []
+        magics = []
         for parser in parser_list:
             if "magic" in parser.tags:
                 for (magic, offset) in parser.tags["magic"]:
                         self.slice_size = offset + 8
                         error("Use slice size of %s because of '%s' parser magic offset" % (
                             (self.slice_size//8), parser.__name__))
-                    self.magics.append((magic, offset, parser))
-                    self.guess_stat[parser] = [0,0]
+                    magics.append((magic, offset, parser))
+
+        # Build regex
+        self.max_magic_len = max( len(magic) for magic in magics )
+        self.magics = {}
+        magic_strings = []
+        for magic, offset, parser in magics:
+            magic_strings.append(magic)
+            self.magics[magic] = (offset, parser)
+        regex = createRegex(magic_strings)
+        self.magic_regex = re.compile(regex)
 
     def main(self):
         """
             if self.verbose and self.next_progress <= time():
                 self.displayProgress()
             found = []
-            for magic in self.magics:
-                for parser in self.findMagic(self.current_offset, *magic):
-                    found.append(parser)
+            for parser in self.findMagic(self.current_offset):
+                found.append(parser)
             for offset, parser in sorted(found):
                 self.processParser(offset, parser)
             self.current_offset = min(self.current_offset + self.slice_size, self.size)
         print text
         self.next_progress = time() + PROGRESS_UPDATE
 
-    def findMagic(self, offset, magic_str, magic_offset, parser_cls):
+    def findMagic(self, offset):
         """
         Find all 'magic_str' strings in stream in offset interval:
           offset..(offset+self.slice_size).
         offset is beginning of a file (relative to stream begin), and not the
         position of the magic.
         """
-        offset += magic_offset
-        max_offset = offset + self.slice_size + 8 * (len(magic_str) - 1)
+        max_offset = offset + self.slice_size + 8 * (self.max_magic_len-1)
         max_offset = min(max_offset, self.size)
-        while True:
-            offset = self.stream.searchBytes(magic_str, offset, max_offset)
-            if offset is None:
-                return
+        for magic, offset in inputStreamSearchRegex(self.stream, self.magic_regex, offset, max_offset):
+            magic_offset, parser_cls = self.magics[magic]
             parser = self.guess(offset-magic_offset, parser_cls)
             if parser:
                 yield (offset-magic_offset, parser)
-            offset += 8
 
     def guess(self, offset, parser_cls):
         """
 
         Return the parser object, or None on failure.
         """
-        self.guess_stat[parser_cls][0] += 1
+        if offset < 0:
+            return None
         substream = InputSubStream(self.stream, offset)
         (parser, error) = parseStream(parser_cls, substream)
-        if parser:
-            self.guess_stat[parser_cls][1] += 1
         return parser
 
     def displayProgress(self):
 #    subfile.filter = metadataFilter
 
     # Search subfiles
-    ok = subfile.main()
-
-    # Display statistics (only in debug mode)
-    if subfile.debug:
-        subfile.magics.sort(key=lambda value: len(value[0]))
-        for magic, offset, parser in subfile.magics:
-            print "%s (%u): %s" % (repr(magic), len(magic), parser.__name__)
-        print "--"
-        data = [(parser, stat[0], stat[1]) for parser, stat in subfile.guess_stat.iteritems() ]
-        data.sort(key=lambda values: values[1])
-        for parser, hit, valid in data:
-            print "Parser %s: %s magic / %s valid" % (parser.__name__, hit, valid)
-
-    # Exit
-    if not ok:
+    if subfile.main():
+        sys.exit(0)
+    else:
         sys.exit(1)
 
 if __name__ == "__main__":

hachoir-tools/hachoir_subfile_regex.py

import re

def groupStrings(patterns):
    """
    Group strings by the longest common prefix (minimum length of 1
    character) that still groups as many strings as a 1-character prefix.

    Return (groups, tail): groups maps each prefix to the list of
    remaining suffixes; tail lists the strings that could not be grouped.

    >>> groupStrings(["abc", "xyz", "abd"])
    ({'ab': ['c', 'd']}, ['xyz'])
    >>> groupStrings(["abcd0", "abcd01", "abcd2"])
    ({'abcd': ['0', '01', '2']}, [])
    >>> groupStrings(["abc", "abc"])
    ({'abc': ['', '']}, [])
    >>> groupStrings(["ab", "abc", "deF", "def"])
    ({'ab': ['', 'c'], 'de': ['F', 'f']}, [])
    """
    patterns = sorted(patterns)
    nb_grouped, groups, tail = groupStringsLen(patterns, 1)
    min_len = min(len(pattern) for pattern in patterns)
    # Grow the prefix length while it groups as many strings: a longer
    # common prefix produces a more specific (faster) regex.
    prefix_len = 2
    while prefix_len <= min_len:
        new_nb_grouped, new_groups, new_tail = groupStringsLen(patterns, prefix_len)
        if new_nb_grouped < nb_grouped:
            break
        groups, tail = new_groups, new_tail
        prefix_len += 1
    return groups, tail

def groupStringsLen(patterns, prefix_len):
    """
    Group strings sharing the same prefix of 'prefix_len' characters.
    patterns must be a *sorted* list of strings.

    Return (nb_grouped, groups, tail): nb_grouped is the number of
    strings that were grouped, groups maps each prefix to the list of
    suffixes, tail lists the ungrouped strings.

    >>> groupStringsLen(["abc", "abd", "xyz"], 2)
    (2, {'ab': ['c', 'd']}, ['xyz'])
    >>> groupStringsLen(["abcd0", "abcd01", "abcd2"], 4)
    (3, {'abcd': ['0', '01', '2']}, [])
    >>> groupStringsLen(["abc", "abc"], 3)
    (2, {'abc': ['', '']}, [])
    """
    groups = {}
    tail = []
    nb_grouped = 0
    start = 0
    while start < len(patterns):
        prefix = patterns[start][:prefix_len]
        # Count how many following strings share this prefix;
        # patterns is sorted, so they are contiguous.
        index = start + 1
        count = 1
        while index < len(patterns) and patterns[index].startswith(prefix):
            count += 1
            index += 1
        if 1 < count:
            groups[prefix] = [patterns[pos][prefix_len:]
                for pos in range(start, start + count)]
            nb_grouped += count
            start += count
        else:
            tail.append(patterns[start])
            start += 1
    return nb_grouped, groups, tail

def createRegex(patterns):
    r"""
    Create an efficient regex matching any string of the 'patterns' list:
    common prefixes are factored out and single characters are merged
    into character classes.

    >>> print(createRegex(["func(b)"]))
    func\(b\)
    >>> print(createRegex(["a", "b"]))
    [ab]
    >>> print(createRegex(["aa", "bb"]))
    (?:aa|bb)
    >>> print(createRegex(["name0", "name1"]))
    name[01]
    >>> print(createRegex(["name1201", "name1001", "name1202"]))
    name1(?:20[12]|001)
    >>> print(createRegex([".exe", ".ico"]))
    \.(?:exe|ico)
    """
    # Just one string?
    if len(patterns) == 1:
        return re.escape(patterns[0])

    # Are all patterns exactly one character? Use a character class.
    if all(len(pattern) == 1 for pattern in patterns):
        # Escape each character: ']', '^', '-' and '\' are special
        # inside a character class.
        return "[%s]" % "".join(re.escape(pattern) for pattern in patterns)

    groups, tail = groupStrings(patterns)
    regex = []
    # 'suffixes' (not 'patterns'): avoid shadowing the parameter.
    for prefix, suffixes in groups.items():
        regex.append(re.escape(prefix) + createRegex(suffixes))
    for pattern in tail:
        regex.append(re.escape(pattern))
    if 2 <= len(regex):
        return "(?:%s)" % "|".join(regex)
    else:
        return regex[0]

if __name__ == "__main__":
    import doctest
    doctest.testmod()
+
import re

def groupStrings(patterns):
    """
    Group strings by the longest common prefix (minimum length of 1
    character) that still groups as many strings as a 1-character prefix.

    Return (groups, tail): groups maps each prefix to the list of
    remaining suffixes; tail lists the strings that could not be grouped.

    >>> groupStrings(["abc", "xyz", "abd"])
    ({'ab': ['c', 'd']}, ['xyz'])
    >>> groupStrings(["abcd0", "abcd01", "abcd2"])
    ({'abcd': ['0', '01', '2']}, [])
    >>> groupStrings(["abc", "abc"])
    ({'abc': ['', '']}, [])
    >>> groupStrings(["ab", "abc", "deF", "def"])
    ({'ab': ['', 'c'], 'de': ['F', 'f']}, [])
    """
    patterns = sorted(patterns)
    nb_grouped, groups, tail = groupStringsLen(patterns, 1)
    min_len = min(len(pattern) for pattern in patterns)
    # Grow the prefix length while it groups as many strings: a longer
    # common prefix produces a more specific (faster) regex.
    prefix_len = 2
    while prefix_len <= min_len:
        new_nb_grouped, new_groups, new_tail = groupStringsLen(patterns, prefix_len)
        if new_nb_grouped < nb_grouped:
            break
        groups, tail = new_groups, new_tail
        prefix_len += 1
    return groups, tail

def groupStringsLen(patterns, prefix_len):
    """
    Group strings sharing the same prefix of 'prefix_len' characters.
    patterns must be a *sorted* list of strings.

    Return (nb_grouped, groups, tail): nb_grouped is the number of
    strings that were grouped, groups maps each prefix to the list of
    suffixes, tail lists the ungrouped strings.

    >>> groupStringsLen(["abc", "abd", "xyz"], 2)
    (2, {'ab': ['c', 'd']}, ['xyz'])
    >>> groupStringsLen(["abcd0", "abcd01", "abcd2"], 4)
    (3, {'abcd': ['0', '01', '2']}, [])
    >>> groupStringsLen(["abc", "abc"], 3)
    (2, {'abc': ['', '']}, [])
    """
    groups = {}
    tail = []
    nb_grouped = 0
    start = 0
    while start < len(patterns):
        prefix = patterns[start][:prefix_len]
        # Count how many following strings share this prefix;
        # patterns is sorted, so they are contiguous.
        index = start + 1
        count = 1
        while index < len(patterns) and patterns[index].startswith(prefix):
            count += 1
            index += 1
        if 1 < count:
            groups[prefix] = [patterns[pos][prefix_len:]
                for pos in range(start, start + count)]
            nb_grouped += count
            start += count
        else:
            tail.append(patterns[start])
            start += 1
    return nb_grouped, groups, tail

def createRegex(patterns):
    r"""
    Create an efficient regex matching any string of the 'patterns' list:
    common prefixes are factored out and single characters are merged
    into character classes.

    >>> print(createRegex(["func(b)"]))
    func\(b\)
    >>> print(createRegex(["a", "b"]))
    [ab]
    >>> print(createRegex(["aa", "bb"]))
    (?:aa|bb)
    >>> print(createRegex(["name0", "name1"]))
    name[01]
    >>> print(createRegex(["name1201", "name1001", "name1202"]))
    name1(?:20[12]|001)
    >>> print(createRegex([".exe", ".ico"]))
    \.(?:exe|ico)
    """
    # Just one string?
    if len(patterns) == 1:
        return re.escape(patterns[0])

    # Are all patterns exactly one character? Use a character class.
    if all(len(pattern) == 1 for pattern in patterns):
        # Escape each character: ']', '^', '-' and '\' are special
        # inside a character class.
        return "[%s]" % "".join(re.escape(pattern) for pattern in patterns)

    groups, tail = groupStrings(patterns)
    regex = []
    # 'suffixes' (not 'patterns'): avoid shadowing the parameter.
    for prefix, suffixes in groups.items():
        regex.append(re.escape(prefix) + createRegex(suffixes))
    for pattern in tail:
        regex.append(re.escape(pattern))
    if 2 <= len(regex):
        return "(?:%s)" % "|".join(regex)
    else:
        return regex[0]

if __name__ == "__main__":
    import doctest
    doctest.testmod()
+