Commits

Phillip Alday committed 9d4d718

Initial work on packaging with distutils

  • Parent commits d28bb00
  • Branches feature/distribute

Files changed (7)

 
 testset/*
 testset.zip
+
+*egg-info*
+

File deduper.py

-#! /usr/bin/env python
-# -*- coding: UTF-8 -*-
-#
-# Copyright (C) 2012 Phillip Alday <phillip.alday@staff.uni-marburg.de>
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-#
-#
-# This program incorporates work by David Mertz and Martin Blais,
-# previously released as CC0 (public domain).
-# The original CC0 code can be accessed as find-duplicate-contents.py
-# in earlier revisions and was "released" as version "-1.0"
-
-""" deduper.py: find (and remove) duplicate files.
-
-    Given a root directory, recurse in it and find all the duplicate files:
-    files that have the same contents, but not necessarily the same filename.
-"""
-from __future__ import print_function
-from __future__ import division
-
-import sys
-import os
-import os.path
-import hashlib
-import sqlite3
-import argparse
-
-from sys import stderr
-from itertools import groupby
-from math import log10
-
-# list of file size bases along with the post K/M/G/T prefix letter
-# ie KiB vs KB
-SIZE_BASES = {
-    10:{
-        'TB' : 10**12,
-        'GB' : 10**9,
-        'MB' : 10**6,
-        'KB' : 10**3,
-    },
-    2:{
-        'TiB': 2**40,
-        'GiB': 2**30,
-        'MiB': 2**20,
-        'KiB': 2**10,
-    }
-}
-
-def main(argv=None):
-    parser = argparse.ArgumentParser(
-                        description="A utility for finding and dealing with duplicate files",
-                        epilog = "How much disk space can you save?")
-    parser.add_argument('--size-only', metavar="SIZE", type=str,
-                        default=str(sys.maxint),
-                        help="Only use size comparison on files "
-                        "larger than SIZE")
-    parser.add_argument('--use-hash', type=str, default="sha1",
-                        help="Cryptographic hash to use (must be in hashlib!")
-    parser.add_argument('--extra-hashes', type=str, default="", nargs="+",
-                        help="List of hashes to be carried out in further passes"
-                        "but only upon an initial match.")
-    parser.add_argument('--dupe-cost', action="store_true", default=False,
-                        help="List of hashes to be carried out in further passes"
-                        "but only upon an initial match.")
-    parser.add_argument('-b','--human-readable',metavar="BASE",
-                        type=int, default=0, choices=SIZE_BASES.keys(),
-                        help="Make file sizes human readble in base BASE")
-    parser.add_argument('--max-size', type=str, default=str(sys.maxint),
-                        help="Ignore files larger than MAX_SIZE")
-    parser.add_argument('--min-size', type=str, default="0",
-                        help="Ignore files smaller than MIN_SIZE")
-    parser.add_argument('-v', '--verbose', action="store_true", default=False,
-                        help="Display progress information on STDERR")
-    parser.add_argument('-c', '--summary-only', action="store_true",
-                        default=False,
-                        help="Display only summary information, i.e. without a "
-                        "list of duplicates. Can be used with --verbose to "
-                        "display progress without listing files.")
-    parser.add_argument('-a', '--prompt-for-action', action="store_true",
-                        default=False,
-                        help="Prompt for action by duplicate sets.")
-    parser.add_argument('path', type=str, nargs='+',
-                        help="paths to search")
-    parser.add_argument('-e','--extension', type=str, default=None, nargs='+',
-                        help="Limit search to files of a certain extension.")
-    parser.add_argument('--invert', action="store_true",default=False,
-                        help="Invert selection of extensions, i.e. negative match.")
-
-    args = parser.parse_args(argv)
-    args.final_byte_check = False
-    args.size_only, args.max_size, args.min_size = map(size_to_int, [args.size_only, args.max_size, args.min_size])
-    find_duplicates(args.path, args)
-
-def find_files(args, ext=None, invert=False):
-    """Find all files in the search path optionally matching the extension.
-
-    Keyword arguments:
-    ext -- filename extension (default None = all extensions)
-    """
-    for ddir in args:
-        if os.path.isdir(ddir):
-            for root, dirs, fnames in os.walk(ddir):
-                for f in fnames:
-                    if ext is None or ((os.path.splitext(f)[1] in ext) != invert):
-                            yield os.path.join(root, f)
-        else:
-            if ext is None or ((os.path.splitext(f)[1] in ext) != invert):
-                    yield ddir
-
-def group_pairs(pairs):
-    """ Group key-value pairs based on identical first item (key).
-
-       This function is passed an interable each of whose values is a pair;
-       it yields a sequence of pairs whose first element is the identical first
-       element from the original pairs, and whose second element is a list of
-       second elements corresponding to the same first element. Only adjacent
-       pairs sharing a first element are grouped together, so if the grouping
-       is required to be global, you should pass in 'sorted(pairs)' rather than
-       the raw iterable.  E.g.:
-
-       >>> things = [(1,'foo'), (1,'bar'), (2, 'baz')]
-       >>> group_pairs(things)
-       [(1,['foo','bar']), (2, ['baz'])]
-    """
-    for idx, vals in groupby(pairs, lambda pair: pair[0]):
-        yield (idx, [v[1] for v in vals])
-
-def find_duplicates(dirs, opts):
-    """Find the duplicate files in the given root directory(ies).
-
-        Arguments:
-
-        dirs -- an iterable of strings containing directories to search
-        opts -- a namespace type object (e.g. from argparse) containing
-                the following arguments:
-                -- summary_only      -- do not display list of duplicate files,
-                                        only display final statistics
-                -- use_hash          -- specify which hash to use,
-                                        hash must be in hashlib
-                -- verbose           -- display progress information during
-                                        computation of file sizes and
-                                        initial hashing
-                -- prompt_for_action -- prompt for action on each set of
-                                        duplicates logical mutually exclusive
-                                        with summary_only
-                -- extension         -- file name extensions to restrict search
-                -- invert            -- negative matching for file name extension
-                -- max_size          -- ignore files larger than this size
-                -- min_size          -- ignore files larger than this size
-                -- size_only         -- comparisons only on size for files
-                                        larger than this (i.e. no hashes)
-                -- dupe_cost         -- calculate the cost of duplication
-                -- human_readable    -- the base to use for pretty printing
-                                        the size; 0 for no pretty printing
-    """
-
-    # the selected hash in string and function form
-    hashname = opts.use_hash.upper()
-    hashfnc = eval("hashlib.{0}".format(opts.use_hash))
-    extra_hashfncs = [eval("hashlib.{0}".format(h)) for h in opts.extra_hashes]
-    extra_hashnames = opts.extra_hashes
-
-    # Organize all filenames according to size.
-    count = 0           # number of files examined
-
-    # initialize sqlite database
-    if os.path.exists('sz.db'):
-        os.remove('sz.db')
-    conn = sqlite3.connect('sz.db')
-    c = conn.cursor()
-    c.execute('create table files_by_size (size int, fname text)')
-
-    if opts.verbose:
-        print("Checking sizes (each '.' is 100k files):", file=stderr)
-
-    # traverse the directory tree, count the files and get their size
-    # this gets sizes for all files, even files in the ignore range
-    for fn in find_files(dirs, opts.extension, opts.invert):
-        if not os.path.isfile(fn):
-            continue
-        count += 1
-        sz = os.path.getsize(fn)
-        c.execute('insert into files_by_size values (?, ?)',
-                  (sz, unicode(fn, 'utf-8')))
-        if opts.verbose and count % 100000 == 0:
-            stderr.write('.')
-
-    conn.commit()
-
-    if opts.verbose:
-        print("\nFound sizes on {0} files...".format(count), file=stderr)
-
-    # retrieve the files sorted by size, for min_size <= size <= max_size
-    c.execute('''select size, fname from files_by_size
-                 where size<=? and size>=?
-                 order by size desc''', (opts.max_size, opts.min_size))
-
-    if opts.verbose:
-        print("Grouping files by {0} (each '.' is 5k groups):".format(hashname),
-              file=stderr)
-
-    distincts = 0           # number of distinct sets of duplicates
-    dupe_cost = 0
-    base = opts.human_readable
-    null_header = False     # has the label for empty files been printed?
-    empties = 0             # number of empty files
-
-    # the call to sqlite should have already sorted this list as required
-    for sz, fnames in group_pairs(c):
-        if sz == 0:
-            if not null_header:
-                if not opts.summary_only:
-                    print("Size: 0 : Content: ''")
-                null_header = True
-            for f in fnames:
-                if not opts.summary_only:
-                    print(' ', f)
-                empties += 1
-        else:
-            # We have accumulated some dups that need to be printed
-            if len(fnames) > 1:
-                hashes = []
-                # compute hashes only for files smaller than size_only
-                # otherwise go ahead and print sets of size_only matches
-
-                if sz <= opts.size_only:
-                    for f in fnames:
-                        # some temporary / sqlite-journalling files get caught,
-                        # but then disappear, this basically skips any missing
-                        # files. Other calls to open don't skip -- they assume
-                        # you made it past this check, life must be okay, so
-                        # know that changes to the filesystem will performing
-                        # a traversal on said filesystem will cause problems!
-                        try:
-                            with open(f) as fh:
-                                content = fh.read()
-                            hashes.append((hashfnc(content).hexdigest(), f))
-                        except IOError as ioe:
-                            print(ioe)
-                            print("Skipping {}".format(f))
-                    hashes.sort()
-                else:
-                    if not opts.summary_only:
-                        print('Size: {size} : Size: {size}'.format(size=pretty_size(sz,base=base)))
-                        for f in fnames:
-                            print(' ', f)
-                        if opts.dupe_cost:
-                            print("Potentially duplicated space: {}".format(pretty_size(sz * (len(fnames)-1), base=base)))
-                        print('--')
-                    distincts += 1
-                    dupe_cost += sz * (len(fnames)-1)
-
-
-                for idx, vals in group_pairs(hashes):
-                    # if there is more than one value per hash-set, then
-                    # that is another distinct set of duplicates
-                    if len(vals) > 1:
-                        extra_match, hash_msgs =  additional_tests(vals,
-                                                    extra_hashfncs,
-                                                    extra_hashnames,
-                                                    opts.final_byte_check)
-                        distincts += 1
-                        dupe_cost += sz * (len(vals)-1)
-                        if not opts.summary_only:
-                                # if the size of the file is non trivial,
-                            # then print the hash, else just print the contents
-                            if sz > 40:
-                                print(u'Size: {size}: {hname}:{hmsg} '
-                                        '{extra}'.format(size=pretty_size(sz,base=base),
-                                        hname=hashname, hmsg=idx,
-                                        extra=hash_msgs))
-                            else:
-                                with open(vals[0]) as fh:
-                                    content = fh.read()
-
-                                print('Size: {size}: Content: {con}'.format(size=pretty_size(sz,base=base),con=repr(content)))
-                            # for now, we don't print the cost of duplication
-                            # when taking action -- this seems like it would be
-                            # a complicated and dynamic affair
-                            if opts.dupe_cost:
-                                print("Duplicated space: {}".format(pretty_size(sz * (len(vals)-1), base=base)))
-
-                            if opts.prompt_for_action:
-                                action_on_file_list(vals)
-                            else:
-                                for fn2 in vals:
-                                    print(' ', fn2)
-
-                            # print a dot for every 5000 sets of duplicates
-                            if opts.verbose and distincts % 5000 == 0:
-                                stderr.write('.')
-
-
-    if opts.verbose or opts.summary_only:
-        print("\nFound {0} empty files".format(empties), file=stderr)
-        print("Found {0} non-empty duplicate sets".format(distincts),
-                file=stderr)
-    if opts.dupe_cost:
-        print("\nSpace cost of duplicates: {}".format(pretty_size(dupe_cost,base=base)), file=stderr)
-
-def additional_tests(fnames, hashfncs, hashnames, ncheck_bytes):
-    if len(hashfncs) == 0:
-        hash_match = True
-        hash_msgs = ""
-    else:
-        # place holder code!
-        hashes = dict()
-        hash_match = True
-        hash_msgs = ""
-
-        for f in fnames:
-            with open(f) as fh:
-                content = fh.read()
-            for (fnc,name) in zip(hashfncs,hashnames):
-                if name in hashes:
-                    hashes[name].append((fnc(content).hexdigest(),f))
-                else:
-                    hashes[name] = [(fnc(content).hexdigest(),f)]
-
-        for h in hashnames:
-            if len(list(group_pairs(hashes[h]))) > 1:
-                hash_msgs += "\n\t*****{} does not match*****".format(h)
-                hash_match = False
-
-        if hash_msgs != "":
-            hash_msgs = u"Extra hashes: {}".format(hash_msgs)
-
-    if ncheck_bytes == -1:
-        byte_match = True
-    else:
-        # place holder code!
-        byte_match = True
-
-    return hash_match and byte_match, hash_msgs
-
-def hyphen_range(s):
-    """ yield each integer from a complex range string like "1-9,12, 15-20,23"
-
-    from http://code.activestate.com/recipes/577279-generate-list-of-numbers-from-hyphenated-and-comma/
-
-    >>> list(hyphen_range('1-9,12, 15-20,23'))
-    [1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 18, 19, 20, 23]
-
-    >>> list(hyphen_range('1-9,12, 15-20,2-3-4'))
-    Traceback (most recent call last):
-        ...
-    ValueError: format error in 2-3-4
-    """
-    if s == "":
-        raise StopIteration("empty range!")
-
-    for x in s.split(','):
-        elem = x.split('-')
-        if len(elem) == 1: # a number
-            yield int(elem[0])
-        elif len(elem) == 2: # a range inclusive
-            start, end = map(int, elem)
-            for i in xrange(start, end+1):
-                yield i
-        else: # more than one hyphen
-            raise ValueError('format error in {0}'.format(x))
-
-def remove_file(fname):
-    """ Remove a file.
-
-        Currently, this just invokes os.remove() to delete the file.
-        Future releases will support alternative action, e.g. moving all
-        duplicates to a single folder.
-    """
-    os.remove(fname)
-
-def action_on_file_list(fnames,**opts):
-    """ Prompt for action on a set of duplicates. """
-    for i in range(len(fnames)):
-        print(u"[{0:>{width}}] {1}".format(
-                i, fnames[i], width=int(log10(len(fnames)))))
-
-    items = list(hyphen_range(raw_input("  Entries to delete: ")))
-    if len(items) > 0:
-        for i in items:
-            remove_file(fnames[i])
-    else:
-        items = hyphen_range(raw_input("  Entries to keep "
-                                       "(all others will be deleted, "
-                                       "enter none to keep all): "))
-        for i in range(len(fnames)):
-            if i not in items:
-                remove_file(fnames[i])
-    print("")
-
-def pretty_size(bytes,base=2):
-    """ Pretty print the size of a file using the given base."""
-    global SIZE_BASES
-
-    if base == 0:
-        return bytes
-    elif base not in SIZE_BASES:
-        raise ValueError("Invalid metric prefix base: {}".format(base))
-    else:
-        for suffix in sorted(SIZE_BASES[base], key=SIZE_BASES[base].get, reverse=True):
-            if bytes > SIZE_BASES[base][suffix]:
-                return "{0:.2f}{1}".format(bytes / SIZE_BASES[base][suffix], suffix)
-        else:
-            # we can always fail back to non pretty printed output
-            return bytes
-
-def size_to_int(size):
-    """ Expand the size given with metric/binary suffixes."""
-    size = size.strip()
-
-    if size.isdigit():
-        return int(size)
-
-    global SIZE_BASES
-
-    for b in SIZE_BASES:
-        for suffix in sorted(SIZE_BASES[b], key=SIZE_BASES[b].get, reverse=True):
-            if size.endswith(suffix):
-                 s = int(float(size[:-(len(suffix)+1)]) *  SIZE_BASES[b][suffix])
-                 return s
-    else:
-        raise ValueError("Invalid Suffix on {}".format(size))
-
-if __name__ == '__main__':
-    sys.exit(main())
-

File deduper/__init__.py

+#! /usr/bin/env python
+# -*- coding: UTF-8 -*-
+#
+# Copyright (C) 2012 Phillip Alday <phillip.alday@staff.uni-marburg.de>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.

File deduper/deduper.py

+#! /usr/bin/env python
+# -*- coding: UTF-8 -*-
+#
+# Copyright (C) 2012 Phillip Alday <phillip.alday@staff.uni-marburg.de>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+#
+# This program incorporates work by David Mertz and Martin Blais,
+# previously released as CC0 (public domain).
+# The original CC0 code can be accessed as find-duplicate-contents.py
+# in earlier revisions and was "released" as version "-1.0"
+
+""" deduper.py: find (and remove) duplicate files.
+
+    Given a root directory, recurse in it and find all the duplicate files:
+    files that have the same contents, but not necessarily the same filename.
+"""
+from __future__ import print_function
+from __future__ import division
+
+import sys
+import os
+import os.path
+import hashlib
+import sqlite3
+import argparse
+
+from sys import stderr
+from itertools import groupby
+from math import log10
+
+# list of file size bases along with the post K/M/G/T prefix letter
+# ie KiB vs KB
+SIZE_BASES = {
+    10:{
+        'TB' : 10**12,
+        'GB' : 10**9,
+        'MB' : 10**6,
+        'KB' : 10**3,
+    },
+    2:{
+        'TiB': 2**40,
+        'GiB': 2**30,
+        'MiB': 2**20,
+        'KiB': 2**10,
+    }
+}
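+# illustrative examples (using the pretty_size() helper defined below):
+#   pretty_size(1536, base=2)  -> '1.50KiB'
+#   pretty_size(1536, base=10) -> '1.54KB'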
+
+def main(argv=None):
+    parser = argparse.ArgumentParser(
+                        description="A utility for finding and dealing with duplicate files",
+                        epilog = "How much disk space can you save?")
+    parser.add_argument('--size-only', metavar="SIZE", type=str,
+                        default=str(sys.maxint),
+                        help="Only use size comparison on files "
+                        "larger than SIZE")
+    parser.add_argument('--use-hash', type=str, default="sha1",
+                        help="Cryptographic hash to use (must be in hashlib!)")
+    parser.add_argument('--extra-hashes', type=str, default="", nargs="+",
+                        help="List of hashes to be carried out in further passes, "
+                        "but only upon an initial match.")
+    parser.add_argument('--dupe-cost', action="store_true", default=False,
+                        help="Calculate and display the space cost of duplication, "
+                        "i.e. how much space could be reclaimed by removing duplicates.")
+    parser.add_argument('-b','--human-readable',metavar="BASE",
+                        type=int, default=0, choices=SIZE_BASES.keys(),
+                        help="Make file sizes human readable in base BASE")
+    parser.add_argument('--max-size', type=str, default=str(sys.maxint),
+                        help="Ignore files larger than MAX_SIZE")
+    parser.add_argument('--min-size', type=str, default="0",
+                        help="Ignore files smaller than MIN_SIZE")
+    parser.add_argument('-v', '--verbose', action="store_true", default=False,
+                        help="Display progress information on STDERR")
+    parser.add_argument('-c', '--summary-only', action="store_true",
+                        default=False,
+                        help="Display only summary information, i.e. without a "
+                        "list of duplicates. Can be used with --verbose to "
+                        "display progress without listing files.")
+    parser.add_argument('-a', '--prompt-for-action', action="store_true",
+                        default=False,
+                        help="Prompt for action on each set of duplicates.")
+    parser.add_argument('path', type=str, nargs='+',
+                        help="paths to search")
+    parser.add_argument('-e','--extension', type=str, default=None, nargs='+',
+                        help="Limit search to files of a certain extension.")
+    parser.add_argument('--invert', action="store_true",default=False,
+                        help="Invert selection of extensions, i.e. negative match.")
+
+    args = parser.parse_args(argv)
+    args.final_byte_check = False
+    args.size_only, args.max_size, args.min_size = map(size_to_int, [args.size_only, args.max_size, args.min_size])
+    find_duplicates(args.path, args)
+
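+# Example invocation (illustrative):
+#   python deduper/deduper.py --use-hash md5 -b 2 --dupe-cost ~/Pictures
+# reports duplicate sets under ~/Pictures using MD5 and prints sizes in KiB/MiB.
+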
+def find_files(args, ext=None, invert=False):
+    """Find all files in the search path optionally matching the extension.
+
+    Keyword arguments:
+    ext -- filename extension (default None = all extensions)
+    """
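+    # e.g. (illustrative): find_files(['/tmp'], ext=['.txt']) yields every *.txt
+    # file under /tmp; note that extensions must include the leading dot, since
+    # they are compared against os.path.splitext(f)[1].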
+    for ddir in args:
+        if os.path.isdir(ddir):
+            for root, dirs, fnames in os.walk(ddir):
+                for f in fnames:
+                    if ext is None or ((os.path.splitext(f)[1] in ext) != invert):
+                        yield os.path.join(root, f)
+        else:
+            if ext is None or ((os.path.splitext(ddir)[1] in ext) != invert):
+                yield ddir
+
+def group_pairs(pairs):
+    """ Group key-value pairs based on identical first item (key).
+
+       This function is passed an iterable each of whose values is a pair;
+       it yields a sequence of pairs whose first element is the identical first
+       element from the original pairs, and whose second element is a list of
+       second elements corresponding to the same first element. Only adjacent
+       pairs sharing a first element are grouped together, so if the grouping
+       is required to be global, you should pass in 'sorted(pairs)' rather than
+       the raw iterable.  E.g.:
+
+       >>> things = [(1,'foo'), (1,'bar'), (2, 'baz')]
+       >>> list(group_pairs(things))
+       [(1, ['foo', 'bar']), (2, ['baz'])]
+    """
+    for idx, vals in groupby(pairs, lambda pair: pair[0]):
+        yield (idx, [v[1] for v in vals])
+
+def find_duplicates(dirs, opts):
+    """Find the duplicate files in the given root directory(ies).
+
+        Arguments:
+
+        dirs -- an iterable of strings containing directories to search
+        opts -- a namespace type object (e.g. from argparse) containing
+                the following arguments:
+                -- summary_only      -- do not display list of duplicate files,
+                                        only display final statistics
+                -- use_hash          -- specify which hash to use,
+                                        hash must be in hashlib
+                -- verbose           -- display progress information during
+                                        computation of file sizes and
+                                        initial hashing
+                -- prompt_for_action -- prompt for action on each set of
+                                        duplicates; logically mutually
+                                        exclusive with summary_only
+                -- extension         -- file name extensions to restrict search
+                -- invert            -- negative matching for file name extension
+                -- max_size          -- ignore files larger than this size
+                -- min_size          -- ignore files smaller than this size
+                -- size_only         -- comparisons only on size for files
+                                        larger than this (i.e. no hashes)
+                -- dupe_cost         -- calculate the cost of duplication
+                -- human_readable    -- the base to use for pretty printing
+                                        the size; 0 for no pretty printing
+    """
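+    # Programmatic use (illustrative) -- any namespace-like object with the
+    # attributes above (plus extra_hashes and final_byte_check) works, e.g.:
+    #   opts = argparse.Namespace(use_hash='sha1', extra_hashes='', verbose=False,
+    #                             summary_only=True, prompt_for_action=False,
+    #                             extension=None, invert=False, dupe_cost=False,
+    #                             size_only=sys.maxint, max_size=sys.maxint,
+    #                             min_size=0, human_readable=0, final_byte_check=False)
+    #   find_duplicates(['/some/path'], opts)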
+
+    # the selected hash in string and function form
+    hashname = opts.use_hash.upper()
+    hashfnc = getattr(hashlib, opts.use_hash)
+    extra_hashfncs = [getattr(hashlib, h) for h in opts.extra_hashes]
+    extra_hashnames = opts.extra_hashes
+
+    # Organize all filenames according to size.
+    count = 0           # number of files examined
+
+    # initialize sqlite database
+    if os.path.exists('sz.db'):
+        os.remove('sz.db')
+    conn = sqlite3.connect('sz.db')
+    c = conn.cursor()
+    c.execute('create table files_by_size (size int, fname text)')
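+    # sz.db is a throwaway on-disk scratch index: it lets sqlite do the
+    # size-range filtering and sorting below without holding every path in memory.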
+
+    if opts.verbose:
+        print("Checking sizes (each '.' is 100k files):", file=stderr)
+
+    # traverse the directory tree, count the files and get their size
+    # this gets sizes for all files, even files in the ignore range
+    for fn in find_files(dirs, opts.extension, opts.invert):
+        if not os.path.isfile(fn):
+            continue
+        count += 1
+        sz = os.path.getsize(fn)
+        c.execute('insert into files_by_size values (?, ?)',
+                  (sz, unicode(fn, 'utf-8')))
+        if opts.verbose and count % 100000 == 0:
+            stderr.write('.')
+
+    conn.commit()
+
+    if opts.verbose:
+        print("\nFound sizes on {0} files...".format(count), file=stderr)
+
+    # retrieve the files sorted by size, for min_size <= size <= max_size
+    c.execute('''select size, fname from files_by_size
+                 where size<=? and size>=?
+                 order by size desc''', (opts.max_size, opts.min_size))
+
+    if opts.verbose:
+        print("Grouping files by {0} (each '.' is 5k groups):".format(hashname),
+              file=stderr)
+
+    distincts = 0           # number of distinct sets of duplicates
+    dupe_cost = 0
+    base = opts.human_readable
+    null_header = False     # has the label for empty files been printed?
+    empties = 0             # number of empty files
+
+    # the call to sqlite should have already sorted this list as required
+    for sz, fnames in group_pairs(c):
+        if sz == 0:
+            if not null_header:
+                if not opts.summary_only:
+                    print("Size: 0 : Content: ''")
+                null_header = True
+            for f in fnames:
+                if not opts.summary_only:
+                    print(' ', f)
+                empties += 1
+        else:
+            # We have accumulated some dups that need to be printed
+            if len(fnames) > 1:
+                hashes = []
+                # compute hashes only for files smaller than size_only
+                # otherwise go ahead and print sets of size_only matches
+
+                if sz <= opts.size_only:
+                    for f in fnames:
+                        # some temporary / sqlite-journalling files get caught,
+                        # but then disappear; this basically skips any missing
+                        # files. Other calls to open don't skip -- they assume
+                        # that if you made it past this check, life must be okay.
+                        # So be aware that changing the filesystem while
+                        # traversing it will cause problems!
+                        try:
+                            with open(f) as fh:
+                                content = fh.read()
+                            hashes.append((hashfnc(content).hexdigest(), f))
+                        except IOError as ioe:
+                            print(ioe)
+                            print("Skipping {}".format(f))
+                    hashes.sort()
+                else:
+                    if not opts.summary_only:
+                        print('Size: {size}'.format(size=pretty_size(sz, base=base)))
+                        for f in fnames:
+                            print(' ', f)
+                        if opts.dupe_cost:
+                            print("Potentially duplicated space: {}".format(pretty_size(sz * (len(fnames)-1), base=base)))
+                        print('--')
+                    distincts += 1
+                    dupe_cost += sz * (len(fnames)-1)
+
+
+                for idx, vals in group_pairs(hashes):
+                    # if there is more than one value per hash-set, then
+                    # that is another distinct set of duplicates
+                    if len(vals) > 1:
+                        extra_match, hash_msgs =  additional_tests(vals,
+                                                    extra_hashfncs,
+                                                    extra_hashnames,
+                                                    opts.final_byte_check)
+                        distincts += 1
+                        dupe_cost += sz * (len(vals)-1)
+                        if not opts.summary_only:
+                            # if the size of the file is non-trivial,
+                            # then print the hash, else just print the contents
+                            if sz > 40:
+                                print(u'Size: {size}: {hname}:{hmsg} '
+                                        '{extra}'.format(size=pretty_size(sz,base=base),
+                                        hname=hashname, hmsg=idx,
+                                        extra=hash_msgs))
+                            else:
+                                with open(vals[0]) as fh:
+                                    content = fh.read()
+
+                                print('Size: {size}: Content: {con}'.format(size=pretty_size(sz,base=base),con=repr(content)))
+                            # for now, we don't print the cost of duplication
+                            # when taking action -- this seems like it would be
+                            # a complicated and dynamic affair
+                            if opts.dupe_cost:
+                                print("Duplicated space: {}".format(pretty_size(sz * (len(vals)-1), base=base)))
+
+                            if opts.prompt_for_action:
+                                action_on_file_list(vals)
+                            else:
+                                for fn2 in vals:
+                                    print(' ', fn2)
+
+                            # print a dot for every 5000 sets of duplicates
+                            if opts.verbose and distincts % 5000 == 0:
+                                stderr.write('.')
+
+
+    if opts.verbose or opts.summary_only:
+        print("\nFound {0} empty files".format(empties), file=stderr)
+        print("Found {0} non-empty duplicate sets".format(distincts),
+                file=stderr)
+    if opts.dupe_cost:
+        print("\nSpace cost of duplicates: {}".format(pretty_size(dupe_cost,base=base)), file=stderr)
+
+def additional_tests(fnames, hashfncs, hashnames, ncheck_bytes):
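+    """ Cross-check a set of candidate duplicates with extra hash functions.
+
+        Returns a (match, messages) pair: match is False if any of the extra
+        hashes splits the set, and messages names the hashes that disagreed.
+        The byte-by-byte comparison controlled by ncheck_bytes is currently a
+        placeholder and always reports a match.
+    """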
+    if len(hashfncs) == 0:
+        hash_match = True
+        hash_msgs = ""
+    else:
+        # place holder code!
+        hashes = dict()
+        hash_match = True
+        hash_msgs = ""
+
+        for f in fnames:
+            with open(f) as fh:
+                content = fh.read()
+            for (fnc,name) in zip(hashfncs,hashnames):
+                if name in hashes:
+                    hashes[name].append((fnc(content).hexdigest(),f))
+                else:
+                    hashes[name] = [(fnc(content).hexdigest(),f)]
+
+        for h in hashnames:
+            if len(list(group_pairs(hashes[h]))) > 1:
+                hash_msgs += "\n\t*****{} does not match*****".format(h)
+                hash_match = False
+
+        if hash_msgs != "":
+            hash_msgs = u"Extra hashes: {}".format(hash_msgs)
+
+    if ncheck_bytes == -1:
+        byte_match = True
+    else:
+        # place holder code!
+        byte_match = True
+
+    return hash_match and byte_match, hash_msgs
+
+def hyphen_range(s):
+    """ yield each integer from a complex range string like "1-9,12, 15-20,23"
+
+    from http://code.activestate.com/recipes/577279-generate-list-of-numbers-from-hyphenated-and-comma/
+
+    >>> list(hyphen_range('1-9,12, 15-20,23'))
+    [1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 18, 19, 20, 23]
+
+    >>> list(hyphen_range('1-9,12, 15-20,2-3-4'))
+    Traceback (most recent call last):
+        ...
+    ValueError: format error in 2-3-4
+    """
+    if s == "":
+        raise StopIteration("empty range!")
+
+    for x in s.split(','):
+        elem = x.split('-')
+        if len(elem) == 1: # a number
+            yield int(elem[0])
+        elif len(elem) == 2: # a range inclusive
+            start, end = map(int, elem)
+            for i in xrange(start, end+1):
+                yield i
+        else: # more than one hyphen
+            raise ValueError('format error in {0}'.format(x))
+
+def remove_file(fname):
+    """ Remove a file.
+
+        Currently, this just invokes os.remove() to delete the file.
+        Future releases will support alternative action, e.g. moving all
+        duplicates to a single folder.
+    """
+    os.remove(fname)
+
+def action_on_file_list(fnames,**opts):
+    """ Prompt for action on a set of duplicates. """
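+    # e.g. (illustrative) with three duplicates the prompt looks like:
+    #   [0] /path/a.jpg
+    #   [1] /path/b.jpg
+    #   [2] /path/c.jpg
+    #     Entries to delete: 1-2
+    # which removes b.jpg and c.jpg and keeps a.jpg.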
+    for i in range(len(fnames)):
+        print(u"[{0:>{width}}] {1}".format(
+                i, fnames[i], width=int(log10(len(fnames)))))
+
+    items = list(hyphen_range(raw_input("  Entries to delete: ")))
+    if len(items) > 0:
+        for i in items:
+            remove_file(fnames[i])
+    else:
+        items = list(hyphen_range(raw_input("  Entries to keep "
+                                            "(all others will be deleted, "
+                                            "enter none to keep all): ")))
+        if len(items) > 0:
+            for i in range(len(fnames)):
+                if i not in items:
+                    remove_file(fnames[i])
+    print("")
+
+def pretty_size(bytes,base=2):
+    """ Pretty print the size of a file using the given base."""
+    global SIZE_BASES
+
+    if base == 0:
+        return bytes
+    elif base not in SIZE_BASES:
+        raise ValueError("Invalid metric prefix base: {}".format(base))
+    else:
+        for suffix in sorted(SIZE_BASES[base], key=SIZE_BASES[base].get, reverse=True):
+            if bytes > SIZE_BASES[base][suffix]:
+                return "{0:.2f}{1}".format(bytes / SIZE_BASES[base][suffix], suffix)
+        else:
+            # we can always fall back to non-pretty-printed output
+            return bytes
+
+def size_to_int(size):
+    """ Expand the size given with metric/binary suffixes."""
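+    # e.g. (illustrative): size_to_int('500 KB') -> 500000, size_to_int('2 MiB') -> 2097152
+    # (a separator character is expected before the suffix: the parser strips
+    # len(suffix) + 1 characters from the end before converting to a number)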
+    size = size.strip()
+
+    if size.isdigit():
+        return int(size)
+
+    global SIZE_BASES
+
+    for b in SIZE_BASES:
+        for suffix in sorted(SIZE_BASES[b], key=SIZE_BASES[b].get, reverse=True):
+            if size.endswith(suffix):
+                s = int(float(size[:-(len(suffix)+1)]) * SIZE_BASES[b][suffix])
+                return s
+    else:
+        raise ValueError("Invalid Suffix on {}".format(size))
+
+if __name__ == '__main__':
+    sys.exit(main())
+

File setup.cfg

+[egg_info]
+tag_build = dev
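+# appends a "dev" tag to the version for development snapshots
+# (e.g. 0.2 becomes 0.2dev when setuptools' egg_info runs)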

File setup.py

+from setuptools import setup, find_packages
+import sys, os
+
+version = '0.2'
+
+def read(fname):
+    ''' Read a file in as a string. '''
+    return open(os.path.join(os.path.dirname(__file__), fname)).read()
+
+setup(name='Deduper',
+    packages=find_packages(exclude=['ez_setup', 'examples', 'tests']),
+    version=version,
+    author = "Phillip Alday",
+    author_email = "phillip.alday@staff.uni-marburg.de",
+    description="A utility to find duplicated files, regardless of file name.",
+    license = "GPLv3+",
+    keywords = "deduplication",
+    url = "https://bitbucket.org/palday/deduper",
+    long_description=read('README.rst'),
+    include_package_data=True,
+    zip_safe=False,
+    classifiers=[
+        "Development Status :: 3 - Alpha",
+        "Intended Audience :: End Users/Desktop",
+        "Intended Audience :: Information Technology",
+        "Intended Audience :: Science/Research",
+        "Intended Audience :: System Administrators",
+        "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
+        "Programming Language :: Python",
+        "Topic :: Utilities"
+    ],    
+)
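+
+# Typical usage (illustrative): `python setup.py sdist` builds a source
+# distribution; `python setup.py install` installs the package.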

File tests/__init__.py

+#! /usr/bin/env python
+# -*- coding: UTF-8 -*-
+#
+# Copyright (C) 2012 Phillip Alday <phillip.alday@staff.uni-marburg.de>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.