Commits

Phillip Alday committed b2c9d42

Added option to compute cost of duplication
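
The new --dupe-cost flag reports how much space the duplicates waste: for
each set of n identical files of size sz, the redundant space is
sz * (n - 1), summed over all duplicate sets and printed in the final
summary. A minimal sketch of that computation (the duplication_cost helper
below is illustrative, not part of the committed code):

    def duplication_cost(duplicate_sets):
        """Sum wasted bytes over (size, [filenames]) pairs."""
        total = 0
        for size, fnames in duplicate_sets:
            # one copy is kept; every additional copy is wasted space
            if len(fnames) > 1:
                total += size * (len(fnames) - 1)
        return total

    # e.g. two 100-byte copies waste 100 bytes; three 40-byte copies waste 80
    assert duplication_cost([(100, ['a', 'b']), (40, ['x', 'y', 'z'])]) == 180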

  • Participants
  • Parent commits ea3030a

Comments (0)

Files changed (1)

 # -*- coding: UTF-8 -*-
 #
 # Copyright (C) 2012 Phillip Alday <phillip.alday@staff.uni-marburg.de>
-# 
+#
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
-# 
+#
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
-# 
+#
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
-# 
-# This program incorporates work by David Mertz and Martin Blais, 
+#
+# This program incorporates work by David Mertz and Martin Blais,
 # previously released as CC0 (public domain).
-# The original CC0 code can be accessed as find-duplicate-contents.py 
+# The original CC0 code can be accessed as find-duplicate-contents.py
 # in earlier revisions and was "released" as version "-1.0"
 
 """ deduper.py: find (and remove) duplicate files.
-    
+
     Given a root directory, recurse in it and find all the duplicate files:
     files that have the same contents, but not necessarily the same filename.
 """
-from __future__ import print_function 
+from __future__ import print_function
 import sys
 import os
 import os.path
 
 def main(argv=None):
     parser = argparse.ArgumentParser()
-    parser.add_argument('--size-only', metavar="SIZE", type=int, 
+    parser.add_argument('--size-only', metavar="SIZE", type=int,
                         default=sys.maxint,
                         help="Only use size comparison on files "
                         "larger than SIZE")
     parser.add_argument('--extra-hashes', type=str, default="", nargs="+",
                         help="List of hashes to be carried out in further passes"
                         "but only upon an initial match.")
 +    parser.add_argument('--dupe-cost', action="store_true", default=False,
 +                        help="Compute the space cost of duplication, i.e. the "
 +                        "bytes that could be freed by removing redundant copies.")
     parser.add_argument('--max-size', type=int, default=sys.maxint,
                         help="Ignore files larger than MAX_SIZE")
     parser.add_argument('--min-size', type=int, default=0,
                         help="Ignore files smaller than MIN_SIZE")
     parser.add_argument('-v', '--verbose', action="store_true", default=False,
                         help="Display progress information on STDERR")
-    parser.add_argument('-c', '--summary-only', action="store_true", 
-                        default=False, 
+    parser.add_argument('-c', '--summary-only', action="store_true",
+                        default=False,
                         help="Display only summary information, i.e. without a "
                         "list of duplicates. Can be used with --verbose to "
                         "display progress without listing files.")
                         help="Limit search to files of a certain extension.")
 
     args = parser.parse_args(argv)
-    args.final_byte_check = False 
+    args.final_byte_check = False
     find_duplicates(args.path, args)
 
 def find_files(args, ext=None):
     """Find all files in the search path optionally matching the extension.
-    
+
     Keyword arguments:
     ext -- filename extension (default None = all extensions)
     """
 
 def group_pairs(pairs):
     """ Group key-value pairs based on identical first item (key).
-        
+
         This function is passed an iterable each of whose values is a pair;
        it yields a sequence of pairs whose first element is the identical first
        element from the original pairs, and whose second element is a list of
 
 def find_duplicates(dirs, opts):
     """Find the duplicate files in the given root directory(ies).
-        
+
         Arguments:
-        
+
         dirs -- an iterable of strings containing directories to search
-        opts -- a namespace type object (e.g. from argparse) containing 
+        opts -- a namespace type object (e.g. from argparse) containing
                 the following arguments:
-                -- summary_only      -- do not display list of duplicate files, 
+                -- summary_only      -- do not display list of duplicate files,
                                         only display final statistics
-                -- use_hash          -- specify which hash to use, 
+                -- use_hash          -- specify which hash to use,
                                         hash must be in hashlib
-                -- verbose           -- display progress information during 
-                                        computation of file sizes and 
+                -- verbose           -- display progress information during
+                                        computation of file sizes and
                                         initial hashing
-                -- prompt_for_action -- prompt for action on each set of 
-                                        duplicates logical mutually exclusive 
+                -- prompt_for_action -- prompt for action on each set of
 +                                        duplicates; logically mutually exclusive
                                         with summary_only
                 -- extension         -- file name extension to restrict search
-                -- max_size          -- ignore files larger than this size 
+                -- max_size          -- ignore files larger than this size
                 -- min_size          -- ignore files smaller than this size
-                -- size_only         -- comparisons only on size for files 
-                                        larger than this (i.e. no hashes)              
+                -- size_only         -- comparisons only on size for files
+                                        larger than this (i.e. no hashes)
 +                -- dupe_cost         -- calculate the cost of duplication
     """
-    
+
     # the selected hash in string and function form
     hashname = opts.use_hash.upper()
     hashfnc = eval("hashlib.{0}".format(opts.use_hash))
     extra_hashfncs = [eval("hashlib.{0}".format(h)) for h in opts.extra_hashes]
     extra_hashnames = opts.extra_hashes
-    
+
     # Organize all filenames according to size.
-    count = 0           # number of files examined   
-    
+    count = 0           # number of files examined
+
     # initialize sqlite database
     if os.path.exists('sz.db'):
         os.remove('sz.db')
     conn = sqlite3.connect('sz.db')
     c = conn.cursor()
     c.execute('create table files_by_size (size int, fname text)')
-    
+
     if opts.verbose:
         print("Checking sizes (each '.' is 100k files):", file=stderr)
-    
-    # traverse the directory tree, count the files and get their size 
+
+    # traverse the directory tree, count the files and get their size
     # this gets sizes for all files, even files in the ignore range
     for fn in find_files(dirs, opts.extension):
         if not os.path.isfile(fn):
                   (sz, unicode(fn, 'utf-8')))
         if opts.verbose and count % 100000 == 0:
             stderr.write('.')
-    
+
     conn.commit()
-    
+
     if opts.verbose:
         print("\nFound sizes on {0} files...".format(count), file=stderr)
 
                  order by size desc''', (opts.max_size, opts.min_size))
 
     if opts.verbose:
-        print("Grouping files by {0} (each '.' is 5k groups):".format(hashname), 
+        print("Grouping files by {0} (each '.' is 5k groups):".format(hashname),
               file=stderr)
 
     distincts = 0           # number of distinct sets of duplicates
-    null_header = False     # has the label for empty files been printed?  
 +    dupe_cost = 0           # total bytes occupied by redundant copies
+    null_header = False     # has the label for empty files been printed?
     empties = 0             # number of empty files
 
     # the call to sqlite should have already sorted this list as required
                 hashes = []
                 # compute hashes only for files smaller than size_only
                 # otherwise go ahead and print sets of size_only matches
+
                 if sz <= opts.size_only:
                     for f in fnames:
                         with open(f) as fh:
                         print('Size: {size}'.format(size=sz))
                         for f in fnames:
                             print(' ', f)
+                        if opts.dupe_cost:
 +                            print("Potentially duplicated space: {} bytes".format(sz * (len(fnames)-1)))
                         print('--')
                     distincts += 1
-                
+                    dupe_cost += sz * (len(fnames)-1)
+
 
                 for idx, vals in group_pairs(hashes):
                     # if there is more than one value per hash-set, then
                     # that is another distinct set of duplicates
                     if len(vals) > 1:
-                        extra_match, hash_msgs =  additional_tests(vals, 
-                                                    extra_hashfncs, 
-                                                    extra_hashnames, 
 +                        extra_match, hash_msgs = additional_tests(vals,
+                                                    extra_hashfncs,
+                                                    extra_hashnames,
                                                     opts.final_byte_check)
                         distincts += 1
+                        dupe_cost += sz * (len(vals)-1)
                         if not opts.summary_only:
-                                # if the size of the file is non trivial, 
 +                            # if the size of the file is non-trivial,
                             # then print the hash, else just print the contents
                             if sz > 40:
                                 print(u'Size: {size}: {hname}:{hmsg} '
-                                        '{extra}'.format(size=sz, 
+                                        '{extra}'.format(size=sz,
                                         hname=hashname, hmsg=idx,
                                         extra=hash_msgs))
                             else:
                                 with open(vals[0]) as fh:
                                     content = fh.read()
-    
+
                                 print('Size: {size}: Content: {con}'.format(
                                         size=sz,con=repr(content)))
+                            # for now, we don't print the cost of duplication
+                            # when taking action -- this seems like it would be
+                            # a complicated and dynamic affair
+                            if opts.dupe_cost:
 +                                print("Duplicated space: {} bytes".format(sz * (len(vals)-1)))
+
                             if opts.prompt_for_action:
                                 action_on_file_list(vals)
                             else:
                                 for fn2 in vals:
                                     print(' ', fn2)
+
                             # print a dot for every 5000 sets of duplicates
                             if opts.verbose and distincts % 5000 == 0:
                                 stderr.write('.')
 
     if opts.verbose or opts.summary_only:
         print("\nFound {0} empty files".format(empties), file=stderr)
-        print("Found {0} non-empty duplicate sets".format(distincts), 
+        print("Found {0} non-empty duplicate sets".format(distincts),
                 file=stderr)
+    if opts.dupe_cost:
+        print("\nSpace cost of duplicates: {}".format(dupe_cost), file=stderr)
 
 
 def additional_tests(fnames, hashfncs, hashnames, ncheck_bytes):
         hashes = dict()
         hash_match = True
         hash_msgs = ""
-        
+
         for f in fnames:
             with open(f) as fh:
                 content = fh.read()
                     hashes[name].append((fnc(content).hexdigest(),f))
                 else:
                     hashes[name] = [(fnc(content).hexdigest(),f)]
-        
+
         for h in hashnames:
             if len(list(group_pairs(hashes[h]))) > 1:
                 hash_msgs += "\n\t*****{} does not match*****".format(h)
                 hash_match = False
-        
+
         if hash_msgs != "":
             hash_msgs = u"Extra hashes: {}".format(hash_msgs)
 
 
 def remove_file(fname):
     """ Remove a file.
-    
-        Currently, this just invokes os.remove() to delete the file. 
-        Future releases will support alternative action, e.g. moving all 
+
+        Currently, this just invokes os.remove() to delete the file.
+        Future releases will support alternative action, e.g. moving all
         duplicates to a single folder.
     """
     os.remove(fname)
 
-def action_on_file_list(fnames):
 +def action_on_file_list(fnames, **opts):
     """ Prompt for action on a set of duplicates. """
     for i in range(len(fnames)):
         print(u"[{0:>{width}}] {1}".format(