Commits

Anonymous committed 460a826

find-dups can do find-dups2's job now, so remove find-dups2.

  • Participants
  • Parent commits f9f2a84

Comments (0)

Files changed (2)

File python/find-dups

 #!/usr/bin/env python
 
-"""
+r"""
 Find duplicate files.
 
-Usage: find-dups [--canonical=prefix] {dir}
+Usage: find-dups [--canonical=<prefix>] [--format=<fmt>] {dir}
 
-If a duplicate is found, and exactly one of the copies has the prefix
-given by `canonical`, commands to remove all the other copies are output.
+Any number of directories may be given.
+
+If --canonical is not given, list all duplicate files found in those
+directories, indexed by their md5 hashes.
+
+If --canonical is given, it is a prefix; for each set of duplicates in
+which exactly one copy has that prefix, list the copies that do not
+have it.
+
+If --format is given, apply that format string to output filenames.
+For example,
+
+    find-dups foo --canonical=foo/bar --format="rm \"%s\""
+
+...will produce a script to remove the duplicate files found somewhere
+in foo other than in foo/bar.
 
 """
 
     parser.add_option("--canonical",
                       dest="canonical",
                       default=None,
-                      help="output commands to delete all duplicates "
+                      help="list all duplicate files found "
                            "that do not have this prefix")
+    parser.add_option("--format",
+                      dest="format",
+                      default="%s",
+                      help="format string for output filenames")
     (options, args) = parser.parse_args()
-    
+
     hashmap = {}
     for directory in args:
+        print "traversing %s..." % directory
         for root, dirs, files in os.walk(directory):
             for filename in files:
                 full = os.path.normpath(os.path.join(root, filename))
                 hash = md5(full)
                 hashmap.setdefault(hash, []).append(full)
 
-    for hash in hashmap:
-        filenames = sorted(hashmap[hash])
-        if len(filenames) > 1:
-            print "# %s:" % hash
-            for filename in filenames:
-                print "#   %s" % filename
-
-    if options.canonical is not None:
+    if options.canonical is None:
+        for hash in hashmap:
+            filenames = sorted(hashmap[hash])
+            if len(filenames) > 1:
+                print "# %s:" % hash
+                for filename in filenames:
+                    print "#   %s" % filename
+    else:
         print
         for hash in hashmap:
             filenames = sorted(hashmap[hash])
                     if filename.startswith(options.canonical):
                         canonicals.append(filename)
                 if len(canonicals) == 1:
-                    print "# delete all except %s" % canonicals[0]
                     for filename in filenames:
                         if not filename.startswith(options.canonical):
-                            print 'rm "%s"' % filename
-                    print
+                            print options.format % filename
 
 if __name__ == '__main__':
     main(sys.argv)

File python/find-dups2

-#!/usr/bin/env python
-
-"""
-For each file somewhere in a, list all duplicates of it somewhere in b.
-
-a may be somewhere inside b.  a will not be traversed during b traversal.
-
-Usage: find-dups2 a b
-"""
-
-import hashlib
-import os
-import sys
-
-
-def md5(filename):
-    """Compute and return the MD5 hash of the named file.
-
-    """
-    hash = hashlib.md5()
-    file = open(filename, "r")
-    eof = False
-    while not eof:
-        data = file.read(128)
-        if data:
-            hash.update(data)
-        else:
-            eof = True
-    file.close()
-    return hash.hexdigest()
-
-
-def build_hashmap(dir, exclude=None):
-    hashmap = {}
-    for root, dirs, files in os.walk(dir):
-        if os.path.normpath(root) == exclude:
-            print "(skipping %s)" % exclude
-            dirs[:] = []
-        else:
-            for filename in files:
-                full = os.path.normpath(os.path.join(root, filename))
-                try:
-                    hash = md5(full)
-                    hashmap.setdefault(hash, []).append(full)
-                except IOError as e:
-                    print str(e)
-    return hashmap
-
-
-### MAIN ###
-
-
-def main(argv):
-    dir_a = os.path.normpath(argv[1])
-    dir_b = os.path.normpath(argv[2])
-    print "traversing %s..." % dir_a
-    a = build_hashmap(dir_a)
-    print "traversing %s..." % dir_b
-    b = build_hashmap(dir_b, exclude=dir_a)
-    print ""
-    for hash in a:
-        if hash in b:
-            for filename in a[hash]:
-                print filename
-            print "... duplicated in :"
-            for filename in b[hash]:
-                print filename
-            print ""
-
-if __name__ == '__main__':
-    main(sys.argv)