Commits

Anonymous committed 15cfe09

dedup

  • Participants
  • Parent commits a65c996

Comments (0)

Files changed (2)

File ocaml/OMakefile

-########################################################################
-# Permission is hereby granted, free of charge, to any person
-# obtaining a copy of this file, to deal in the File without
-# restriction, including without limitation the rights to use,
-# copy, modify, merge, publish, distribute, sublicense, and/or
-# sell copies of the File, and to permit persons to whom the
-# File is furnished to do so, subject to the following condition:
-#
-# THE FILE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
-# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
-# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE FILE OR
-# THE USE OR OTHER DEALINGS IN THE FILE.
-
-########################################################################
-# The standard OMakefile.
-# You will usually need to modify this file for your project.
-
 OCAML_WHERE=$(shell ocamlc -where)
 
-########################################################################
-# Phony targets are scoped, so you probably want to declare them first.
-#
-
 .PHONY: all install clean
 
-########################################################################
-# Subdirectories.
-# You may want to include some subdirectories in this project.
-# If so, define the subdirectory targets and uncomment this section.
-#
-
 # .SUBDIRS:
 
-########################################################################
-# C configuration.
-# Delete this section if you are not building C files.
-#
-
-################################################
-# Configuration.  You might want to modify any of these
-# configuration variables.
-#
-
 CFLAGS += -g
 # ASFLAGS +=
 # LDFLAGS +=
 INCLUDES += $(OCAML_WHERE)
 
-################################################
-# Uncomment the following section if you want
-# to build a C program in the current directory.
-#
-
 # CFILES[] =
 #    file1
 #    main
 #
 # .DEFAULT: $(CProgram $(MAIN), $(CFILES))
 
-################################################
-# Uncomment the following section if you want to build a C library
-# in the current directory.
-#
-
 LIBFILES[] =
     chello
 #    file1
 #
 LIB = libhello
 
-.DEFAULT: $(StaticCLibrary $(LIB), $(LIBFILES))
+# .DEFAULT: $(StaticCLibrary $(LIB), $(LIBFILES))
 
 ########################################################################
 # OCaml configuration.
 #    pack1
 #    pack2
 #
-# if $(not $(OCAMLFIND_EXISTS))
-#    eprintln(This project requires ocamlfind, but is was not found.)
-#    eprintln(You need to install ocamlfind and run "omake --configure".)
-#    exit 1
-
-#
-# Include path
-#
 # OCAMLINCLUDES +=
-
-#
-# Compile native or byte code? 
-#
-# The default values are defined as follows:
-#
-# NATIVE_ENABLED = $(OCAMLOPT_EXISTS)
-# BYTE_ENABLED = $(not $(OCAMLOPT_EXISTS))
-
 #
 # Various options
 #
 #
 # .DEFAULT: $(OCamlLibrary $(LIB), $(FILES))
 
-################################################
-# Build an OCaml program
-#
-
-# FILES[] =
-#    file1
-#    file2
-#
-# PROGRAM =
-# OCAML_LIBS += str
-# OCAML_CLIBS +=
-# OCAML_OTHER_LIBS += str
-# OCAML_LIB_FLAGS +=
 #
 # .DEFAULT: $(OCamlProgram $(PROGRAM), $(FILES))
 
 .DEFAULT: $(OCamlProgram cat, cat)
 .DEFAULT: $(OCamlProgram mysh, mysh)
+
 OCAML_OTHER_LIBS += str
 .DEFAULT: $(OCamlProgram grep, grep)
 .DEFAULT: $(OCamlProgram prime, prime)
 .DEFAULT: $(OCamlProgram qsort, qsort)
 
+OCAML_OTHER_LIBS += unix
+.DEFAULT: $(OCamlProgram dedup, dedup)
+
 OCAML_CLIBS += libhello
-.DEFAULT: $(OCamlProgram hello, hello)
+# .DEFAULT: $(OCamlProgram hello, hello)

File ocaml/dedup.ml

+
+
+module M = Map.Make (
+  struct
+    type t = string
+    let compare = Pervasives.compare
+  end)
+
+exception Bad_arg
+
+module LimitQ =
+struct
+  type 'a queue = ((int * 'a) list) * int (* limit *)
+  let empty limit =
+    if limit > 0 then ([], limit) else raise Bad_arg
+
+  let insert queue prio elt =
+    let list,limit = queue in
+    let rec ins rest = function
+      | [] -> List.rev_append rest [(prio,elt)];
+      | (p,e)::tl when prio < p ->
+        List.rev_append rest ((prio,elt)::(p,e)::tl);
+      | (p,e)::tl when prio >= p -> ins ((p,e)::rest) tl;
+      | _::_ -> raise Bad_arg
+    in
+    let list' = ins [] list in
+    if (List.length list') > limit then (List.tl list'),limit
+    else list',limit
+
+  exception Queue_is_empty
+  let pop = function
+    | [],_ -> raise Queue_is_empty
+    | hd::tl,limit -> (hd, (tl,limit))
+end
+    
+    
+
+type node = {
+  size: int;
+  checksum: string;
+  name: string;
+}
+
+let maybe_add s md5 path m =
+  let l = try M.find md5 m with Not_found -> [] in
+  let node = {size=s; checksum=md5; name=path} in
+  (s, md5, (M.add md5 (node::l) m));;
+
+let rec search_dup_fold m path =
+  let dirs = Sys.readdir path in
+  let rec fold total_size acc_md5 acc_m = function
+    | [] -> total_size, acc_md5, acc_m;
+    | hd::tl ->
+      begin
+        let s, md5, m2 = handle_name acc_m (Filename.concat path hd) in
+        let md5' = Digest.to_hex (Digest.string (md5 ^ acc_md5)) in
+        fold (total_size + s) (md5') m2 tl
+      end
+  in
+  let s,md5,m2 = fold 0 "" m (Array.to_list dirs) in
+  maybe_add s md5 path m2
+
+and handle_name m path =
+  if Sys.is_directory path then
+    search_dup_fold m path
+
+  else
+    let md5 = Digest.to_hex (Digest.file path) in
+    let stat = Unix.stat path in
+    let size = stat.Unix.st_size in
+    maybe_add size md5 path m
+
+let search_dup path =
+  Printf.printf "searching for dup files in %s\n" path;
+  let (_, _, t) = handle_name M.empty path in
+  t;;
+
+let print_tree t =
+  (* List.map (fun s -> print_endline s) t;; *)
+  M.iter (fun k nodes ->
+    Printf.printf "\n%s: " k;
+    List.iter (fun n -> Printf.printf "(%s %d)," n.name n.size) nodes)
+    t
+
+let sort_tree_by_size tree =
+  let rec sort k nodes q =
+    if List.length nodes > 1 then
+      let n = List.hd nodes in LimitQ.insert q n.size nodes
+    else q
+  in
+  M.fold sort tree (LimitQ.empty 64);;
+
+let rec print_list q =
+  try
+    let (p,nodes),nextq = LimitQ.pop q in
+    let n = List.hd nodes in
+    Printf.printf "%s \t(%d): " (String.sub n.checksum 0 6) p;
+    List.iter (fun n -> Printf.printf "%s " n.name) nodes;
+    print_endline "";
+    print_list nextq
+  with _ ->
+    ()
+
+let main () =
+  let path =
+    if Array.length Sys.argv = 1 then "."
+    else Sys.argv.(1)
+  in
+  let tree = search_dup path in
+  (* print_tree tree; *)
+  print_list (sort_tree_by_size tree);;
+
+
+let _ =
+  main ();;