Commits

Sebastien Mondet  committed 43626a2

Add more experiments (lwt-count, and trimmers)

  • Participants
  • Parent commits 0276a75

Comments (0)

Files changed (1)

File fastqbench.ml

     in
     loop ())
 
+(* [lwt_count_lines ~buffer_size file] reads [file] line by line with
+   [Lwt_io.read_line], using a channel buffer of [buffer_size] bytes, and
+   returns the number of lines read divided by 4 (the benchmark reports this
+   value in its reads column).
+
+   Only [End_of_file] terminates the loop. Any other exception is propagated
+   to the caller instead of being silently turned into a truncated count. *)
+let lwt_count_lines ~buffer_size file =
+  let count = ref 0 in
+  Lwt_io.with_file ~buffer_size ~mode:Lwt_io.input file (fun i ->
+    (* [true] when a line was read, [false] at end of input. The handler
+       wraps one single read so that handlers do not pile up across
+       iterations of the loop. *)
+    let read_one () =
+      Lwt.catch
+        (fun () -> Lwt_io.read_line i >>= fun _ -> return true)
+        (function
+        | End_of_file -> return false
+        | exn -> Lwt.fail exn) in
+    let rec loop () =
+      read_one () >>= fun got_line ->
+      if got_line then (incr count; loop ()) else return ()
+    in
+    loop ())
+  >>= fun () ->
+  return (!count / 4)
+    
 let enum_read_counter file =
   let e = BatFile.lines_of file in
   let count = BatEnum.hard_count e / 4 in
       BatIO.nread i buffer_size 
     with
       BatIO.No_more_input -> raise BatEnum.No_more_elements) in
+  let count = ref 0 in
+  let ec =
+    BatEnum.map (fun s ->
+      let lines_there =
+        BatString.fold_left (fun a b ->
+          a + (if b = '\n' then 1 else 0)) 0 s in
+      count := lines_there + !count;
+      s
+    ) e in
+  let _ = BatEnum.hard_count ec in
+  !count / 4
+
+(* [enum_transform_asking_for_buffer_size ~buffer_size file] reads [file] in
+   chunks of at most [buffer_size] bytes with [BatIO.nread], feeds the chunks
+   to [Biocaml_fastq.enum_parser] and returns the number of elements the
+   resulting enumeration produces.
+
+   The input channel is opened with [BatFile.with_file_in] so that it is
+   closed once counting is over, even if the parser raises; repeated
+   benchmark runs therefore do not accumulate open file descriptors. *)
+let enum_transform_asking_for_buffer_size ~buffer_size file =
+  BatFile.with_file_in file (fun i ->
+    let e = BatEnum.from (fun () ->
+      try
+        BatIO.nread i buffer_size
+      with
+        BatIO.No_more_input -> raise BatEnum.No_more_elements) in
+    let tr = Biocaml_fastq.enum_parser ~filename:file in
+    (* The enumeration is lazy: it has to be consumed entirely here, while
+       the channel is still open. *)
+    BatEnum.hard_count (tr e))
     
+(* [fastq_file_trimmer filename] builds the transform shared by the trimming
+   benchmarks. Text is parsed into FASTQ records, the records go through a
+   [`beginning 10] trimmer and then an [`ending 2] trimmer, [mix] pairs each
+   of them with a running counter that is appended to the record name, and
+   the result is printed back to text:
+
+     string --- fastq-record --- trimmed-fastq \
+                                                f --- named-fastq --- string
+     unit ----- count -------------------------/
+
+   Callers in this file feed it [(chunk, ())] pairs. *)
+let fastq_file_trimmer filename =
+  (* Produces 1, 2, 3, ... on successive calls to [next]; [feed] is a no-op. *)
+  let counter_transform =
+    object
+      val mutable id = 0
+      method feed () = ()
+      method next =
+        id <- id + 1;
+        `output id
+    end in
+  (* Appends the counter value to the name of the record. *)
+  let rename record index =
+    { record with Biocaml_fastq.name =
+        Printf.sprintf "%s -- %d" record.Biocaml_fastq.name index } in
+  let parsing = new Biocaml_fastq.fastq_parser ~filename () in
+  let trim_start = new Biocaml_fastq.trimmer (`beginning 10) in
+  let trim_end = new Biocaml_fastq.trimmer (`ending 2) in
+  let printing = new Biocaml_fastq.fastq_printer in
+  let trimmed =
+    Biocaml_transform.compose
+      (Biocaml_transform.compose parsing trim_start)
+      trim_end in
+  let numbered = Biocaml_transform.mix trimmed counter_transform ~f:rename in
+  Biocaml_transform.compose numbered printing
+
+(* Exception carrying an error reported by the trimming transform. It is
+   raised through [~error_to_exn] in [enum_trimmer] and through [Lwt.fail] in
+   [lwt_trimmer]. The [`left]/[`right] nesting presumably mirrors the
+   [compose]/[mix] nesting in [fastq_file_trimmer] (to be confirmed against
+   [Biocaml_transform]). *)
+exception Trim_error of
+  [ `left of
+      [ `left of
+          [ `left of
+              [ `left of Biocaml_fastq.parser_error
+              | `right of [ `invalid_size of int ] ]
+          | `right of [ `invalid_size of int ] ]
+      | `right of Biocaml_fastq.empty ]
+  | `right of Biocaml_fastq.empty ]
+
+(* [enum_trimmer ~input_buffer_size filename] reads [filename] in chunks of
+   at most [input_buffer_size] bytes with [BatIO.nread], pushes every chunk
+   (paired with [()]) through [fastq_file_trimmer filename], and writes the
+   produced strings to the file named by prefixing [filename] with [out_].
+
+   Raises [Trim_error] when the transform reports an error.
+
+   The input channel is managed by [BatFile.with_file_in], so it is closed
+   when the function returns or raises instead of being leaked on every
+   benchmark repetition. *)
+let enum_trimmer ~input_buffer_size filename =
+  let outfile = sprintf "out_%s" filename in
+  BatFile.with_file_in filename (fun i ->
+    let e = BatEnum.from (fun () ->
+      try
+        (BatIO.nread i input_buffer_size, ())
+      with
+        BatIO.No_more_input -> raise BatEnum.No_more_elements) in
+    let tr =
+      Biocaml_transform.enum_transformation
+        ~error_to_exn:(fun err -> raise (Trim_error err))
+        (fastq_file_trimmer filename) e in
+    (* The enumeration is lazy: all the reading happens inside this
+       iteration, while both files are open. *)
+    BatFile.with_file_out outfile (fun o ->
+      BatEnum.iter (fun s -> BatIO.nwrite o s) tr))
+
+(* [lwt_trimmer ~input_buffer_size ~output_buffer_size filename] is the
+   Lwt_io version of the trimming experiment. It reads [filename] in chunks
+   of at most [input_buffer_size] bytes, feeds each chunk (paired with [()])
+   to [fastq_file_trimmer filename], and writes every available output string
+   to the file named by prefixing [filename] with [lout_]. The two channels
+   use [input_buffer_size] and [output_buffer_size] as buffer sizes.
+
+   The returned thread fails with [Trim_error] when the transform reports an
+   error; it stops reading when [Lwt_io.read] returns the empty string. *)
+let lwt_trimmer ~input_buffer_size ~output_buffer_size filename =
+  let outfile = sprintf "lout_%s" filename in
+  let transform = fastq_file_trimmer filename in
+  (* Writes to [o] everything the transform can produce right now. *)
+  let rec drain o =
+    match transform#next with
+    | `output s ->
+      Lwt_io.write o s >>= fun () ->
+      drain o
+    | `not_ready -> return ()
+    | `error e -> Lwt.fail (Trim_error e)
+  in
+  (* Reads one chunk from [i], pushes it through the transform, repeats. *)
+  let rec pump i o =
+    Lwt_io.read ~count:input_buffer_size i >>= function
+    | "" -> return ()
+    | chunk ->
+      transform#feed (chunk, ());
+      drain o >>= fun () ->
+      pump i o
+  in
+  Lwt_io.with_file filename
+    ~buffer_size:input_buffer_size
+    ~mode:Lwt_io.input
+    (fun i ->
+      Lwt_io.with_file outfile
+        ~buffer_size:output_buffer_size
+        ~mode:Lwt_io.output
+        (fun o -> pump i o))
+  
+    
 let do_bench exp_name repetitions buffer_sizes files =
+  (* [lwt_count_bench file f name] times, for each element of
+     [buffer_sizes] in turn, [repetitions] sequential runs of
+     [f ~buffer_size file]. Once all measurements are done it prints one
+     table row per buffer size: the label [name buffer_size], the count
+     returned by the last run, the total time and the time per repetition. *)
+  let lwt_count_bench file f name =
+    let measure buffer_size =
+      let last_count = ref 0 in
+      let rec run remaining =
+        if remaining = 0 then return ()
+        else begin
+          f ~buffer_size file >>= fun r ->
+          last_count := r;
+          run (remaining - 1)
+        end
+      in
+      let start = Time.now () in
+      run repetitions >>= fun () ->
+      let elapsed = Time.(diff (now ()) start) in
+      return (buffer_size, !last_count, Core.Span.to_float elapsed)
+    in
+    let report (bufs, reads, time) =
+      Lwt_io.printf "{c|%s} {c|%d} {c|%f (%f)}\n"
+        (name bufs) reads time (time /. float repetitions)
+    in
+    Lwt_list.map_s measure buffer_sizes >>= fun lwt_io_results ->
+    Lwt_list.iter_s report lwt_io_results
+  in
+  (* [non_lwt_count_bench file f name] times [repetitions] runs of the
+     synchronous counter [f file], then prints one table row: the label
+     [name], the count returned by the last run, the total time and the time
+     per repetition. *)
+  let non_lwt_count_bench file f name =
+    let last_count = ref 0 in
+    let rec run remaining =
+      if remaining = 0 then return ()
+      else begin
+        last_count := f file;
+        run (remaining - 1)
+      end
+    in
+    let start = Time.now () in
+    run repetitions >>= fun () ->
+    let elapsed = Core.Span.to_float Time.(diff (now ()) start) in
+    Lwt_io.printf "{c|%s} {c|%d} {c|%f (%f)}\n"
+      name !last_count elapsed (elapsed /. float repetitions)
+  in
+  (* [non_lwt_trim_bench file f name] times [repetitions] runs of the
+     synchronous, unit-returning [f file], then prints one table row: the
+     label [name], the total time and the time per repetition. *)
+  let non_lwt_trim_bench file f name =
+    let rec run remaining =
+      if remaining = 0 then return ()
+      else begin
+        f file;
+        run (remaining - 1)
+      end
+    in
+    let start = Time.now () in
+    run repetitions >>= fun () ->
+    let elapsed = Core.Span.to_float Time.(diff (now ()) start) in
+    Lwt_io.printf "{c|%s} {c|%f (%f)}\n"
+      name elapsed (elapsed /. float repetitions)
+  in
+  (* [lwt_trim_bench file f name] times, for every pair of input and output
+     buffer sizes taken from [buffer_sizes], [repetitions] sequential runs of
+     [f ~input_buffer_size ~output_buffer_size file]. Once all measurements
+     are done it prints one table row per pair: the label
+     [name input_size output_size], the total time and the time per
+     repetition. *)
+  let lwt_trim_bench file f name =
+    let measure input_buffer_size output_buffer_size =
+      let rec run remaining =
+        if remaining = 0 then return ()
+        else begin
+          f ~input_buffer_size ~output_buffer_size file
+          >>= fun () ->
+          run (remaining - 1)
+        end
+      in
+      let start = Time.now () in
+      run repetitions >>= fun () ->
+      let elapsed = Time.(diff (now ()) start) in
+      return (input_buffer_size, output_buffer_size, Core.Span.to_float elapsed)
+    in
+    let report (inbufs, outbufs, time) =
+      Lwt_io.printf "{c|%s} {c|%f (%f)}\n"
+        (name inbufs outbufs) time (time /. float repetitions)
+    in
+    Lwt_list.map_s
+      (fun input_buffer_size ->
+        Lwt_list.map_s (measure input_buffer_size) buffer_sizes)
+      buffer_sizes
+    >>= fun lwt_io_results ->
+    Lwt_list.iter_s report (List.concat lwt_io_results)
+  in
   Lwt_io.printf "{section|Benchmark %S}\n\
                  {b|Started On %s}{p}\n" exp_name Time.(now () |! to_string)
   >>= fun () ->
                    {c h|Experiment}{c h|{#} Reads} {c h|Time (Avg.)}\n"
       file file_lgth64
     >>= fun () ->
-    Lwt_list.map_s (fun buffer_size ->
-      let reads = ref 0 in
-      let rec iteration = function
-        | 0 -> return ()
-        | n ->
-          count_reads ~buffer_size file >>= fun r ->
-          reads := r;
-          iteration (n - 1)
-      in
-      let start = Time.now () in
-      iteration repetitions >>= fun () ->
-      let time = Time.(diff (now ()) start) in
-      return (buffer_size, !reads, Core.Span.to_float time)
-    ) buffer_sizes
-    >>= fun lwt_io_results ->
-    Lwt_list.iter_s (fun (bufs, reads, time) ->
-      Lwt_io.printf "{c|Biocaml_fastq + Lwt_io (buf: %d B)} {c|%d} {c|%f (%f)}\n"
-        bufs reads time (time /. float repetitions))
-      lwt_io_results
+    lwt_count_bench file count_reads (sprintf "Biocaml_fastq + Lwt_io (buf: %d B)")
     >>= fun () ->
-    let bench_non_lwt f name =
-      let reads = ref 0 in
-      let rec iteration = function
-        | 0 -> return ()
-        | n -> reads := f file; iteration (n - 1) in
-      let start = Time.now () in
-      iteration repetitions >>= fun () ->
-      let time = Time.(diff (now ()) start) |! Core.Span.to_float in
-      Lwt_io.printf "{c|%s} {c|%d} {c|%f (%f)}\n"
-        name !reads time (time /. float repetitions)
-    in
-    bench_non_lwt enum_read_counter "{t|File.lines_of |> Enum.hard_count / 4}"
+    lwt_count_bench file lwt_count_lines
+      (sprintf "{t|Lwt_io.read_line / 4} (buf: %d B)")
+    >>= fun () ->
+    non_lwt_count_bench file enum_read_counter
+      "{t|File.lines_of |> Enum.hard_count / 4}"
     >>= fun () ->
-    bench_non_lwt enum_transform_counter_with_lines_of
+    non_lwt_count_bench file enum_transform_counter_with_lines_of
       "{t|File.lines_of |> Biocaml_fastq.enum_parser |> Enum.hard_count}"
     >>= fun () ->
     Lwt_list.iter_s (fun buffer_size ->
-      bench_non_lwt (enum_asking_for_buffer_size ~buffer_size)
+      non_lwt_count_bench file (enum_asking_for_buffer_size ~buffer_size)
+        (sprintf
+           "{t|BatIO.nread %d |> Ad-hoc-line-counter / 4}"
+           buffer_size)) buffer_sizes
+    >>= fun () ->
+    Lwt_list.iter_s (fun buffer_size ->
+      non_lwt_count_bench file (enum_transform_asking_for_buffer_size ~buffer_size)
         (sprintf
            "{t|BatIO.nread %d |> Biocaml_fastq.enum_parser |> Enum.hard_count}"
            buffer_size)) buffer_sizes
     >>= fun () ->
+    Lwt_io.printf "{end}{p}\n"
+  ) files
+  >>= fun () ->
+    
+  Lwt_io.printf "{section 2|Read - Trim - Write}\n" >>= fun () ->
+  Lwt_list.iter_s (fun file ->
+    Lwt_io.file_length file >>= fun file_lgth64 ->
+    Lwt_io.printf "{b|File:} {t|%s} (%Ld B)\n\
+                   {begin table 2}\n\
+                   {c h|Experiment} {c h|Time (Avg.)}\n"
+      file file_lgth64
+    >>= fun () ->
+    lwt_trim_bench file lwt_trimmer
+      (sprintf "Lwt-trimmer (in-buf: %d — out-buf: %d)")
+    >>= fun () ->
+    Lwt_list.iter_s (fun buffer_size ->
+      non_lwt_trim_bench file (enum_trimmer ~input_buffer_size:buffer_size)
+        (sprintf
+           "{t|BatIO.nread %d |> Super-trimmer |> BatIO.nwrite}"
+           buffer_size)) buffer_sizes
+    >>= fun () ->
 
     Lwt_io.printf "{end}{p}\n"
   ) files