fastq_bench / fastqbench.ml

(*

ocamlfind ocamlopt -thread -package lwt.unix,biocaml,core -linkpkg fastqbench.ml -o fastbench

./fastbench 01.fastq | brtx -doc -o bench.html 

*)

open Core.Std
open Lwt

let failf fmt =
  ksprintf (fun s -> Lwt.fail (Failure s)) fmt
  
let count_next_reads parser =
  let strpos = Biocaml_pos.to_string in
  let rec next_m c =
    match Biocaml_fastq.next parser with
    | `not_ready -> return c
    | `record {Biocaml_fastq. name; sequence; comment; qualities; } ->
      next_m (c + 1)
    | `error (`sequence_and_qualities_do_not_match (l, seq, qs)) ->
      failf "Error  %s: %d bp Vs %d q-scores\n" (strpos l)
        (String.length seq) (String.length qs)
    | `error (`wrong_comment_line (l, _)) ->
      failf "Syntax error %s: (comment line)\n" (strpos l)
    | `error (`wrong_name_line (l, _)) ->
      failf "Syntax error %s: (name line)\n" (strpos l)
  in
  next_m 0

let count_reads ~buffer_size file =
  let parser = Biocaml_transform.Line_oriented.parser ~filename:file () in
  let reads = ref 0 in
  Lwt_io.with_file ~buffer_size ~mode:Lwt_io.input file (fun i ->
    let rec loop () =
      Lwt_io.read ~count:buffer_size i
      >>= fun read_string ->
      if read_string = "" then
        return !reads
      else (
        Biocaml_transform.Line_oriented.feed_string parser read_string;
        count_next_reads parser >>= fun c ->
        reads := !reads + c;
        loop ())
    in
    loop ())

let lwt_count_lines ~buffer_size file =
  let count = ref 0 in
  let read_line i =
    Lwt.catch (fun () -> Lwt_io.read_line  i >>= fun l -> return (Some l))
      (fun _ -> return None) in
  Lwt_io.with_file ~buffer_size ~mode:Lwt_io.input file (fun i ->
    let rec loop () =
      read_line i
      >>= function
      | Some s -> incr count; loop ()
      | None -> return () in
    loop ())
  >>= fun () ->
  return (!count / 4)
    
let enum_read_counter file =
  let e = BatFile.lines_of file in
  let count = BatEnum.hard_count e / 4 in
  count

let enum_transform_counter_with_lines_of  file =
  let e = BatFile.lines_of file in
  let e4buf = BatEnum.map  (sprintf "%s\n") e in
  let tr = Biocaml_fastq.enum_parser ~filename:file  in
  BatEnum.hard_count (tr e4buf)
  
let enum_asking_for_buffer_size ~buffer_size file =
  let i = BatFile.open_in file in
  let e = BatEnum.from (fun () ->
    try
      BatIO.nread i buffer_size 
    with
      BatIO.No_more_input -> raise BatEnum.No_more_elements) in
  let count = ref 0 in
  let ec =
    BatEnum.map (fun s ->
      let lines_there =
        BatString.fold_left (fun a b ->
          a + (if b = '\n' then 1 else 0)) 0 s in
      count := lines_there + !count;
      s
    ) e in
  let _ = BatEnum.hard_count ec in
  !count / 4

let enum_transform_asking_for_buffer_size ~buffer_size file =
  let i = BatFile.open_in file in
  let e = BatEnum.from (fun () ->
    try
      BatIO.nread i buffer_size 
    with
      BatIO.No_more_input -> raise BatEnum.No_more_elements) in
  let tr = Biocaml_fastq.enum_parser ~filename:file  in
  BatEnum.hard_count (tr e)
    
let fastq_file_trimmer filename =
  let counter_transform =
object
  val mutable id =  0
  method feed () = ()
  method next = id <- id + 1; `output id
end in
  Biocaml_transform.(
    (compose
       (mix
          (compose
             (compose
                (new Biocaml_fastq.fastq_parser ~filename ())
                (new Biocaml_fastq.trimmer (`beginning 10)))
             (new Biocaml_fastq.trimmer (`ending 2)))
          counter_transform
          ~f:(fun r c ->
            { r with Biocaml_fastq.name =
                Printf.sprintf "%s -- %d (rand: %f)"
                  r.Biocaml_fastq.name c Random.(float 42.) }))
       (new Biocaml_fastq.fastq_printer)))
(*  string  ---  fastq-record --- trimmed-fast \
                                               f --- named-fastq --- string 
    unit  ---  count --------------------------/                              *)    

exception Trim_error of 
    [ `left of
        [ `left of
            [ `left of
               [ `left of Biocaml_fastq.parser_error
               | `right of [ `invalid_size of int ] ]
            | `right of [ `invalid_size of int ] ]
        | `right of Biocaml_fastq.empty ]
    | `right of Biocaml_fastq.empty ]

let enum_trimmer ~input_buffer_size filename =
  let outfile =
    sprintf "trimenum_%d_r%d_%s" input_buffer_size Random.(int 4242) filename in
  let i = BatFile.open_in filename in
  let e = BatEnum.from (fun () ->
    try
      (BatIO.nread i input_buffer_size, ()) 
    with
      BatIO.No_more_input -> raise BatEnum.No_more_elements) in
  let tr =
    Biocaml_transform.enum_transformation
      ~error_to_exn:(fun e -> raise (Trim_error e))
      (fastq_file_trimmer filename) e in
  BatFile.with_file_out outfile (fun o ->
    BatEnum.iter (fun s ->
      BatIO.nwrite o s;
    ) (tr);
  );
  ()

let lwt_trimmer ~input_buffer_size ~output_buffer_size filename =
  let outfile =
    sprintf "trimlwt_%d-%d_r%d_%s" input_buffer_size output_buffer_size
      Random.(int 4242) filename in
  let transform = fastq_file_trimmer filename in
  Lwt_io.with_file filename
    ~buffer_size:input_buffer_size
    ~mode:Lwt_io.input  (fun i ->
      Lwt_io.with_file outfile
        ~buffer_size:output_buffer_size
        ~mode:Lwt_io.output  (fun o ->
          let rec loop () =
            Lwt_io.read ~count:input_buffer_size i
            >>= fun read_string ->
            if read_string = "" then
              return ()
            else (
              transform#feed (read_string, ());
              let rec subloop () =
                match transform#next with
                | `output s ->
                  Lwt_io.write o s >>= fun () ->
                  subloop ()
                | `not_ready ->
                  return ()
                | `error e -> Lwt.fail (Trim_error e)
              in
              subloop ()
              >>= fun () ->
              loop ())
          in
          loop ()))
  
    
let do_bench exp_name repetitions buffer_sizes files =
  let lwt_count_bench file f name =
    Lwt_list.map_s (fun buffer_size ->
      let reads = ref 0 in
      let rec iteration = function
        | 0 -> return ()
        | n ->
          f ~buffer_size file >>= fun r ->
          reads := r;
          iteration (n - 1)
      in
      let start = Time.now () in
      iteration repetitions >>= fun () ->
      let time = Time.(diff (now ()) start) in
      return (buffer_size, !reads, Core.Span.to_float time)
    ) buffer_sizes
    >>= fun lwt_io_results ->
    Lwt_list.iter_s (fun (bufs, reads, time) ->
      Lwt_io.printf "{c|%s} {c|%d} {c|%f (%f)}\n%!"
        (name bufs) reads time (time /. float repetitions))
      lwt_io_results
  in
  let non_lwt_count_bench file f name =
    let reads = ref 0 in
    let rec iteration = function
      | 0 -> return ()
      | n -> reads := f file; iteration (n - 1) in
    let start = Time.now () in
    iteration repetitions >>= fun () ->
    let time = Time.(diff (now ()) start) |! Core.Span.to_float in
    Lwt_io.printf "{c|%s} {c|%d} {c|%f (%f)}\n%!"
      name !reads time (time /. float repetitions)
  in
  let non_lwt_trim_bench file f name =
    let rec iteration = function
      | 0 -> return ()
      | n -> f file; iteration (n - 1) in
    let start = Time.now () in
    iteration repetitions >>= fun () ->
    let time = Time.(diff (now ()) start) |! Core.Span.to_float in
    Lwt_io.printf "{c|%s} {c|%f (%f)}\n%!"
      name time (time /. float repetitions)
  in
  let lwt_trim_bench file f name =
    Lwt_list.map_s (fun input_buffer_size ->
      Lwt_list.map_s (fun output_buffer_size ->
        let rec iteration = function
          | 0 -> return ()
          | n ->
            f ~input_buffer_size ~output_buffer_size file
            >>= fun () ->
            iteration (n - 1)
        in
        let start = Time.now () in
        iteration repetitions >>= fun () ->
        let time = Time.(diff (now ()) start) in
        return (input_buffer_size, output_buffer_size, Core.Span.to_float time)
      ) buffer_sizes
    ) buffer_sizes
    >>= fun lwt_io_results ->
    Lwt_list.iter_s (fun (inbufs, outbufs, time) ->
      Lwt_io.printf "{c|%s} {c|%f (%f)}\n%!"
        (name inbufs outbufs) time (time /. float repetitions))
      (List.concat lwt_io_results)
  in
  Lwt_io.printf "{section|Benchmark %S}\n\
                 {b|Started On %s}{p}\n" exp_name Time.(now () |! to_string)
  >>= fun () ->
  Lwt_io.printf "{section 2|Info} {list|\n\
                 {*} Repetitions: %d\n\
                 {*} Buffer-sizes: %s\n\
                 {*} Files: {br} %s\n\
                 }{p}{t|$ df hT .}{code}\n"
    repetitions (String.concat ~sep:", " (List.map buffer_sizes (sprintf "%d")))
    (String.concat ~sep:"{br} " (List.map files (sprintf "{t|%s}")))
  >>= fun () ->
  Lwt_unix.system "df -hT ." >>= fun _ ->
  Lwt_io.printf "{end}{p}\n{t|$ uname -a}{code}\n" >>= fun () ->
  Lwt_unix.system "uname -a" >>= fun _ ->
  Lwt_io.printf "{end}{p}{t|Lwt_sys.have `libev = %b}\n" (Lwt_sys.have `libev)
  >>= fun () ->
  Lwt_io.printf "{section 2|Count Reads}\n" >>= fun () ->
  Lwt_list.iter_s (fun file ->
    Lwt_io.file_length file >>= fun file_lgth64 ->
    (* one first "caching" run, to be more fair: *)
    count_reads ~buffer_size:1_000_00 file
    >>= fun _ ->
    Lwt_io.printf "{b|File:} {t|%s} (%Ld B)\n\
                   {begin table 3}\n\
                   {c h|Experiment}{c h|{#} Reads} {c h|Time (Avg.)}\n"
      file file_lgth64
    >>= fun () ->
    lwt_count_bench file count_reads (sprintf "Biocaml_fastq + Lwt_io (buf: %d B)")
    >>= fun () ->
    lwt_count_bench file lwt_count_lines
      (sprintf "{t|Lwt_io.read_line / 4} (buf: %d B)")
    >>= fun () ->
    non_lwt_count_bench file enum_read_counter
      "{t|File.lines_of |> Enum.hard_count / 4}"
    >>= fun () ->
    non_lwt_count_bench file enum_transform_counter_with_lines_of
      "{t|File.lines_of |> Biocaml_fastq.enum_parser |> Enum.hard_count}"
    >>= fun () ->
    Lwt_list.iter_s (fun buffer_size ->
      non_lwt_count_bench file (enum_asking_for_buffer_size ~buffer_size)
        (sprintf
           "{t|BatIO.nread %d |> Ad-hoc-line-counter / 4}"
           buffer_size)) buffer_sizes
    >>= fun () ->
    Lwt_list.iter_s (fun buffer_size ->
      non_lwt_count_bench file (enum_transform_asking_for_buffer_size ~buffer_size)
        (sprintf
           "{t|BatIO.nread %d |> Biocaml_fastq.enum_parser |> Enum.hard_count}"
           buffer_size)) buffer_sizes
    >>= fun () ->
    Lwt_io.printf "{end}{p}\n"
  ) files
  >>= fun () ->
    
  Lwt_io.printf "{section 2|Read - Trim - Write}\n" >>= fun () ->
  Lwt_list.iter_s (fun file ->
    Lwt_io.file_length file >>= fun file_lgth64 ->
    Lwt_io.printf "{b|File:} {t|%s} (%Ld B)\n\
                   {begin table 2}\n\
                   {c h|Experiment} {c h|Time (Avg.)}\n"
      file file_lgth64
    >>= fun () ->
    lwt_trim_bench file lwt_trimmer
      (sprintf "Lwt-trimmer (in-buf: %d — out-buf: %d)")
    >>= fun () ->
    Lwt_list.iter_s (fun buffer_size ->
      non_lwt_trim_bench file (enum_trimmer ~input_buffer_size:buffer_size)
        (sprintf
           "{t|BatIO.nread %d |> Super-trimmer |> BatIO.nwrite}"
           buffer_size)) buffer_sizes
    >>= fun () ->

    Lwt_io.printf "{end}{p}\n"
  ) files
  >>= fun () ->
  Lwt_io.printf "{p}{b|Ended On %s}\n" Time.(now () |! to_string)
let () =
  let repetitions = ref 1 in
  let buffer_sizes = ref [1024] in
  let filenames = ref [] in
  let exp_name = ref "" in
  let options = [
    ("-name", Arg.Set_string exp_name, "<name>\n\tGive a name to the benchmark");
    ("-buffer-sizes", Arg.String (fun s ->
      buffer_sizes :=
        List.map (String.split ~on:',' s) Int.of_string),
     "<sizes>\n\tComa-separated list of buffer sizes");
    ("-repetitions", Arg.Set_int repetitions,
     "<nb>\n\tNumber of repetitions (per buf-size and per file)");
  ] in
  let anon s = filenames := s :: !filenames in
  Arg.parse options anon "Usage: see -help";
  if !filenames = [] then
    eprintf "Nothing to do\n"
  else
    Lwt_main.run (do_bench !exp_name !repetitions !buffer_sizes !filenames)
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.