Commits

Edward Kirton  committed 0336b67

add EXP_ERR_PER_KB option, improved length filter, also fixed seqobs outfile name for unpaired run.

  • Participants
  • Parent commits 9fee990

Comments (0)

Files changed (4)

File lib/iTagger/FastqDb.pm

 		low_q          => undef,    # Q-score considered low for Q-count filter
 		max_num_low_q  => undef,    # Q-count filters maximum number of low-qual bases
         trim3_exp_err  => undef,    # trim 3' once 5' bp exceed max err
-        exp_err        => undef,    # filter read if expected error exceed threshold
+        exp_err        => undef,    # filter read if expected error exceed threshold (absolute)
+        exp_err_per_kb => undef,    # filter read if expected error exceed threshold (relative)
         trim_to        => undef,    # trim read to exactly this many bp
 		learn_barcodes     => 0,        # learn barcodes from sample if true
 		barcodes_file      => undef,    # path to barcode sequences in Fasta or Tsv format

File lib/iTagger/FastqDb/Fastq.pm

     return 0 unless $this->low_complexity($this->{qc}->{low_complexity});
     return 0 unless $this->low_q($this->{qc}->{low_q});
     return 0 unless $this->exp_err($this->{qc}->{exp_err});
+    return 0 unless $this->exp_err_per_kb($this->{qc}->{exp_err_per_kb});
     return 1;
 }
 
     return 1;
 }
 
+=item exp_err_per_kb
+
+Fail read if expected number of errors exceeds threshold.
+
+=cut
+
+sub exp_err_per_kb
+{
+    my ($this, $max_exp_err_per_kb) = @_;
+    return 1 unless $max_exp_err_per_kb;
+    my $max_exp_err = int($this->len * $max_exp_err_per_kb/1000 + 0.5);
+    my $exp_err = 0;
+    my @qs = split(//, $this->{qual});
+    foreach my $s (@qs)
+    {
+        my $q = ord($s) - 33;
+        my $ee = 10 ** (($q * -1)/10);
+        $exp_err += $ee;
+        return $this->del('expected_error_per_kb') if $exp_err > $max_exp_err;
+    }
+    return 1;
+}
+
 =item trim_to
 
 Trim read to exactly this many bp by discarding 3' bases.

File lib/iTagger/FastqDb/Tiny.pm

 		low_q          => undef,    # Q-score considered low for Q-count filter
 		max_num_low_q  => undef,    # Q-count filters maximum number of low-qual bases
         trim3_exp_err  => undef,    # trim 3' once 5' bp exceed max err
-        exp_err        => undef,    # filter read if expected error exceed threshold
+        exp_err        => undef,    # filter read if expected error exceed threshold (absolute)
+        exp_err_per_kb => undef,    # filter read if expected error exceed threshold (relative)
         trim_to        => undef     # trim read to exactly this many bp
 	);
 

File lib/iTagger/ReadQC.pm

 use iTagger::FastqDb::Tiny;
 use iTagger::Stats;
 
-our $VERSION = 1.1;
+our $VERSION = 1.11;
 our @ISA = qw(Exporter);
 our @EXPORT = qw(new libQC);
 
     return unless $numInput;
 
     # FILTER CONTAMINANTS.  THERE MAY BE ZERO OR MORE CONTAM DB FILES.
-    my $noContamFile = "$dir/noContam.fastq";
+    my $noContamFile;
     my $numNoContam;
     if ( exists($config->{DUK}->{CONTAM_DB}) )
     {
+        $noContamFile = "$dir/noContam.fastq";
         my $numNoContamAR;
         my $contamDbNamesAR;
         ($numNoContamAR, $contamDbNamesAR) = $this->duk($inFile, $noContamFile, "$dir/duk.log");
         }
     } else
     {
+        $noContamFile = "$tmpdir/noContam.fastq";
         $numNoContam = $numInput;
         symlink($inFile, $noContamFile) or $this->logdie($!);
     }
         return unless $numPrimerTrim;
     } else
     {
+        $numPrimerTrim = $numNoContam;
         symlink($noContamFile, $primerTrimPassFile) or $this->logdie("Symlink failure");
     }
 
         return unless $numHiQual;
 
         # DEREPLICATE
-        my $numUniq = $this->seqObs($hiQualFile, "$dir/qualStat.tsv", "$dir/qualStat.pdf", 'High quality sequences', "$dir/qc.seqobs.tsv.bz2");
+        my $numUniq = $this->seqObs($hiQualFile, "$dir/qualStat.tsv", "$dir/qualStat.pdf", 'High quality sequences', "$dir/seqobs.tsv.bz2");
         $this->log("High quality, unique", $numUniq, $numHiQual);
 
         # FINISH
     $this->logdie("lengthFilter: infile, outfile required") unless $inFile and $outFile;
     my $trim5 = $this->_set('AMPLICON', 'TRIM5', 0);
     my $trim3 = $this->_set('AMPLICON', 'TRIM3', 0);
+    my $minFragLen = $this->_set('AMPLICON', 'LEN_MIN', 0); # undocumented option for testing only
+    my $maxFragLen = $this->_set('AMPLICON', 'LEN_MAX', 0); # undocumented option for testing only
+    my $meanFragLen = $this->_set('AMPLICON', 'LEN_MEAN');
+    my $stdevFragLen = $this->_set('AMPLICON', 'LEN_STDEV');
+    my $minLen = MIN_FINAL_LEN;
+    if ( $minFragLen )
+    {
+        $minLen = $minFragLen;
+    } else
+    {
+        $minLen = $meanFragLen - 3 * $stdevFragLen;
+    }
+    $minLen = MIN_FINAL_LEN if $minLen < MIN_FINAL_LEN;
+    my $maxLen = $maxFragLen ? $maxFragLen : ( $meanFragLen + 3 * $stdevFragLen );
 
     my $numPass = my $numFail = 0;
     my $db = new iTagger::FastqDb::Tiny($inFile);
     {
         $seq->trim5_bp($trim5) if $trim5;
         $seq->trim3_bp($trim3) if $trim3;
-        if ( $seq->filtered or $seq->len < MIN_FINAL_LEN )
+        if ( $seq->filtered or $seq->len < $minLen or $seq->len > $maxLen )
         {
             ++$numFail;
         } else
 sub expErrFilter
 {
     my ($this, $inFile, $outFile) = @_;
-    my $ee = $this->_set('AMPQC', 'EXP_ERR', 1.0);
+
+    # DEFINE STRINGENCY
+    my %params = ();
+    my $ee = $this->_set('AMPQC', 'EXP_ERR', 0);
+    my $ee_per_kb = $this->_set('AMPQC', 'EXP_ERR_PER_KB', 20);
+    if ( $ee ) { $params{exp_err} = $ee }
+    else { $params{exp_err_per_kb} = $ee_per_kb }
+
+    # FILTER
     open(my $out, '>', $outFile) or $this->logdie($!);
-    my $in = new iTagger::FastqDb::Tiny($inFile, { exp_err => $ee });
+    my $in = new iTagger::FastqDb::Tiny($inFile, \%params);
     while ( my $rec = $in->next_seq ) { print $out $rec->output }
     $in->close_file;
     close($out);