JGI_iTagger / lib / iTagger / FastaDb /

Full commit

=head1 NAME

Fasta - Simple object for Fasta sequence


    my $seq=new Fasta( $hdr, $seq );
    print $seq->output;


Object for a single read sequence, with methods for basic manipulation and quality control.

=head1 METHODS

=over 5


package iTagger::FastaDb::Fasta;

use warnings;
use strict;
use constant {
    CHARACTERS_PER_LINE => 80, # for formatting Fasta output
    CLOSE_ENOUGH_TO_END => 6,   # hits of adapters this close to end will result in trimming to be done to end

our $VERSION = 1.0;

=item new $hdr $seq

Initialize new sequence object.


sub new {
    my ($class,$hdr,$seq)=@_;
    chomp $hdr;
    chomp $seq;
    die("Incomplete Fasta record: hdr=$hdr, seq=$seq\n") unless $hdr and $seq;
    # INIT
    my $this={
    bless $this,$class;
    return $this;

=item _parse_header

Extract info from header, which could be formatted in several ways.


sub _parse_header {
    my ($this, $hdr) = @_;
    $this->{id}=undef; # complete ID (e.g. "A/1#GATTACA"); always defined
    $this->{base}=undef; # base ID only (e.g. "A"); always defined
    $this->{pair}=undef; # pair ID only (e.g. "1"); only defined if paired
    $this->{barcode}=undef; # barcode sequence (upper-case); only defined it barcoded

	if ($hdr =~ /^>(\w+_\d+_\d+_\w+_\w+_\w+\/\d+)\/\w+$/)
        # PacBio as of feb 2014
		$this->{base}    = $1;
    } elsif ($hdr =~ /^>(\S+)#([aAtTcCgGnN]+)\/([12])/) { # Illumina barcoded, paired
    } elsif ($hdr =~ /^>(\S+)\/([12])#([aAtTcCgGnN]+)/) { # Illumina barcoded, paired
    } elsif ($hdr =~ /^>(\S+)\/([12])/) { # Illumina paired
    } elsif ($hdr =~ /^>(\S+)#([aAtTcCgGnN]+)/) { # Illumina barcoded, unpaired
    } elsif ( $hdr =~ /^>(\S+)/ )
        $this->{base}=$1; # general

    $this->{id} = $this->{base};
    $this->{id} .= "/".$this->{pair} if $this->{pair};
    $this->{id} .= "#".$this->{barcode} if $this->{barcode};

=item header ($new_hdr)

Returns the object's header line.  You can also use this method to give the sequence a new header.


sub header {
    my ($this,$new_hdr)=@_;
    if (defined($new_hdr) and $new_hdr) {
        $new_hdr = ">$new_hdr" unless $new_hdr =~ /^>/;
    return '>'.$this->{id};

=item id

Returns the object's ID, which is the sequence's unique identifier without comments which may be present in the header.
It cannot be changed directly, but will be updated whenever the header, base, or pair is changed.


sub id { return shift->{id} }

=item base

If the read is paired, returns it's base ID (same for both members of pair); returns undefined if not paired.
Providing the optional argument will change the base (and the id and header).


sub base {
    my ($this,$base)=@_;
    if ( defined($base) ) {
        $this->{id} .= "/".$this->{pair} if $this->{pair};
        $this->{id} .= "#".$this->{barcode} if $this->{barcode}
    return $this->{base};

=item barcode

If the read contains an Illumina molecular barcode ID, it will be returned; otherwise returns undef.
Supplying an optional argument will set the barcode.


sub barcode {
    my ($this,$barcode)=@_;
    if ($barcode) {
        $this->{id} .= "/".$this->{pair} if $this->{pair};
        $this->{id} .= "#".$barcode;
    return $this->{barcode};

=item pair

Returns the read's ord in the pair; undef otherwise.  It may be changed by supplying the optional extra argument.
To clear the pairing information, pass it an empty string, not NULL.


sub pair {
    my ($this,$pair)=@_;
    if (defined($pair)) {
        if ($pair) {
            $this->{id} .= "/".$pair;
            $this->{id} .= "#".$this->{barcode} if $this->{barcode}
        } else {
            # delete pairing information (e.g. singleton)
            $this->{id} .= "#".$this->{barcode} if $this->{barcode}
    return $this->{pair};

=item seq ($new_seq)

Returns the read's complete sequence, without newlines.  Optional argument changes it.


sub seq {
    my ($this,$new_seq)=@_;
    if ($new_seq) {
        chomp $new_seq;
    return $this->{seq};

=item revcomp

Reverse-complements a sequence.


sub revcomp {
    my $this=shift;
    return unless $this->{seq};
    $this->{seq} =~ tr/ATCGatcg/TAGCtagc/;
    my @seq=reverse split(//, $this->{seq});
    $this->{seq}=join('', @seq);

=item output

Returns a multiline string of the sequence in Fasta format.  Returns no output if sequence is empty.


sub output {
    my $this=shift;
    return '' unless $this->{seq}; # will be empty if failed QC
    return $this->header."\n"._format($this->{seq});

sub _format {
    my $old=shift;
    return '' unless $old;
    return $old unless CHARACTERS_PER_LINE;
    my $new='';
    while (length($old)> CHARACTERS_PER_LINE) {
        $new .= substr($old,0, CHARACTERS_PER_LINE)."\n";
        $old = substr($old, CHARACTERS_PER_LINE);
    $new .= $old."\n";
    return $new;

=item len

Returns length of the seq


sub len {
    my $this = shift;
    return length($this->{seq});

=item qc $winsize $meanq $minlen $maxn

To perform minimum length filtering, and filtering reads with too many Ns.


sub qc {
    my ($this, $minlen, $maxn)=@_;

=item trim_terminal_Ns

Discard uninformative Ns from the ends of the sequence.


sub trim_terminal_Ns {
    my $this=shift;
    if ($this->seq =~ /^(N+)(.*)$/) {
    if ($this->seq =~ /^(.*)N+$/) {

=item length_filter

If the sequence is shorter than the minimum length, the sequence string is emptied so they will not be
returned by the output method.  Returns true if sequence was filtered.


sub length_filter {
    my ($this,$minlen)=@_;
    my $len=length($this->{seq});
    if ( !defined($minlen) or $len >= $minlen) { 
        return 0;
    } else {
        return 1;

=item N_filter

If the sequence contains more than the allowed number of Ns, the sequence string is emptied.
Returns true if sequence was filtered.


sub N_filter {
    my ($this,$maxn)=@_;
    return 1 unless defined($maxn);
    my $tmpseq=$this->{seq};
    my $n= $tmpseq=~ s/N//gi;
    if ($n <= $maxn) {
        return 0;
    } else {
        # FAIL
        return 1;

=item low_complexity_filter $pct_len

If the sequence is >= $pct_len mono- or di-nucleotide repeats, clears the sequence string and returns true.


sub low_complexity_filter {
    my ($this,$pct_len)=@_;
    my $seq=$this->{seq};    
    $seq =~ s/\n//g;
    my $len = length($seq);
    my $filter=0;
    foreach my $nn (qw/AA TT CC GG CA GT CT GA AT CG/) {
        my $n = $seq =~ s/$nn/$nn/g;
        if ($n >= $pct_len/200*$len) {
            $filter = 1;
    if ($filter) {
    return $filter;

#=item trim
#Given start and end coordinates of adapter/primer sequence, trims sequence string and returns final length.
#sub trim {
#    my ($this, $start, $end)=@_;
#    return unless defined($start) and defined($end);
#    ($start,$end)=($end,$start) if $end<$start; 
#    my $len=length($this->{seq});
#    #print STDERR "- trim read, ", $this->id, " ($len bp) from $start-$end\n"; # DEBUG
#    if ($start <= CLOSE_ENOUGH_TO_END and $len - $end <= CLOSE_ENOUGH_TO_END ) {
#        # fail read
#        $this->{seq}='';
#    } elsif ($start <= CLOSE_ENOUGH_TO_END ) {
#        # trim left
#        $this->{seq}=substr($this->{seq},$end);
#    } elsif ($len - $end <= CLOSE_ENOUGH_TO_END ) {
#        # trim right
#        $this->{seq}=substr($this->{seq},0,$start);
#    }
#    return length($this->{seq});
#    #print STDERR "\t+ length after trimming is ", length($this->{seq})," bp\n"; # DEBUG



No support for paired reads.


Copyright (c) 2010 U.S. Department of Energy Joint Genome Institute

All right reserved. This program is free software; you can redistribute it
and/or modify it under the same terms as Perl itself.

=head1 AUTHOR

Edward Kirton <>