Source

JGI_iTagger / lib / iTagger / QIIME.pm

Full commit
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
=pod

=head1 NAME

iTagger::QIIME

=head1 DESCRIPTION

Functions for generating OTU reports, wrapping several QIIME scripts.

=head1 FUNCTIONS

=over 5

=cut

package iTagger::QIIME;

use strict;
use warnings;
use Env qw(TMPDIR);
use Date::Format;
use JSON;
use File::Copy;
use threads;
require Exporter;
use iTagger::Stats qw(trimmedStdDev);
use iTagger::Logger qw(die warn info debug error);

our $VERSION = 1.0;
our @ISA = qw(Exporter);
our @EXPORT = qw(qiimeReport);
our $tmpDir = -e '/scratch' ? '/scratch' : $TMPDIR;
our $logger;

=item qiimeReport

Wraps several QIIME scripts

=cut

sub qiimeReport
{
    my ($logConfigFile, $configFile, $dir) = @_;
    $logger = new iTagger::Logger('iTagger::QIIME', $logConfigFile);
    $logger->die("Missing args") unless $configFile and $dir;
    my $start = time;
    my $config = Config::Tiny->read($configFile);

    # CONVERT OTU TSV TO BIOM JSON
    my $inFile = "$dir/otu/otu.tax.filtered.tsv";
    my $biomFile = "$dir/otu/otu.biom";
    my $depth = otuToBiom($inFile, $biomFile);

    ## BEGIN THREADS
    my @threads;
    my $thr;

    # FILTER OTU TABLES AT ASSORTED LEVELS, CALC ALPHA DIVERSITY LEVELS
	($thr) = new threads(\&abundanceThreshold, $inFile, "$dir/abundance_thresholds");
    push @threads, $thr;

    # RAREFY OTU TABLES
    my $rarefiedFile = "$dir/otu/rarefied.$depth.biom";
    my $rarefiedFilteredFile = "$dir/otu/rarefied.$depth.filtered.biom";
    $thr = new threads(\&singleRarefaction, $biomFile, $rarefiedFile, $rarefiedFilteredFile, $depth);
    push @threads, $thr;
    my $rarefied1kFile = "$dir/otu/rarefied.1000.biom";
    my $rarefied1kFilteredFile = "$dir/otu/rarefied.1000.filtered.biom";
    $thr = new threads(\&singleRarefaction, $biomFile, $rarefied1kFile, $rarefied1kFilteredFile, 1000);
	push @threads, $thr;

    # SUMMARIZE TAXONOMY WITH ABSOLUTE AND RELATIVE ABUNDANCE
    my $relDir = "$dir/tax_mapping/relative";
    $thr = new threads(\&summarizeTaxa, $biomFile, $relDir);
    push @threads, $thr;
    my $absDir = "$dir/tax_mapping/absolute";
    summarizeTaxa($biomFile, $absDir, 1);

    # PLOT THE RANK-ABUNDANCE CURVE OF ALL SAMPLES
    $thr = new threads(\&plotRankAbundanceGraph, $biomFile, "$dir/otu/log_rank_abundance.pdf");
	push @threads, $thr;

    # MAKE TAXA PLOT ABSOLUTE ABUNDANCE
    my @files = map { "$dir/tax_mapping/absolute/otu_L$_.txt" } (2..6);
    my $outDir = "$dir/tax_mapping/plots";
    $thr = new threads(\&plotTaxaSummary, \@files, $outDir);
    push @threads, $thr;

    # JOIN THREADS
    while ( $thr = shift @threads) { $thr->join }

    # GENERATE A PHYLUM BARPLOT FOR A QUICK VIEW OF HOW THE DATA LOOKS
	_run("itaggerPhylumBarplot.pl --infile $dir/tax_mapping/absolute/otu_L2.txt --outFile_graph $dir/tax_mapping/taxonomy_phylum_L2.pdf --outFile_table $dir/tax_mapping/taxonomy_phylum_L2.tab --title Phylum");

    # FINISH
    my $dur = time - $start;
    $logger->info("QIIME done after $dur sec");
}

=item summarizeTaxa

Execute QIIME summarize_taxa.py

=cut

sub summarizeTaxa
{
    my ($inFile, $outDir, $abs) = @_;
    $abs = $abs ? '-a' : '';
    _run("summarize_taxa.py -i $inFile -o $outDir $abs");
}

=item plotRankAbundanceGraph

Execute QIIME plot_rank_abundance_graph.py

=cut

sub plotRankAbundanceGraph
{
    my ($inFile, $outFile) = @_;
	_run("plot_rank_abundance_graph.py -n -s '*' -i $inFile -o $outFile");
    unlink("${outFile}_log.txt");
}

=item plotTaxaSummary

Execute QIIME plot_taxa_summary.py

=cut

sub plotTaxaSummary
{
    my ($inFiles, $outDir) = @_;
    _run("plot_taxa_summary.py -i ".join(',', @$inFiles)." -o $outDir -c bar");
}

=item _run

Run an external executable, die upon nonzero exit status.

=cut

sub _run
{
    my ($cmd) = @_;
    my $output = `$cmd 2>&1`;
    $logger->die("FAILURE RUNNING $cmd : $output") unless $? == 0;
}

=item otuToBiom

Output biom json file.

=cut

sub otuToBiom
{
    my ($inFile, $outFile) = @_;
    $logger->die("Missing args") unless $inFile and $outFile;

    # INIT JSON RECORD
    my $biom = 
    {
        id => undef,
        format => "Biological Observation Matrix 0.9.1-dev",
        format_url => "http://biom-format.org/documentation/format_versions/biom-1.0.html",
        type => "OTU table",
        generated_by => "iTagger revision $VERSION",
        date => time2str('%Y-%m-%dT%X', time),
        matrix_type => "dense",
        matrix_element_type => "int"
    };

    # LIBRARIES
    open ( my $in, '<', $inFile) or $logger->die($!);
    my $hdr = <$in>;
    my @hdr = split(/\t/, $hdr);
    my @libs = @hdr[1..($#hdr-1)];
    for (my $i=0; $i<=$#libs; $i++)
    {
        $libs[$i] = { id => $libs[$i], metadata => {} };
    }
    $biom->{columns} = \@libs;
    my $numLibs = scalar(@libs);

    # OTUS JSON
    my @rows = ();
    my @data = ();
    my @libSizes = ( (0) x $numLibs );
    while (<$in>)
    {
        chomp;
        my @obs = split(/\t/);
        my $id = shift @obs;
        my $tax = pop @obs;
        my @tax = split(/;/, $tax);
        push @rows, { id => $id, metadata => { taxonomy => \@tax } };
        push @data, \@obs;
        for ( my $i=0; $i<=$#obs; $i++ ) { $libSizes[$i] += $obs[$i] }
    }
    close($in);
    my $numOtus = scalar(@rows);
    $biom->{rows} = \@rows;
    $biom->{data} = \@data;
    $biom->{shape} = [ $numOtus, $numLibs ];

    # WRITE
    open ( my $out, '>', $outFile) or $logger->die($!);
    print $out to_json($biom);
    close($out);

    # CALCULATE CUTOFF OF RAREFACTION OF TRIMMED MEAN
    # (DISCARDING SMALLEST 5% AND LARGEST 5% OF VALUES)
    my ($trStDev, $mean) = trimmedStdDev(\@libSizes, 0.05);
    $logger->die("Unable to calculate mean and stdev of trimmed lib sizes list") unless $mean;
    my $cutoff = 0;
    my $i = 2;
    while ( $cutoff <= 0 )
    {
        $cutoff = int($mean - $i * $trStDev + 0.5);
        $i -= 0.5;
    }
	return $cutoff;
}

=item singleRarefaction

Execute QIIME single_rarefaction.py

=cut

sub singleRarefaction
{
    my ($inFile, $outFile, $outFile2x2, $depth) = @_;
	_run("single_rarefaction.py -i $inFile -o $outFile -d $depth");
    filterOtuTable($outFile, $outFile2x2, 2, 2);
}

=item filterOtuTable

Filter OTUs that don't have an abundance of at least <threshold> at least <frequency> times.  For instance, keep OTUs that have at least 10 reads in 2 samples.  The input OTU table is expected to have the OTU ID in the first column and it's taxonomic classification in the last column.

=cut

sub filterOtuTable
{
    my ($inFile, $outFile, $threshold, $frequency) = @_;
    $logger->die("Missing args") unless $inFile and $outFile and $threshold and $frequency;
	open(my $in, '<', $inFile) or $logger->die($!);
	open(my $out, '>', $outFile) or $logger->die($!);
    my $line = <$in>;
    print $out $line;
	while( my $line = <$in> )
    {
        my @row = split(/\t/, $line);
        my $count = 0;
        for (my $i=1; $i<$#row; $i++)
        {
            if ( $row[$i] >= $threshold )
            {
                if ( ++$count >= $frequency )
                {
                    print $out $line;
                    last;
                }
            }

        }
	}
	close($in);
	close($out);
}

=item abundanceThreshold

From one OTU table, generate new OTU tables having OTUs above certain abundance threshold only. Abundance threshold calculated will be 10e-6, 10e-5, 10e-4, 10e-3, 10e-2 and 10e-1.

Then, generate a table having different alpha divesity values for each 
abundance threshold values.

=cut

sub abundanceThreshold
{
    my ($inFile, $outDir) = @_;
    $logger->die("Missing args") unless $inFile and $outDir;

    # LOAD OTUS, CALC LIB SIZES
    my @otus = ();
    my @sizes = ();
    my $total = 0;
    open(my $in, '<', $inFile) or $logger->die($!);
    my $hdr = <$in>;
    $logger->die("Invalid OTU file") unless $hdr =~ /^#/;
    while (<$in>)
    {
        push @otus, $_;
        my @row = split(/\t/);
        my $sum = 0;
        for ( my $i=1; $i<$#row; $i++ ) { $sum += $row[$i] }
        push @sizes, $sum;
        $total += $sum;
    }
    close($in);

    # OUTPUT FILTERED OTU TABLES
    # keep only OTUs having abundance higher than xx%
    -d $outDir or mkdir($outDir) or $logger->die($!);
    my @thresholds = (1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1);
    my @otuFiles = map { "$outDir/otu.$_.tsv" } @thresholds;
    my @biomFiles = map { "$outDir/otu.$_.biom" } @thresholds;
    my @alphaFiles = map { "$outDir/otu.$_.alpha.tsv" } @thresholds;
    my @fhs;
    foreach my $otuFile (@otuFiles)
    {
        open(my $fh, '>', $otuFile) or $logger->die($!);
        print $fh $hdr;
        push @fhs, $fh;
    }
    while (@otus)
    {
        my $otu = shift @otus;
        my $size = shift @sizes;
        my $fraction = $size / $total;
        for ( my $i=0; $i<=$#thresholds; $i++ )
        {
            next unless $fraction > $thresholds[$i];
            my $fh = $fhs[$i];
            print $fh $otu;
        }
    }
    foreach my $fh (@fhs) { close($fh) }
    for (my $i=0; $i<=$#thresholds; $i++)
    {
        ## Check if OTU table is empty.
        open(my $in, '<', $otuFiles[$i]) or $logger->die($!);
        my $ok = -1;
        while (<$in>) { last if ++$ok }
        close($in);
        if ( $ok )
        {
            otuToBiom($otuFiles[$i], $biomFiles[$i]);
        } else
        {
            unlink($otuFiles[$i]);
            $otuFiles[$i] = $biomFiles[$i] = $alphaFiles[$i] = undef;
        }
    }

    # CALC ALPHA DIVERSITY
    my @threads;
    for ( my $i=0; $i<=$#thresholds; $i++ )
    {
        next unless defined($biomFiles[$i]);
        my ($thr) = new threads(\&alphaDiversity, $biomFiles[$i], $alphaFiles[$i]);
        push @threads, $thr;
    }
    foreach my $thr (@threads) { $thr->join }
}

=item alphaDiversity

Execute QIIME alpha_diversity.py and reformat header line.

=cut

sub alphaDiversity
{
    my ($inFile, $outFile) = @_;
    _run("alpha_diversity.py -i $inFile -o $outFile -m chao1,observed_species,shannon,simpson");
    # REFORMAT HEADER LINE
    open(my $in, '<', $outFile) or $logger->die("Unable to open $outFile: $!");
    open(my $out, '>', "$outFile.tmp") or $logger->die($!);
    print $out "#Sample\tchao1\tobserved_species\tshannon\tsimpson\n";
    my $head = <$in>;
    while (<$in>) { print $out $_ }
    close($in);
    close($out);
    move("$outFile.tmp", $outFile);
}

=item _set

Return value from config or default value.  If default not defined, it must be defined in the config file, otherwise die.

=cut

sub _set
{
    my ($config, $part, $key, $default) = @_;
    $logger->die("Missing args") unless $config and $part and $key;
    if ( !exists($config->{$part}) )
    {
        $config->{$part} = {};
        return $config->{$part}->{$key} = $default if defined($default);
        $logger->die("Config file missing section, $part");
    } elsif ( exists($config->{$part}->{$key}) )
    {
        return $config->{$part}->{$key}
    } elsif ( defined($default) )
    {
        return $config->{$part}->{$key} = $default;
    } else
    {
        $logger->die("Config missing required $part/$key");
    }
}

# NYI

=item calcMinDepths

If a fraction is provided, it will return the minimum OTU depth which will contain that fraction of reads.  Otherwise, it will return a list of minimum depths which capture 10%, 20%, ..., 90% of reads.

=cut

sub calcMinDepths
{
    my ($inFile, $pctAR) = @_;
    $logger->die("Missing args") unless $inFile and @$pctAR;
    # CALC TOTAL SIZE
    open( my $in, '<', $inFile ) or $logger->die("Can't open file, $inFile: $!");
    my $hdr = <$in>;
    my @sizes = ();
    my $totSize = 0;
    while (<$in>)
    {
        my @row = split(/\t/);
        my $size = 0;
        for ( my $i = 1; $i < $#row; $i++ ) { $size += $row[$i] // 0 }
        push @sizes, $size;
        $totSize += $size;
    }
    close($in);
    $logger->die("Empty infile") unless $totSize;
    # CALC MIN DEPTHS
    my $sum = 0;
    my $minDepth;
    my @minDepths;
    my @x = map { $_/100 } @$pctAR;
    foreach my $x (@x)
    {
        my $target = $totSize * $x;
        while ($sum < $target) { $sum += $minDepth = shift @sizes }
        push @minDepths, $minDepth;
    }
    return @minDepths;
}

1;

=back

=head1 AUTHORS

Julien Tremblay, Edward Kirton

=head1 COPYRIGHT

Copyright (c) 2013 US DOE Joint Genome Institute.  Use freely under the same license as QIIME itself.

=cut

__END__