Wiki

Clone wiki

metabit / Documentation-makefile

Documentation of makefile parsed by metaBIT

#!yaml

# -*- mode: Yaml; -*-

# Please respect indentation (with spaces), and pay attention to colons and hyphens.
# Hash-commented lines are ignored.

# ----------------------------------------------- #
#                   Samples                       #
# ----------------------------------------------- #

# The expected format for sequence files, trimmed for adapters,
# is fastq (gz and bz2 compressions are allowed).

Samples:
  SamplePE:  # User-defined name for the first sample. Sequence reads for this sample are paired-end reads.

    # Library paths.
    # Here, sequencing reads were trimmed and collapsed using AdapterRemoval
    # (Lindgreen S. BMC Res Notes 2012, 5:337)

    # There are two ways to specify read paths:

    # 1st way - Provide an explicit list of files using the AdapterRemoval keys
    # "Collapsed" and "Paired" for paired-end reads, "Singles" for single-end reads:

    Lib1:  # User-defined name for the first library.
      Collapsed:  # Here, we use collapsed overlapping read pairs.
        # Here, we give a list (must include at least one item):
        - folder/Lib1/date1PE/lane*/reads.collapsed.gz  # Wildcards are allowed
        - folder/Lib1/date2PE/lane*/reads.collapsed.gz
      # In cases where only one item is considered, it can be provided as a single
      # string after the colon. A key may appear only once within a library, so this
      # alternative is shown commented out; use it INSTEAD of the list form above:
      # Collapsed: folder/Lib1/date1PE/lane*/reads.collapsed.gz

      # Lists or single strings can be used regardless of the type of data considered
      # (i.e. Collapsed, Singles, Paired); in the following, we use strings.

      # Non-overlapping paired-end reads can also be analysed.
      # The string {Pair} equates to 1 and 2.
      Paired: folder/Lib1/date1/lane*/reads.pair{Pair}.gz

      # Singletons (reads whose mate has been removed during trimming) can also be analysed.
      Singles: folder/Lib1/date1/lane*/reads.singleton.gz
      # For single-end data, also use the 'Singles' key.

    # 2nd way - Provide a string or a list of folders containing read files named
    # according to the format described above, corresponding to default outputs of
    # AdapterRemoval:
    #     reads.collapsed.gz
    #     reads.pair{Pair}.gz
    Lib2: folder/Lib1/date1

  SampleSE:  # User-defined name for the second sample. Sequence reads for this sample are single-end reads.
    Lib1SE:
      Singles: folder/Lib1SE/date1SE/lane*/reads.singleton.gz  # For single-end data, use the 'Singles' key



# In addition to the Sample: and Library: levels, groups can be nested within each
# other, creating as many levels as required by the user as long as each group name
# starts with 'G_'. These groups will be used in the LEfSe analysis (max one level
# of subclasses).

# Example. It is shown commented out because a makefile may contain only one
# top-level 'Samples:' key: use this grouped layout INSTEAD of an ungrouped
# 'Samples:' section, not in addition to it. Replace each '...' placeholder with
# the libraries of the corresponding sample.

# Samples:
#   G_MyGroup1:
#     G_MySubGroup1-1:
#       Sample1:
#         Lib1: path/to/sample1/lib1
#         Lib2: path/to/sample1/lib2
#     G_MySubGroup1-2:
#       Sample2:
#         Lib1: path/to/sample2/lib1
#         Lib2: path/to/sample2/lib2
#   G_MyGroup2:
#     G_MySubGroup2-1:
#       Sample3:
#         ...
#     G_MySubGroup2-2:
#       Sample4:
#         ...

# ----------------------------------------------- #
#               Taxonomic profiling               #
# ----------------------------------------------- #

# Types of files to keep from paired-end (PE) libraries
keepfromPE:
  # This information is overridden if files are given in the 'Samples' section using the
  # fields Collapsed, Paired or Singles.
  # Default: Collapsed: yes   Paired: yes   Singles: yes
  Collapsed: yes
  Paired: yes
  Singles: yes

Bowtie2:
  # Any Bowtie2 option can be specified explicitly here.
  # Quality scores: --phred33 (default) or --phred64 or --solexa-quals or --int-quals
  --phred33: yes  # Sets the option. Bowtie2 default.
  # NOTE(review): the commented line below presumably illustrates that an option
  # key with an empty value also sets the flag (i.e. '--phred33:') — confirm.
  #--phred:       # works too.


Metaphlan:
  # Before running MetaPhlAn, libraries from a given sample can be pooled (abundances
  # are calculated for the pooled data).
  # 'Pool' may be given only once. Three alternative forms are shown below: keep
  # exactly one of them uncommented and edit the sample names accordingly. Comment
  # all of them out if libraries shall not be pooled.

  # A single sample name, if all sequence datasets available for that sample shall
  # be considered together:
  # Pool: SamplePE

  # The special value '*' will pool libraries within each sample (quotes required):
  # Pool: '*'

  # Lists are also allowed:
  Pool:
    - SamplePE
    - SampleSE

  # Any MetaPhlAn command-line option can be given here:
  --ignore_eukaryotes:   # comment out to include eukaryotes (MetaPhlAn2 only)
  --ignore_viruses:      # comment out to include viruses (MetaPhlAn2 only)



# ----------------------------------------------- #
# Statistical Analysis of the taxonomic profiles  #
# ----------------------------------------------- #

# To reuse a taxonomic table from a previous run and perform additional analyses
# on it, specify the path to the input table under the 'run_from_table:' key:
run_from_table: path/to/relative/abundance/table/all_taxa.tsv # the name of the table can be changed

Krona:
  # Options for the Krona visualization files.
  #run: no  # will not produce Krona visualization files when uncommented
  # NOTE(review): '-a' is already active below with the value 'no', although its
  # comment speaks of uncommenting it — confirm how metaBIT treats 'no' here.
  -a: no    # If you uncomment, Krona charts will require an internet connection to use Krona resources.

Statax:
  # Statistical analyses to be run. Each key directly under 'Statax:' names one
  # set of analyses.
  MyAnalysis1:  # Name used as a prefix for all output files corresponding to the statistical analyses requested.
    # Taxonomic levels to be considered ("k" kingdom, "p" phylum, "c" class, "o" order,
    # "f" family, "g" genus, "s" species, "t" strain). Default: pcofgs
    taxlevels: pcofgs
    filterout: 1  # removes taxa with relative abundances <1% across all the samples analysed. Default = 1
    # Compute diversity index for each sample and taxlevel:
    doDiv:
      # Under "doDiv:", you can give any command-line option of the corresponding
      # R script, which can be found in the metaBIT package in
      # nodes/tools/statax_Rmodule/doDiv.R (see doDiv.R --help).
      # The same is valid for "doBarplot", "doHeatmap", "doPcoa", "doClust".

      # default index is shannon; simpson and invsimpson are also allowed
      --index: shannon
    # Create barplots of abundances at all taxlevels:
    doBarplot:
    # Create heatmaps of abundances at all taxlevels:
    doHeatmap:
    # Perform Principal Coordinates Analysis at all taxlevels:
    doPcoa:
      # default distance method = bray
      # Other distances can be selected: "manhattan", "euclidean", "canberra", "bray",
      # "kulczynski", "jaccard", "gower", "altGower", "morisita", "horn", "mountford",
      # "raup", "binomial", "chao", "cao", and "mahalanobis".
      --distance: bray
    # Perform Hierarchical clustering at all taxlevels:
    doClust:
      # default distance method = bray
      --method.dist: bray
      # All distances above are also allowed, as well as "correlation", "uncentered",
      # "abscor", "maximum", "binary" and "minkowski".
      # default clustering method = average
      --method.hclust: average
      # Other methods are allowed: "average", "ward", "single", "complete", "mcquitty", "median" and "centroid"
      # default number of pseudoreplicates for approximately unbiased and bootstrap supports = 10000.
      --nboot: 10000

  # A second set of analyses is a sibling of MyAnalysis1: it must be indented at the
  # same level, otherwise its options are read as (duplicate) options of MyAnalysis1.
  MyAnalysis2:  # Name of another set of statistical analyses, if another set is requested.
    # Samples can be added for comparison if the path to the corresponding table of
    # relative abundances generated in a previous metaBIT run is indicated.
    # Example: MetaPhlAn 1 and 2 microbial profiles of human body sites by the Human Microbiome
    # Project (HMP_690.tsv and HMPII_689.tsv), or of soils (Soil.tsv and SoilII.tsv).
    # These comparative tables are provided as part of the metaBIT package.

    merge:  # An indented list of previously obtained tables of taxon relative abundances
      - Path/to/previous/all_taxa.tsv

    #doDiv: # Uncomment to compute diversity indices.
    doBarplot:
      --order: euclidean  # Reorder samples according to the Euclidean distance between samples.
    doClust:
      --ncores: 4  # number of cores for parallelization.
    doPcoa:
      --makefile: pcoa_symbols_and_color.R  # A file that controls point and legend formats.
    doHeatmap:

Lefse:
  # Settings for the LEfSe analysis.
  # Disabled by default.
  # A typical use is to run the pipeline first without running LEfSe, and then run
  # LEfSe using groups established by the PCoA/hierarchical clustering (Statax section).
  run: no

  # Optionally add comparative samples. Must be a list.
  merge:
    - Comparative_all_taxa.tsv

  # Name of the output folder (set to 'lefse' here):
  outdir: lefse


  format: pdf  # Format for output plots. Default = pdf. Other values: png, svg.
  format_input:
    # Options for lefse/format_input.py.
    # None is required (metaBIT will set -c -s and -u automatically).
    -o: 1000000  # Rescales abundances to sum to 1,000,000. Default = 1,000,000.
  run_lefse:
    # Any option for lefse/run_lefse.py (see https://bitbucket.org/nsegata/lefse/src).
  plot_res: # Any option for plot_res.py (see https://bitbucket.org/nsegata/lefse/src).
  plot_cladogram: # Any option for plot_cladogram.py (see https://bitbucket.org/nsegata/lefse/src).
  plot_features: # Any option for plot_features.py (see https://bitbucket.org/nsegata/lefse/src).

  # Groups including a minimum of 2 samples each MUST be defined a priori for running LEfSe analyses.
  Groups:
    MyGroup1: # e.g. Oceanic water
      MySubGroup1-1: # Optional level, required for all Groups if declared in one; e.g. surface.
        # Each element of the list must be a legal column name in the table of relative
        # abundances. If you are using the default table, column names are:
        # {samplename}_{libraryname}
        # If the sample has been pooled, the column name is just the sample name.
        - Sample1_Lib1
        - Sample1_Lib2
      MySubGroup1-2: # Optional level, required for all Groups if declared in one; e.g. mesopelagic.
        - Sample2_Lib1
        - Sample2_Lib2
    MyGroup2: # e.g. Soil
      MySubGroup2-1: # Optional level, required for all Groups if declared in one; e.g. tropical.
        - Sample3_Lib1
        - Sample3_Lib2
      MySubGroup2-2: # Optional level, required for all Groups if declared in one; e.g. polar.
        - Sample4_Lib1
        - Sample4_Lib2

Updated