1. Davide Cittaro
  2. utils

Source

utils / makeScripts.sh

#!/bin/bash

# Look for binaries we need, exit if not found

#BWA=`which bwa`
NOVO=/data_n2/hmw244/Software/novocraftV2.07.00/novoalign
SAMTOOLS=`which samtools`
NOVO_OPTS='-i 200,50'
INDEX_DIR='/data/indexes'

[[ -z ${NOVO} ]] && exit 1
[[ -z ${SAMTOOLS} ]] && exit 1

# Use the SampleSheet.csv to create the script and align
# SampleSheets has a header we want to skip

nlines=(`wc -l SampleSheet.csv`)
tail -n $(( ${nlines[0]} - 1 )) SampleSheet.csv | while read line
do
  # There may be reasons to skip this line... i.e. when there's no indexed genome 
  Skip=1

  # Split the SampleSheet line using commas, store values we need (ID, Lane, Genome and Index)
  IFS=","
  lineArray=($line)
  Lane=`printf "%03d" ${lineArray[1]}`
  ID=${lineArray[2]}
  Genome=${lineArray[3]}
  if [ -z ${lineArray[4]} ]
  then 
    # If there's no index, set it to NoIndex, just like Illumina does
    Index="NoIndex"
  else 
    Index=${lineArray[4]}
  fi
  
  # Create the Experiment name, this is also the name Illumina gives to files
  expName=${ID}_${Index}_L${Lane}
  
  # Check how many chunks we have, this will be the number of SGE array tasks
  
  ntask=`ls ${ID}_${Index}_L${Lane}_R1_*.fastq.gz | wc -l`

  # Also check if this is paired end sequencing. This changes the bwa line
  isPE=0
  if [ -f ${ID}_${Index}_L${Lane}_R2_001.fastq.gz ]
  then
    isPE=1
  fi  
  
  # Ok, what if we don't have the genome?
#  [[ -d ${INDEX_DIR}/bwa/${Genome} ]] && Skip=0
  [[ -d ${INDEX_DIR}/novo/${Genome} ]] && Skip=0
  if [ $Skip -eq 1 ]
  then
    echo "Missing ${Genome}, you should correct the file by hand (align_${expName}) and qsub it"
    GFile="FIXME"
  else
    # If there's the genome, look for the proper index (assuming there's one index per directory ^__^ )
#    GFile=`ls ${INDEX_DIR}/bwa/${Genome}/*.bwt`
    GFile=`ls ${INDEX_DIR}/novo/${Genome}/*.nix`
#    GFile=${GFile%%.bwt}
    echo "Using ${GFile}"
  fi  

 
unset IFS 
# generate the bwa script
# 
cat > align_${expName}.sh << ENDOFSCRIPT
#$ -S /bin/bash
#$ -cwd
#$ -N align_${ID}_L${Lane}_$$
#$ -t 1-${ntask}
#$ -tc 12
#$ -V


# This line should be commented if running in grid engine. If not, it passes as argument a progressive number
SGE_TASK_ID=\$1
# chunks in file names can be mapped to SGE array task IDs
chunk=\`printf "%03d" \$SGE_TASK_ID\`

ENDOFSCRIPT

# Here we define the proper command, it's a miracle of subshells :-)
  if [ $isPE == 1 ]
  then
    #alignString="${BWA} sampe ${GFile} <(${BWA} aln -t 2 ${GFile} ${expName}_R1_\${chunk}.fastq.gz) <(${BWA} aln -t 2 ${GFile} ${expName}_R2_\${chunk}.fastq.gz) ${expName}_R1_\${chunk}.fastq.gz ${expName}_R2_\${chunk}.fastq.gz"
    alignString="${NOVO} -d ${GFile} -f ${expName}_R1_\${chunk}.fastq.gz ${expName}_R2_\${chunk}.fastq.gz ${NOVO_OPTS} -o SAM"
  else
    #alignString="${BWA} aln -t 2 ${GFile} ${expName}_R1_\${chunk}.fastq.gz | ${BWA} samse ${GFile} - ${expName}_R1_\${chunk}.fastq.gz"
    alignString="${NOVO} -d ${GFile} -f ${expName}_R1_\${chunk}.fastq.gz ${NOVO_OPTS} -o SAM"
  fi
alignString="${alignString} |  ${SAMTOOLS} view -Su - | ${SAMTOOLS} sort - ${expName}_\${chunk}"  

# Add the command to the bwa script
echo $alignString >> align_${expName}.sh

# generate the samtools script, this will merge the bams into one

cat > merge_${expName}.sh << ENDOFSCRIPT
#$ -S /bin/bash
#$ -cwd
#$ -hold_jid align_${ID}_L${Lane}_$$
#$ -N merge_${ID}_L${Lane}
#$ -V

# Merge chunks
${SAMTOOLS} merge -f ${expName}.bam ${expName}_*.bam

# Remove chunks
/bin/mv ${expName}.bam ${expName}.bam.lock
/bin/rm -f ${expName}_*.bam
/bin/mv ${expName}.bam.lock ${expName}.bam

# Index the final bam
${SAMTOOLS} index ${expName}.bam

ENDOFSCRIPT

# Everything alright? Qsub!
  if [ $Skip -eq 0 ]
  then
    qsub align_${expName}.sh
    qsub merge_${expName}.sh
  else
    echo "Remember to fix and resubmit align_${expName}.sh"
  fi

done