sparkseq / Pipelines

Under development

Feature counts with filtering

  import pl.elka.pw.sparkseq.seqAnalysis.SparkSeqAnalysis
  import pl.elka.pw.sparkseq.conversions.SparkSeqConversions
  import pl.elka.pw.sparkseq.util.SparkSeqRegType._
  
  /*initiate a SparkSeqAnalysis object and attach samples to it*/
  val seqAnalysis = new SparkSeqAnalysis(sc,"/BAM/64MB/derfinder/chrom_Y/M/orbFrontalF1_Y.bam",1,1,1)
  val caseId = Array(11, 32, 3, 42, 43, 47, 53, 58)
  for(i<-caseId)
    seqAnalysis.addBAM(sc,"/BAM/64MB/derfinder/chrom_Y/M/orbFrontalF"+i.toString+"_Y.bam",i,1)
  /*filter out reads with low mapping quality, i.e. keep only reads with PHRED > 30*/
  seqAnalysis.filterMappingQuality(_ > 30)
  /*CIGAR regular expression for filtering out alignment gaps > 5000bp */
  val cigRegex=
    "^([0-9]+[MEQISXDHP]+)+((0*([0-9]{1,3}|[1-4][0-9]{3}))?N?[0-9]+M[0-9]*[EQISXDHP]*){0,3}[0-9]*[MEQISXDHP]*$".r
  seqAnalysis.filterCigarString(cigRegex.pattern.matcher(_).matches)
  /* get feature counts */
  /*exons*/
  val genExonsMapB = sc.broadcast(SparkSeqConversions.BEDFileToHashMap(sc,
    "/BAM/64MB/aux/Homo_sapiens.GRCh37.74_exons_chr_sort_uniq.bed"))
  val exonCounts = seqAnalysis.getCoverageRegion(genExonsMapB, unionMode=true)
  /*genes*/
  val genesMapB = sc.broadcast(SparkSeqConversions.BEDFileToHashMap(sc,
    "/BAM/64MB/aux/Homo_sapiens.GRCh37.74_genes_chr_merged_swap.bed"))
  val genesCount = seqAnalysis.getCoverageRegion(genesMapB, unionMode=true)
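
The counts stay in Spark RDDs, so they can be inspected or persisted with standard RDD operations. A minimal sketch, assuming each result holds (region id, count) pairs with the count as the second element; the top-10 cut-off and the output paths are illustrative only:

  /*print the ten highest-covered exons and genes
    (assumes each record is a (region id, count) pair)*/
  exonCounts.map(_.swap).sortByKey(ascending = false).take(10).foreach(println)
  genesCount.map(_.swap).sortByKey(ascending = false).take(10).foreach(println)
  /*persist the full count tables; the output paths are examples only*/
  exonCounts.saveAsTextFile("/BAM/64MB/out/exon_counts")
  genesCount.saveAsTextFile("/BAM/64MB/out/gene_counts")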

Junction counts with filtering

  import pl.elka.pw.sparkseq.seqAnalysis.SparkSeqAnalysis
  import pl.elka.pw.sparkseq.conversions.SparkSeqConversions
  import pl.elka.pw.sparkseq.util.SparkSeqRegType._
  import pl.elka.pw.sparkseq.junctions.SparkSeqJunctionAnalysis
  
  /*initiate a SparkSeqAnalysis object and attach samples to it*/
  val seqAnalysis = new SparkSeqAnalysis(sc,"/BAM/64MB/derfinder/chrom_Y/M/orbFrontalF1_Y.bam",1,1,1)
  val caseId = Array(11, 32, 3, 42, 43, 47, 53, 58)
  for(i<-caseId)
    seqAnalysis.addBAM(sc,"/BAM/64MB/derfinder/chrom_Y/M/orbFrontalF"+i.toString+"_Y.bam",i,1)
  /*filter out reads with low mapping quality, i.e. keep only reads with PHRED > 30*/
  seqAnalysis.filterMappingQuality(_ > 30)
  /*CIGAR regular expression for filtering out alignment gaps > 5000bp */
  val cigRegex=
    "^([0-9]+[MEQISXDHP]+)+((0*([0-9]{1,3}|[1-4][0-9]{3}))?N?[0-9]+M[0-9]*[EQISXDHP]*){0,3}[0-9]*[MEQISXDHP]*$".r
  seqAnalysis.filterCigarString(cigRegex.pattern.matcher(_).matches)
  /*get junction read counts and keep junctions supported by more than 10 reads*/
  val juncAnalysis = new SparkSeqJunctionAnalysis(seqAnalysis)
  val juncReadsCount = juncAnalysis.getJuncReadsCounts().filter(j => j._2 > 10)
  /*decode the encoded ids into sample ids and genomic coordinates*/
  val juncResults = juncReadsCount
    .map(j => (SparkSeqConversions.splitSampleID(j._1._1), SparkSeqConversions.splitSampleID(j._1._2), j._2))
    .map(j => ((j._1._1, SparkSeqConversions.idToCoordinates(j._1._2),
                SparkSeqConversions.idToCoordinates(j._2._2)), j._3))
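
juncResults now maps (sample id, junction start coordinates, junction end coordinates) to the number of supporting reads. A minimal sketch of writing it out as a tab-separated table, assuming idToCoordinates returns (chromosome, position) pairs; the output path is an example only:

  /*flatten each junction record into a tab-separated line and save it;
    coordinates are assumed to be (chromosome, position) pairs*/
  juncResults
    .map { case ((sampleId, juncStart, juncEnd), count) =>
      Array(sampleId, juncStart._1, juncStart._2, juncEnd._1, juncEnd._2, count).mkString("\t") }
    .saveAsTextFile("/BAM/64MB/out/junction_counts")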
