Wiki

Clone wiki

sequencing / Applications_Map_to_SureSelect

library(GenomicRanges)
library(gtools)

##################
###   Inputs   ###
##################
liftOver.folder <- var1
extraction.file <- var2

#####################
###   Functions   ###
#####################
read.data <- function(file, col.names){
  # Read data
  data.table <- read.table(file, header = F, stringsAsFactors = F, sep = "\t" )

  # Add colnames and remove non standard chromosomes
  colnames(data.table) <- col.names
  standard.chrm <- paste0("chr", c(1:22, "X", "Y", "M"))
  data.table <- data.table[data.table$chr %in% standard.chrm, ]

  # Sort data by chr and start
  data.table$chr <- factor(data.table$chr, standard.chrm, ordered=TRUE)
  data.out <- data.table[do.call(order, data.table[, c("chr", "start")]),]
  return(data.out)
}

methylConvert <- function(meth, key){
  meth$end <- meth$start
  meth$nC <- meth$cov * meth$meth
  meth$nT <- meth$cov - meth$nC

  meth <- meth[,c("chr", "start", "end", "strand", "cov", "meth", "nC", "nT")]

  tot <- 5302795
  stats <- data.frame(key   = key,
                      nCpGs = nrow(meth),
                      cov.3 = sum(meth$cov >=3),
                      cov.5 = sum(meth$cov >=5),
                      cov.10 = sum(meth$cov >=10),
                      cov.25 = sum(meth$cov >=25),
                      cov.30 = sum(meth$cov >=30),
                      perc.nCpGs = (nrow(meth)/tot)*100,
                      perc.cov.3 = (sum(meth$cov >=3)/tot)*100,
                      perc.cov.5 = (sum(meth$cov >=5)/tot)*100,
                      perc.cov.10 = (sum(meth$cov >=10)/tot)*100,
                      perc.cov.25 = (sum(meth$cov >=25)/tot)*100,
                      perc.cov.30 = (sum(meth$cov >=30)/tot)*100,
                      stringsAsFactors = F)
  return(stats)
}

##################
###   Script   ###
##################
setwd(document.dir)

# Get SureSelect target data and convert them into a GRanges object
cat("Get SureSelect target data and convert them into a GRanges object\n")
sure.select.table <- read.data(file.path(liftOver.folder, "output.bed"), c("chr", "start", "end", "id"))
sure.select.gr <- with(sure.select.table, GRanges(chr, IRanges(start = start, end = end, names = id)))

# Get extraced CpGs
cat("Get extraced CpGs\n")
extracted.cpg.files <- read.table(extraction.file, header = T, stringsAsFactors = F, sep = "\t")

# Initialize stats matrix, bed matrix and mapping list
cat("Initialize stats matrix, bed matrix and mapping list\n")
extraction.stats <- data.frame(key   = character(0),
                               nCpGs = integer(0),
                               cov.3 = integer(0),
                               cov.5 = integer(0),
                               cov.10 = integer(0),
                               cov.25 = integer(0),
                               cov.30 = integer(0),
                               perc.nCpGs = double(0),
                               perc.cov.3 = double(0),
                               perc.cov.5 = double(0),
                               perc.cov.10 = double(0),
                               perc.cov.25 = double(0),
                               perc.cov.30 = double(0),
                               stringsAsFactors = F) 
cpg.array.out <- list()

bed.files <- data.frame(Key = character(nrow(extracted.cpg.files)),
                        File = character(nrow(extracted.cpg.files)),
                        stringsAsFactors = F)

cat("Mapping CpGs to SureSelect targets")

bed.dir <- file.path(document.dir, "BED_Files")
if (!dir.exists(bed.dir)){
  dir.create(bed.dir)
}

for(i in 1:nrow(extracted.cpg.files)){
  # Get dataset info
  file <- as.character(extracted.cpg.files$File[i])
  key <- as.character(extracted.cpg.files$Key[i])

  cat("\n- Mapping sample", key)

  # Get extracted CpG methylation data and convert them into a GRanges object
  data.table <- read.data(file, c("chr", "start", "strand", "cov", "meth"))
  ids <- paste0(data.table$chr, ":", data.table$start, "-", data.table$start)
  duplicates <- which(duplicated(ids))
  if (length(duplicates) > 0){
    data.table <- data.table[-duplicates,]
    ids <- ids[-duplicates]
  }
  rownames(data.table) <- ids
  data.gr <- with(data.table, GRanges(chr, IRanges(start = start, end = start), names = rownames(data.table),
                                      strand = strand, cov = cov, meth = meth))

  # Map CpG to SureSelect targets
  mapped.to.target.gr <- intersect(data.gr, sure.select.gr, ignore.strand = T)
  mapped.to.target <- as.data.frame(mapped.to.target.gr)

  mapped.ids <- unlist(apply(mapped.to.target, 1, function(x) paste0(x[1], ":", x[2]:x[3], "-", x[2]:x[3])))

  # Filter mapped data and add to list
  cpg.table.out <- data.table[mapped.ids,]
  rownames(cpg.table.out) <- NULL
  cpg.array.out[[key]] <- cpg.table.out

  cpg.table.bed <- cpg.table.out[,c("chr", "start", "start", "meth")]
  cpg.table.bed$meth <- cpg.table.bed$meth*100
  cpg.table.bed$nCs <- round(cpg.table.out$meth * cpg.table.out$cov)
  cpg.table.bed$nTs <- cpg.table.out$cov - cpg.table.bed$nCs

  bed.files$Key[i] <- key
  bed.files$File[i] <- paste0(key, ".bed")
  write.table(cpg.table.bed, file.path(bed.dir, bed.files$File[i]), sep = "\t",
              row.names = F, col.names = F, quote = F)

  # Get coverage statistics
  stats.before <- methylConvert(data.table, paste0(key, "_before_mapping"))
  stats.after <- methylConvert(cpg.table.out, paste0(key, "_after_mapping"))
  extraction.stats <- rbind(extraction.stats, stats.before, stats.after)
}

# Get patients information
cat("\nGet patients' information\n")
info <- strsplit(bed.files$Key, split="_")

bed.files$TumorStage <- "Primary"
bed.files$TumorStage[as.logical(unlist(lapply(info, function(x) length(grep("i", x[2])>0))))] <- "Interval"

bed.files$Organ <- unlist(lapply(info, function(x) x[3]))

bed.files$TumorStage[grep("EOC", bed.files$Key)] <- "Epithelial_OvCa"
bed.files$TumorStage[grep("FTE", bed.files$Key)] <- "Fallopian_Tube"
bed.files$TumorStage[grep("OSE", bed.files$Key)] <- "Ovary"

bed.files$Organ[grep("EOC", bed.files$Key)] <- "ov"
bed.files$Organ[grep("EOC59", bed.files$Key)] <- "per"
bed.files$Organ[grep("FTE", bed.files$Key)] <- "ft"
bed.files$Organ[grep("OSE", bed.files$Key)] <- "ov"

# Write outputs
cat("Write outputs")
array.out <- cpg.array.out
table.out <- extraction.stats
write.table(bed.files, file.path(document.dir, "OvCa_BED_Files.csv"), sep = ",",
            row.names = F, col.names = T, quote = F)

Back to Methylation Preprocessing

Updated