Wiki
Clone wikisequencing / Applications_Map_to_SureSelect
library(GenomicRanges) library(gtools) ################## ### Inputs ### ################## liftOver.folder <- var1 extraction.file <- var2 ##################### ### Functions ### ##################### read.data <- function(file, col.names){ # Read data data.table <- read.table(file, header = F, stringsAsFactors = F, sep = "\t" ) # Add colnames and remove non standard chromosomes colnames(data.table) <- col.names standard.chrm <- paste0("chr", c(1:22, "X", "Y", "M")) data.table <- data.table[data.table$chr %in% standard.chrm, ] # Sort data by chr and start data.table$chr <- factor(data.table$chr, standard.chrm, ordered=TRUE) data.out <- data.table[do.call(order, data.table[, c("chr", "start")]),] return(data.out) } methylConvert <- function(meth, key){ meth$end <- meth$start meth$nC <- meth$cov * meth$meth meth$nT <- meth$cov - meth$nC meth <- meth[,c("chr", "start", "end", "strand", "cov", "meth", "nC", "nT")] tot <- 5302795 stats <- data.frame(key = key, nCpGs = nrow(meth), cov.3 = sum(meth$cov >=3), cov.5 = sum(meth$cov >=5), cov.10 = sum(meth$cov >=10), cov.25 = sum(meth$cov >=25), cov.30 = sum(meth$cov >=30), perc.nCpGs = (nrow(meth)/tot)*100, perc.cov.3 = (sum(meth$cov >=3)/tot)*100, perc.cov.5 = (sum(meth$cov >=5)/tot)*100, perc.cov.10 = (sum(meth$cov >=10)/tot)*100, perc.cov.25 = (sum(meth$cov >=25)/tot)*100, perc.cov.30 = (sum(meth$cov >=30)/tot)*100, stringsAsFactors = F) return(stats) } ################## ### Script ### ################## setwd(document.dir) # Get SureSelect target data and convert them into a GRanges object cat("Get SureSelect target data and convert them into a GRanges object\n") sure.select.table <- read.data(file.path(liftOver.folder, "output.bed"), c("chr", "start", "end", "id")) sure.select.gr <- with(sure.select.table, GRanges(chr, IRanges(start = start, end = end, names = id))) # Get extraced CpGs cat("Get extraced CpGs\n") extracted.cpg.files <- read.table(extraction.file, header = T, stringsAsFactors = F, sep = "\t") # Initialize stats matrix, bed matrix and mapping list cat("Initialize stats matrix, bed matrix and mapping list\n") extraction.stats <- data.frame(key = character(0), nCpGs = integer(0), cov.3 = integer(0), cov.5 = integer(0), cov.10 = integer(0), cov.25 = integer(0), cov.30 = integer(0), perc.nCpGs = double(0), perc.cov.3 = double(0), perc.cov.5 = double(0), perc.cov.10 = double(0), perc.cov.25 = double(0), perc.cov.30 = double(0), stringsAsFactors = F) cpg.array.out <- list() bed.files <- data.frame(Key = character(nrow(extracted.cpg.files)), File = character(nrow(extracted.cpg.files)), stringsAsFactors = F) cat("Mapping CpGs to SureSelect targets") bed.dir <- file.path(document.dir, "BED_Files") if (!dir.exists(bed.dir)){ dir.create(bed.dir) } for(i in 1:nrow(extracted.cpg.files)){ # Get dataset info file <- as.character(extracted.cpg.files$File[i]) key <- as.character(extracted.cpg.files$Key[i]) cat("\n- Mapping sample", key) # Get extracted CpG methylation data and convert them into a GRanges object data.table <- read.data(file, c("chr", "start", "strand", "cov", "meth")) ids <- paste0(data.table$chr, ":", data.table$start, "-", data.table$start) duplicates <- which(duplicated(ids)) if (length(duplicates) > 0){ data.table <- data.table[-duplicates,] ids <- ids[-duplicates] } rownames(data.table) <- ids data.gr <- with(data.table, GRanges(chr, IRanges(start = start, end = start), names = rownames(data.table), strand = strand, cov = cov, meth = meth)) # Map CpG to SureSelect targets mapped.to.target.gr <- intersect(data.gr, sure.select.gr, ignore.strand = T) mapped.to.target <- as.data.frame(mapped.to.target.gr) mapped.ids <- unlist(apply(mapped.to.target, 1, function(x) paste0(x[1], ":", x[2]:x[3], "-", x[2]:x[3]))) # Filter mapped data and add to list cpg.table.out <- data.table[mapped.ids,] rownames(cpg.table.out) <- NULL cpg.array.out[[key]] <- cpg.table.out cpg.table.bed <- cpg.table.out[,c("chr", "start", "start", "meth")] cpg.table.bed$meth <- cpg.table.bed$meth*100 cpg.table.bed$nCs <- round(cpg.table.out$meth * cpg.table.out$cov) cpg.table.bed$nTs <- cpg.table.out$cov - cpg.table.bed$nCs bed.files$Key[i] <- key bed.files$File[i] <- paste0(key, ".bed") write.table(cpg.table.bed, file.path(bed.dir, bed.files$File[i]), sep = "\t", row.names = F, col.names = F, quote = F) # Get coverage statistics stats.before <- methylConvert(data.table, paste0(key, "_before_mapping")) stats.after <- methylConvert(cpg.table.out, paste0(key, "_after_mapping")) extraction.stats <- rbind(extraction.stats, stats.before, stats.after) } # Get patients information cat("\nGet patients' information\n") info <- strsplit(bed.files$Key, split="_") bed.files$TumorStage <- "Primary" bed.files$TumorStage[as.logical(unlist(lapply(info, function(x) length(grep("i", x[2])>0))))] <- "Interval" bed.files$Organ <- unlist(lapply(info, function(x) x[3])) bed.files$TumorStage[grep("EOC", bed.files$Key)] <- "Epithelial_OvCa" bed.files$TumorStage[grep("FTE", bed.files$Key)] <- "Fallopian_Tube" bed.files$TumorStage[grep("OSE", bed.files$Key)] <- "Ovary" bed.files$Organ[grep("EOC", bed.files$Key)] <- "ov" bed.files$Organ[grep("EOC59", bed.files$Key)] <- "per" bed.files$Organ[grep("FTE", bed.files$Key)] <- "ft" bed.files$Organ[grep("OSE", bed.files$Key)] <- "ov" # Write outputs cat("Write outputs") array.out <- cpg.array.out table.out <- extraction.stats write.table(bed.files, file.path(document.dir, "OvCa_BED_Files.csv"), sep = ",", row.names = F, col.names = T, quote = F)
Updated