# 18 Dec 2010, 17h00 # Code from Mark Robinson, 13, 16 Dec 2010 # source("/projects/remc_bigdata/Karsan/R/20101218/edgeR_TSS-rgns_DE-H4ac.20101218.R") #install.packages("/home/grobertson/linux_x86_64/R/Rsamtools_1.2.2.tar.gz",type="source") #install.packages("/home/grobertson/linux_x86_64/R/IRanges_1.8.7.tar.gz",type="source") ############################################################################## options(stringsAsFactors=FALSE) library(rtracklayer) session <- browserSession("UCSC") genome(session) <- "hg18" #trackNames(session) ## list the track names q <- ucscTableQuery(session, "knownGene") print(date()) knownGene <- getTable(q) print(date()) # merge TSSs with identical starts knownGene$position <- ifelse(knownGene$strand=="+", knownGene$txStart, knownGene$txEnd) key <- paste(knownGene$chrom, knownGene$position, sep=":") ukey <- unique(key) m <- match(ukey, key) mkg <- knownGene[m,c("chrom","strand","txStart","txEnd","name","position")] # mkg = merged known genes # create merged IDs rownames(mkg) <- ukey ids <- split(knownGene$name, key) collapsedids <- sapply(ids, paste, collapse=",") mkg$newID <- "" mkg[names(collapsedids),"newID"] <- collapsedids # merge nearby TSSs tssTol <- 500 # if distance b/w TSSs is less than this, they are merged splitTol <- 250000 lockey <- paste(mkg$chrom, mkg$strand, sep=":") ind <- seq_len(nrow(mkg)) inds <- split(ind, lockey) clustno <- lapply(inds, FUN=function(u) { w <- diff(mkg$position[u]) > splitTol z <- cumsum(c(0,w)) s <- split(u, z) cat(".") clusts <- lapply(s, FUN=function(uu) { if(length(uu)==1) return(1) d <- dist(mkg$position[uu]) h <- hclust(d,"ave") cutree(h,h=tssTol) }) paste( rep(names(s), sapply(s,length)), unlist(clusts, use.names=FALSE), sep=".") }) clustno <- unsplit( clustno, lockey ) clustkey <- paste( mkg$chrom, mkg$strand, clustno, sep=":" ) uckey <- unique(clustkey) mc <- match(uckey, clustkey) mkg1 <- mkg[mc,] rownames(mkg1) <- uckey ids <- split(mkg$newID, clustkey) collapsedids <- sapply(ids, paste, collapse=";") mkg1$newID <- "" mkg1[names(collapsedids),"newID"] <- collapsedids rownames(mkg1) <- NULL anno <- data.frame(chr=mkg1$chrom, strand=mkg1$strand, start=mkg1$txStart, end=mkg1$txEnd, name=mkg1$name, allIDs=mkg1$newID) cat("\n",date(),"\n") ############################################################################## print("read in BAM files") library(Rsamtools) library(Repitools) library(edgeR) library(BSgenome.Hsapiens.UCSC.hg18) print("some parameter settings") # need to add a failed-chastity filter and a MAPQ filter #p <- ScanBamParam(what=c("rname", "strand", "pos"), flag=scanBamFlag(isUnmappedQuery=FALSE,isDuplicate=FALSE)) # 16 Dec 2010, filter by MAPQ p <- ScanBamParam(what=c("rname", "strand", "pos", "mapq"), flag=scanBamFlag(isUnmappedQuery=FALSE,isDuplicate=FALSE)) chrNames <- paste("chr", c(1:22, "X", "Y"), sep = "") fragSize <- 200 ############################################################################## # 16 Dec 2010 #readLen <- c(50,75,50,50) # vector to match filenames #filenames <- c("/projects/remc_bigdata/Karsan/HS1238_kd/maq2sam/HS1238.h.sorted.bam", # "/projects/remc_bigdata/Karsan/HS1235_ctl/maq2sam/HS1235.h.sorted.bam", # "/projects/analysis/analysis5/HS1238/62P7NAAXX_2/bwa/62P7NAAXX_2.bam", # "/projects/analysis/analysis5/HS1235/62P7NAAXX_1/bwa/62P7NAAXX_1.bam") ############################################################################## # 18 Dec 2010 - just 'lane-2' data readLen <- c(50,50) # vector to match filenames filenames <- c("/projects/analysis/analysis5/HS1238/62P7NAAXX_2/bwa/62P7NAAXX_2.bam", "/projects/analysis/analysis5/HS1235/62P7NAAXX_1/bwa/62P7NAAXX_1.bam") print("now read in .bam files") ############################################################################## # 16 Dec 2010: old way without filtering by MAPQ #gr <- mapply(FUN=function(u,v) { # sb <- scanBam(u, param=p)[[1]] # GRanges(seqnames=paste("chr",sb$rname,sep=""), ranges=IRanges(start=sb$pos,width=readLen), strand=sb$strand) # },filenames,readLen) #names(gr) <- c("HS1238_kd","HS1235_ctl","HS1238_kd_2","HS1235_ctl_2") #names(gr) <- c("HS1238_kd_2","HS1235_ctl_2") ############################################################################## # 16 Dec, Mark Robinson: add MAPQ filter here # 18 Dec 2010 - use for lane-2 data gr <- mapply(FUN=function(u,v) { sb <- scanBam(u, param=p)[[1]] w <- sb$mapq > 10 # SET CUTOFF HERE cat(u, ": Read", length(sb$rname), "mapped non-dup. reads,", round(mean(w)*100,2), "percent with MAPQ>10\n") GRanges(seqnames=paste("chr",sb$rname[w],sep=""), ranges=IRanges(start=sb$pos[w],width=v), strand=sb$strand[w]) },filenames,readLen) #names(gr) <- c("HS1238_kd","HS1235_ctl","HS1238_kd_2","HS1235_ctl_2") names(gr) <- c("HS1238_kd_2","HS1235_ctl_2") ############################################################################## grl <- GRangesList(gr) print("TSS regions: get counts") #cat("bpUp=1500, bpDown=1000", "\n") #counts <- annotationCounts(grl, anno, bpUp=1500, bpDown=1000, seqLen=fragSize, verbose=TRUE) #cat("bpUp=2000, bpDown=1000", "\n") #counts <- annotationCounts(grl, anno, bpUp=2000, bpDown=1000, seqLen=fragSize, verbose=TRUE) cat("bpUp=2500, bpDown=1000", "\n") counts <- annotationCounts(grl, anno, bpUp=2500, bpDown=1000, seqLen=fragSize, verbose=TRUE) #cat("bpUp=2500, bpDown=1500", "\n") #counts <- annotationCounts(grl, anno, bpUp=2500, bpDown=1500, seqLen=fragSize, verbose=TRUE) colSums(counts) #HS1238_kd HS1235_ctl # 3070716 3288149 k <- rowSums(counts) > 10 f <- calcNormFactors(counts[k,]) ############################################################################## #print("estimate common dispersion as 'dhack'") #dhack <- d <- DGEList(counts=counts[k,], # group=colnames(counts), # lib.size=colSums(counts[k,])*f, # genes=anno[k,1:3]) # #genes=as.data.frame(windows)[k,1:3]) #dhack$samples$group <- factor(c("A","A")) #dhack <- estimateCommonDisp(dhack) #d <- estimateCommonDisp(d) # in my example, no bio replicates, will get warning #d$common.dispersion <- dhack$common.dispersion #cat("TSSrgns: dhack$common.dispersion=",dhack$common.dispersion,"\n") ############################################################################## # Old way, without 'dhack', for a single run pair # 18 Dec 2010 - resusing this for lane-2 data d <- DGEList(counts=counts[k,], group=colnames(counts), lib.size=colSums(counts[k,])*f, genes=anno[k,1:3]) d <- estimateCommonDisp(d) cat("d$common.dispersion=",d$common.dispersion,"\n") print("exactTest()") de <- exactTest(d,pair=c("HS1235_ctl_2","HS1238_kd_2")) topTags(de) xx <- cbind(anno[k,-5], counts[k,], id=rownames(de$table), de$table, adjp=p.adjust(de$table$p.value, method="BH") ) print("write.table, all records, regardless of p-val") #write.table(xx, "/projects/remc_bigdata/Karsan/R/20101218/HS1238_vs_1235.lane-2.mapq10-noDups.knownGene.merge-500.2p5kb-TSS-1p5kb.mapq10_noDups.edgeR.all.20101218.txt", sep="\t", row.names=FALSE, quote=FALSE) write.table(xx, "/projects/remc_bigdata/Karsan/R/20101218/HS1238_vs_1235.lane-2.mapq10-noDups.knownGene.merge-500.2p5kb-TSS-1kb.mapq10_noDups.edgeR.all.20101218.txt", sep="\t", row.names=FALSE, quote=FALSE) #write.table(xx, "/projects/remc_bigdata/Karsan/R/20101218/HS1238_vs_1235.lane-2.mapq10-noDups.knownGene.merge-500.1p5kb-TSS-1kb.mapq10_noDups.edgeR.all.20101218.txt", sep="\t", row.names=FALSE, quote=FALSE) #write.table(xx, "/projects/remc_bigdata/Karsan/R/20101218/HS1238_vs_1235.lane-2.mapq10-noDups.knownGene.merge-500.2kb-TSS-1kb.mapq10_noDups.edgeR.all.20101218.txt", sep="\t", row.names=FALSE, quote=FALSE) print(date()) cat("save.image()\n") #save.image("Robjects.2p5kb-TSS-1p5kb.2010-12-18-Vancouver.Rdata") # save all objects in a single file save.image("Robjects.2p5kb-TSS-1kb.2010-12-18-Vancouver.Rdata") # save all objects in a single file #save.image("Robjects.2kb-TSS-1kb.2010-12-18-Vancouver.Rdata") # save all objects in a single file #save.image("Robjects.1.5kb-TSS-1kb.2010-12-18-Vancouver.Rdata") # save all objects in a single file ############################################################################## # 4 lanes, as replicates, 16 Dec 2010 #cat("\n","process as replicates","\n") #d.rep <- DGEList(counts=counts[k,], # group=c("HS1238_kd","HS1235_ctl","HS1238_kd","HS1235_ctl"), # lib.size=colSums(counts[k,])*f, # genes=anno[k,1:3]) #d.rep <- estimateCommonDisp(d.rep) #cat("d.rep$common.dispersion=",d.rep$common.dispersion,"\n") # this can be estimated now, but should be v. small #print("exactTest()") #de.rep <- exactTest(d.rep,pair=c("HS1235_ctl","HS1238_kd")) #topTags(de.rep) #xx <- cbind(anno[k,-5], counts[k,], id=rownames(de.rep$table), de.rep$table,adjp=p.adjust(de.rep$table$p.value, method="BH") ) #print("write.table, all records, regardless of p-val") #write.table(xx, "/projects/remc_bigdata/Karsan/R/20101216/HS1238_vs_1235.4-Lanes-asReplicates.mapq10-noDups.knownGeneTSS-2.0to1kb.edgeR.all.20101216.txt", sep="\t", row.names=FALSE, quote=FALSE) ############################################################################## ### 4 lanes, separate, 16 Dec 2010 ########################################## #cat("\n","process as separate old and new run pairs","\n") #d.sep <- DGEList(counts=counts[k,], # group=c("HS1238_kd","HS1235_ctl","HS1238_kd_2","HS1235_ctl_2"), # lib.size=colSums(counts[k,])*f, # genes=anno[k,1:3]) #d.sep <- estimateCommonDisp(d.sep) # this will give warning #de.sep.old <- exactTest(d.sep,pair=c("HS1235_ctl","HS1238_kd")) #topTags(de.sep.old) #de.sep.new <- exactTest(d.sep,pair=c("HS1235_ctl_2","HS1238_kd_2")) #topTags(de.sep.new) #xx.sep.old <- cbind(anno[k,-5], counts[k,], id=rownames(de.sep.old$table), de.sep.old$table,adjp=p.adjust(de.sep.old$table$p.value, method="BH") ) #print("write.table, all records, regardless of p-val") #write.table(xx.sep.old, "/projects/remc_bigdata/Karsan/R/20101216/HS1238_vs_1235.4-Lanes-separate-OLD.mapq10-noDups.knownGeneTSS-2.0to1kb.edgeR.all.20101216.txt", sep="\t", row.names=FALSE, quote=FALSE) #xx.sep.new <- cbind(anno[k,-5], counts[k,], id=rownames(de.sep.new$table), de.sep.new$table,adjp=p.adjust(de.sep.new$table$p.value, method="BH") ) #print("write.table, all records, regardless of p-val") #write.table(xx.sep.new, "/projects/remc_bigdata/Karsan/R/20101216/HS1238_vs_1235.4-Lanes-separate-NEW.mapq10-noDups.knownGeneTSS-2.0to1kb.edgeR.all.20101216.txt", sep="\t", row.names=FALSE, quote=FALSE) #print("Now compare the old KD/ctl to the new KD/ctl pair") #print("compare p-values / log-fold changes") #print("Use Z-scores") #z.old <- qnorm(de.sep.old$table$p.value/2)*sign(de.sep.old$table$logFC) #z.new <- qnorm(de.sep.new$table$p.value/2)*sign(de.sep.new$table$logFC) #print("write out PDF") #pdf("/projects/remc_bigdata/Karsan/R/20101216/Zscores_old_new.20101216.pdf", width=1000, height=1000) #pdf("/projects/remc_bigdata/Karsan/R/20101216/Zscores_old_new.mapq10-noDups.20101216.pdf") #plot(z.old, z.new, xlab="H4AC diff. enrich. Z-scores (old)", # ylab="H4AC diff. enrich. Z-scores (new)", pch=19, cex=.4) #dev.off() #print("Alternative for comparing tech replicates: a simple pairs() plot") #p <- function(x, y, ...) #{ # points(x, y, pch = 19, cex = 0.5) # text(8, 15, paste("r =", round(cor(x, y,method="spearman"), 2)), col = "blue", cex = 2) #} #pdf("/projects/remc_bigdata/Karsan/R/20101216/pairs_KD_CTL_old_new.20101216.pdf", width=1000, height=1000) #pdf("/projects/remc_bigdata/Karsan/R/20101216/pairs_KD_CTL_old_new.mapq10-noDups.20101216.pdf") #pairs(log2(counts+1), lower.panel = NULL, upper.panel = p) #dev.off()