# 13 Dec 2010, 18h10
# Code from Mark Robinson
# source("/projects/remc_bigdata/Karsan/R/20101213/process_TSS_with_edgeR_20101213.R", verbose=TRUE)

# Ask R not to convert strings to factors when data frames are created
# later in this script.
options(stringsAsFactors=FALSE)

library(rtracklayer)
# Open a UCSC browser session and select the hg18 assembly for it.
session <- browserSession("UCSC")
genome(session) <- "hg18"
#trackNames(session) ## list the track names
# Build a table query for "knownGene" and fetch the table; a timestamp is
# printed immediately before and after the fetch.
q <- ucscTableQuery(session, "knownGene")
print(date())
knownGene <- getTable(q)
print(date())


# merge TSSs with identical starts
# The TSS coordinate ("position") is txStart for "+" strand transcripts and
# txEnd for everything else.
knownGene$position <- ifelse(knownGene$strand=="+", knownGene$txStart, knownGene$txEnd)
# NOTE(review): the key is chrom:position only, so transcripts on opposite
# strands that share a coordinate collapse into one entry which keeps the
# strand of the first row -- confirm this is intended.
key <- paste(knownGene$chrom, knownGene$position, sep=":")
ukey <- unique(key)
m <- match(ukey, key)  # index of the first row carrying each distinct key
mkg <- knownGene[m,c("chrom","strand","txStart","txEnd","name","position")]  # mkg = merged known genes

# create merged IDs
# Every transcript name sharing a key is joined with "," into newID.
rownames(mkg) <- ukey
ids <- split(knownGene$name, key)
# vapply() guarantees a character vector (sapply() would return a list for
# empty input); names(collapsedids) are the distinct keys.
collapsedids <- vapply(ids, paste, character(1), collapse=",")
# Look the merged ID up by key so it lines up with the rows of mkg, instead
# of assigning through data-frame row names.
mkg$newID <- unname(collapsedids[ukey])

# merge nearby TSSs
tssTol <- 500  # if distance b/w TSSs is less than this, they are merged
splitTol <- 250000  # a gap larger than this starts a new, independently clustered chunk

# Row indices of mkg, grouped by chromosome and strand.
lockey <- paste(mkg$chrom, mkg$strand, sep=":")
ind <- seq_len(nrow(mkg))
inds <- split(ind, lockey)

# For each chromosome/strand group, produce one label "<chunk>.<cluster>" per
# row, in the same order as the group's row indices u.
clustno <- lapply(inds, FUN=function(u) {
  # Chunking on diff() is only valid when positions are visited in
  # coordinate order. The rows are not guaranteed to be ordered by position
  # (position is txEnd for "-" strand), so sort first; otherwise two TSSs
  # closer than tssTol could fall into different chunks and never be merged.
  o <- order(mkg$position[u])
  us <- u[o]
  w <- diff(mkg$position[us]) > splitTol
  z <- cumsum(c(0,w))  # chunk number for every (sorted) row
  s <- split(us, z)
  cat(".")  # progress indicator, one dot per chromosome/strand group
  clusts <- lapply(s, FUN=function(uu) {
     if(length(uu)==1) return(1)
     # average-linkage clustering of the coordinates, cut at height tssTol
     d <- dist(mkg$position[uu])
     h <- hclust(d,"ave")
     cutree(h,h=tssTol)
  })
  labs <- paste( rep(names(s), vapply(s, length, integer(1))), unlist(clusts, use.names=FALSE), sep=".")
  # labs follows the sorted order; put the labels back in the order of u,
  # which is what unsplit() below relies on.
  out <- character(length(u))
  out[o] <- labs
  out
})

# One cluster key per row of mkg: chrom:strand:chunk.cluster
clustno <- unsplit( clustno, lockey )
clustkey <- paste( mkg$chrom, mkg$strand, clustno, sep=":" )

# Keep the first row of every cluster as its representative.
uckey <- unique(clustkey)
mc <- match(uckey, clustkey)
mkg1 <- mkg[mc,]

# Join the (already merged) IDs of all cluster members with ";".
rownames(mkg1) <- uckey
ids <- split(mkg$newID, clustkey)
collapsedids <- vapply(ids, paste, character(1), collapse=";")
mkg1$newID <- unname(collapsedids[uckey])

rownames(mkg1) <- NULL

# Annotation table, one row per merged TSS cluster: the representative
# transcript's coordinates and name, plus every merged ID in allIDs.
anno <- with(mkg1, data.frame(
  chr    = chrom,
  strand = strand,
  start  = txStart,
  end    = txEnd,
  name   = name,
  allIDs = newID
))

print(date())


##############################################################################
print("read in BAM files")
library(Rsamtools)
library(Repitools)
library(edgeR)
library(BSgenome.Hsapiens.UCSC.hg18)

print("some parameter settings")

# need to add a failed-chastity filter and a MAPQ filter
# Read only reference name, strand and position; skip unmapped and
# duplicate-flagged reads.
p <- ScanBamParam(what=c("rname", "strand", "pos"), flag=scanBamFlag(isUnmappedQuery=FALSE,isDuplicate=FALSE))

chrNames <- paste("chr", c(1:22, "X", "Y"), sep = "")

fragSize <- 200
readLen <- c(50,75)  # vector to match filenames
filenames <- c("/projects/remc_bigdata/Karsan/HS1238_kd/maq2sam/HS1238.h.sorted.bam",
               "/projects/remc_bigdata/Karsan/HS1235_ctl/maq2sam/HS1235.h.sorted.bam")
# One read length per BAM file; mapply() would silently recycle otherwise.
stopifnot(length(readLen) == length(filenames))

print("read in .bam files")
# u = BAM path, v = read length for that file.
# The range width must be the per-file value v; using the whole readLen
# vector here would recycle 50/75 across the reads of each file.
# SIMPLIFY=FALSE makes mapply() always return a plain list of GRanges.
gr <- mapply(FUN=function(u,v) {
  sb <- scanBam(u, param=p)[[1]]
  GRanges(seqnames=paste("chr",sb$rname,sep=""), ranges=IRanges(start=sb$pos,width=v), strand=sb$strand)
  },filenames,readLen, SIMPLIFY=FALSE)
names(gr) <- c("HS1238_kd","HS1235_ctl")
grl <- GRangesList(gr)
##############################################################################

print("TSS regions: get counts")
cat("bpUp=1500, bpDown=1000", "\n")
# Count reads from each sample in grl for every row of anno.
# NOTE(review): bpUp/bpDown presumably define the window upstream/downstream
# of each TSS and seqLen the fragment length reads are extended to -- confirm
# against the Repitools annotationCounts documentation.
counts <- annotationCounts(grl, anno, bpUp=1500, bpDown=1000, seqLen=fragSize, verbose=TRUE)
#cat("bpUp=2000, bpDown=1000", "\n")
#counts <- annotationCounts(grl, anno, bpUp=2000, bpDown=1000, seqLen=fragSize, verbose=TRUE)
#cat("bpUp=2500, bpDown=1500", "\n")
#counts <- annotationCounts(grl, anno, bpUp=2500, bpDown=1500, seqLen=fragSize, verbose=TRUE)

# Per-sample totals; the two commented lines below record the values seen in
# an earlier run.
colSums(counts)
#HS1238_kd HS1235_ctl 
#  3070716    3288149
# Keep only rows whose count summed over samples exceeds 10.
k <- rowSums(counts) > 10 
# Normalisation factors computed from the retained rows only.
f <- calcNormFactors(counts[k,])

##########################################################
#print("estimate common dispersion as 'dhack'")
#dhack <- d <- DGEList(counts=counts[k,], 
#	         group=colnames(counts), 
#             lib.size=colSums(counts[k,])*f,
#             genes=anno[k,1:3])
#             #genes=as.data.frame(windows)[k,1:3])
#dhack$samples$group <- factor(c("A","A"))
#dhack <- estimateCommonDisp(dhack)
#d <- estimateCommonDisp(d)  # in my example, no bio replicates, will get warning
#d$common.dispersion <- dhack$common.dispersion
#cat("TSSrgns: dhack$common.dispersion=",dhack$common.dispersion,"\n")
##########################################################
# Old way, without 'dhack'
# Build the DGEList from the filtered rows (one group per sample, library
# sizes scaled by the normalisation factors f, first three anno columns
# carried along as gene annotation) and estimate the common dispersion.
d <- estimateCommonDisp(
  DGEList(
    counts   = counts[k,],
    group    = colnames(counts),
    lib.size = colSums(counts[k,])*f,
    genes    = anno[k,1:3]
  )
)
cat("d$common.dispersion=",d$common.dispersion,"\n")
# END: Old way, without 'dhack'
##########################################################

#print("Write out a smear plot PDF")
#pdf("/projects/remc_bigdata/Karsan/R/20101208/DE-H4ac.knownGene-TSS-1.5to1kb.edgeR.20101208.smearPlot.pdf")
#plotSmear(d)
#grid()
#dev.off()

print("exactTest()")
#de <- exactTest(d,pair=c("HS1238_kd","HS1235_ctl"))
# Exact test between the two groups named in pair.
# NOTE(review): the order of pair presumably sets the direction of the
# reported fold change (first element as baseline) -- confirm in ?exactTest.
de <- exactTest(d,pair=c("HS1235_ctl","HS1238_kd"))
topTags(de)

# Add gene symbol (31 October 2010)
# MR: Most direct, you could just add an extra column to your table:
# Alternatively, when you call the DGEList() constructor, you can specify the
# 'genes' element to carry along the annotation information all the way to the exactTest().  See ?DGEList ...
# Result table: annotation for the retained rows (without column 5 of anno),
# their counts, the row names of the test table as "id", the test table
# itself, and Benjamini-Hochberg adjusted p-values as "adjp".
# NOTE(review): this reads the p-value column as de$table$p.value; verify the
# column name against the installed edgeR version.
xx <- cbind(anno[k,-5], counts[k,], id=rownames(de$table), de$table,adjp=p.adjust(de$table$p.value, method="BH") )
#xxtss <- cbind(anno[k,-5], counts[k,], detss$table,adjp=p.adjust(detss$table$p.value, method="BH") )

print("write.table, all records, regardless of p-val")
# Tab-separated, no row names, no quoting.
write.table(xx, "/projects/remc_bigdata/Karsan/R/20101213/HS1238_vs_1235.knownGeneTSS-1.5to1kb.edgeR.dispersion.all.20101213.txt", sep="\t", row.names=FALSE, quote=FALSE)
#write.table(xx, "/projects/remc_bigdata/Karsan/R/20101213/HS1238_vs_1235.knownGeneTSS-2to1kb.edgeR.dispersion.all.20101213.txt", sep="\t", row.names=FALSE, quote=FALSE)
#write.table(xx, "/projects/remc_bigdata/Karsan/R/20101208/HS1238_vs_1235.knownGeneTSS-2.5to1.5kb.edgeR.dispersion.all.20101208.txt", sep="\t", row.names=FALSE, quote=FALSE)

#print("write.table, records with p-val < 0.1")
#w <- xx$p.value < 0.1
#xxx <- xx[w,]
#print("write.table(), only records with pval<0.1")
#write.table(xxx, "/projects/remc_bigdata/Karsan/R/20101208/HS1238_vs_1235.knownGeneTSS-1.5to1kb.edgeR.dispersion.pval-LT-0.1.20101213.txt", sep="\t", row.names=FALSE, quote=FALSE)
#write.table(xxx, "/projects/remc_bigdata/Karsan/R/20101208/HS1238_vs_1235.knownGeneTSS-2to1kb.edgeR.dispersion.pval-LT-0.1.20101208a.txt", sep="\t", row.names=FALSE, quote=FALSE)
#write.table(xxx, "/projects/remc_bigdata/Karsan/R/20101208/HS1238_vs_1235.knownGeneTSS-2.5to1.5kb.edgeR.dispersion.pval-LT-0.1.20101208.txt", sep="\t", row.names=FALSE, quote=FALSE)

print("done.")