#!/usr/bin/env ruby

=begin
wtss_coverage_bias.rb
30 June 2010

INTENT
1. Read a gene table from UCSC.
2. Read in a 3-column pileup subset file that's been awk-ed from a SAMtools pileup (WTSS).
3. For each transcript on a chromosome, accumulate the pileup profile for the exons. Remember the transcript length.
4. Generate and write out normalized (0.0 to 1.0) profile for the transcript.
geneID  transcriptID nbrOfExons transcriptLen meanCoverage profileList...
5. Deal with +/- strand. Initially just use +-strand transcripts.
6. Read in the file and plot coverage quantile profiles.

May want to generate a union exon set...and work with that.

[grobertson@xhost09 MM0472_WTSS_liver_adult]$ head MM0472.merged_7_lanes_withJunctionsOnGenome_dupsFlagged.pileup
chrom   coord   seq     ht
1       3041046 A       1       ^8.     B
1       3041047 C       1       A       B
1       3041048 A       1       .       B
1       3041049 C       1       .       @
...

RefFlat ------------------------------------------------------------------------

Ube1y1	NM_011667	chrY_random	+	43172477	43197989	43174585	43197287	26	43172477,43174585,43174816,43174956,43175214,43175910,43176223,43176908,43179212,43179406,43179663,43179942,43180135,43180526,43182530,43182905,43185023,43185339,43186088,43186268,43187939,43195732,43195944,43196688,43196898,43197151,	43172579,43174702,43174872,43175125,43175349,43176017,43176314,43177041,43179310,43179553,43179840,43180047,43180216,43180682,43182696,43183102,43185088,43185535,43186163,43186461,43188028,43195825,43196136,43196790,43196999,43197989,
CREATE TABLE `refFlat` (
1  `geneName` varchar(255) NOT NULL default '',
2  `name` varchar(255) NOT NULL default '',
3  `chrom` varchar(255) NOT NULL default '',
4  `strand` char(1) NOT NULL default '',
5  `txStart` int(10) unsigned NOT NULL default '0',
6  `txEnd` int(10) unsigned NOT NULL default '0',
7  `cdsStart` int(10) unsigned NOT NULL default '0',
8  `cdsEnd` int(10) unsigned NOT NULL default '0',
9  `exonCount` int(10) unsigned NOT NULL default '0',
10  `exonStarts` longblob NOT NULL,
11 `exonEnds` longblob NOT NULL,
  KEY `chrom` (`chrom`,`txStart`),
  KEY `name` (`name`),
  KEY `geneName` (`geneName`(10))
) ENGINE=MyISAM DEFAULT CHARSET=latin1;


=end

#=====================
# But we should pre-select a chr's worth of data before loading a pileup file
tgt_chr =    "chr1"
tgt_strand = "+"
#=====================


#refflat = "/Users/grobertson/GENEREG/Data/UCSC/mm9/RefSeq/refFlat.20091107.txt"
refflat = "/projects/remc_bigdata/MORGEN/MM0472_WTSS_liver_adult/bias/refFlat.20091107.txt"
begin
   rf_rh = File.open(refflat)
   puts "Opened RefFlat file"
rescue
   raise "Could not open RefFlat at: #{refflat}"
end

#pileup_data_for_a_chr = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/WTSS/Coverage/test.1to4.100k_recs.pileup"
pileup_data_for_a_chr = "/projects/remc_bigdata/MORGEN/MM0472_WTSS_liver_adult/bias/MM0472.merged_7_lanes_withJunctionsOnGenome_dupsFlagged.1to4.chr10.pileup"
#pileup_data_for_a_chr = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/WTSS/MM0472/assembly2/abyss-1.0.13/pileup/MM0472.merged_7_lanes_withJunctionsOnGenome_dupsFlagged.1to4.chr10.pileup"
begin
   p_rh = File.open(pileup_data_for_a_chr)
   puts "Opened pileup file (cols 1 to 4 only)"
rescue
   raise "Could not open pileup_data_for_a_chr at: #{pileup_data_for_a_chr}"
end


exon_coords_aoa = Array.new

################################################################################
def read_refflat_data( rf_rh, tgt_chr, tgt_strand )

   transcript_ids_and_lens_h = Hash.new
   exon_start_coords_a = Array.new
   exon_end_coords_a =   Array.new
   exon_start_recs_hoa = Hash.new
   exon_end_recs_hoa =   Hash.new

   n_recs = 0
   n_tgt_recs = 0

   begin
      while line = rf_rh.readline.chomp

         if line =~ /^(\S+)\t(\S+)\t(chr\S+)\t([+|-])\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\S+)\t(\S+)/ then

            symbol =        $1
            transcript_id = $2
            chrom =         $3
            strand =        $4
            txStart =       $5.to_i
            txEnd =         $6.to_i
            cdsEnd =        $7.to_i
            cdsEnd =        $8.to_i
            exonCount =     $9.to_i
            exonStartList = $10
            exonEndList =   $11

            n_recs += 1; if (n_recs % 5000 == 0) then puts " #{n_recs}" end

            if (chrom == tgt_chr and strand == tgt_strand) then

               n_tgt_recs += 1

               exon_starts_a = exonStartList.split(",")
               exon_ends_a =   exonEndList.split(",")
               # Check!
               if exon_starts_a.length != exonCount or exon_ends_a.length != exonCount then raise "line 98: wrong exon count!" end

               (0..exonCount-1).each do |i|
                  exon_start_coords_a[i] = exon_starts_a[i].to_i
                  exon_end_coords_a[i] = exon_ends_a[i].to_i
               end

               # Total the exon lengths
               total_exon_len = 0
               (0..exonCount-1).each do |i|
                  this_exons_len = exon_end_coords_a[i] - exon_start_coords_a[i]
                  total_exon_len =+ this_exons_len
               end

               transcript_ids_and_lens_h[transcript_id] = total_exon_len
               #if transcript_ids_a.length == 0 then transcript_ids_a = [transcript_id]
               #else transcript_ids_a << transcript_id
               #end

               exon_start_recs_hoa[transcript_id] = [ symbol, total_exon_len, exonCount, exon_start_coords_a ]
               exon_end_recs_hoa[transcript_id] =   [ symbol, total_exon_len, exonCount, exon_end_coords_a ]

            end

         else  #Skip this record: wrong chr or strand

         end

      end
   rescue EOFError

      rf_rh.close

      #transcript_ids_and_lens_h.sort!

      puts " read #{n_recs} transcript records"
      puts " retained #{transcript_ids_and_lens_h.length} transcript IDs on #{tgt_chr}"

      return transcript_ids_and_lens_h, exon_start_recs_hoa, exon_end_recs_hoa

   end

end

################################################################################
=begin
Test file
1       3041046 A       1
1       3041047 C       1
1       3041048 A       1
1       3041049 C       1
1       3041050 A       1
1       3041051 C       1
1       3041052 A       1
1       3041053 C       1
1       3041054 A       1
1       3041055 C       1

We want a hash:
blah_h[3041046] = 1

=end
def get_pileup_data_for_a_chr( p_rh )

   pileup_coord_ht_h = Hash.new
   recs_read = 0

   begin

      while line = p_rh.readline.chomp

         if line =~ /^(\S+)\s+(\d+)\s+(\S)\s+(\d+)/ then

            chrom =  $1
            coord =  $2.to_i
            base =   $3
            height = $4.to_i

            pileup_coord_ht_h[coord] = height

            recs_read += 1; if (recs_read % 500000 == 0) then puts " #{recs_read}" end
         end
      end

   rescue EOFError

      p_rh.close
      puts " read #{recs_read} pileup records"
      return pileup_coord_ht_h
   end
end

################################################################################

###
puts "1. Read the transcript information from a UCSC table"
#----------------------------------------------------------------------------------------------------------
#transcript_ids_and_lens_h, exon_start_recs_hoa, exon_end_recs_hoa = read_refflat_data( rf_rh, tgt_chr, tgt_strand )
#----------------------------------------------------------------------------------------------------------

###
# We should create a hash in which the key is the coordinate and the value is the pileup height
puts "\n2. Read in the pileup file."
#------------------------------------------------------
pileup_coord_ht_h = get_pileup_data_for_a_chr( p_rh )
#------------------------------------------------------

###
puts "\n3.Process the transcript exon information and the pileup file"

transcript_ids_a = transcript_ids_and_lens_h.keys.sort

transcript_ids_a.each do |id|

   xy_aoa = Array.new

   start_recs_for_this_id_a = exon_start_recs_hoa[id]
   end_recs_for_this_id_a =   exon_end_recs_hoa[id]

   gene_symbol =             start_recs_for_this_id_a[0]
   nbr_exons_in_transcript = start_recs_for_this_id_a[1]

   exon_start_coords_for_an_id_a = start_recs_for_this_id_a[2]
   exon_end_coords_for_an_id_a =   end_recs_for_this_id_a[2]

   # For each exon, get the pileup profile
   # Then combine these into a single profile in absolute coords
   # Then combine them into a single profile in relative coords
   # Then convert into a 100-bin standard profile

=begin
1	4883498	G	121
1	4883499	A	121
1	4883500	A	121
1	4883501	A	121
1	4883502	T	121
1	4883503	G	121
1	4883504	G	115
1	4883505	C	110
1	4883506	T	106
1	4883507	A	101
=end
   exon_start = 4883498
   exon_end =   4883507

   (exon_start..exon_end).each do |coord|
      ht = pileup_coord_ht_h[coord].to_i
      xy_aoa << [coord,ht]
   end

   puts "loaded xy... for #{id}"

end