#!/usr/bin/env ruby =begin wtss_coverage_bias.rb 30 June 2010 INTENT 1. Read a gene table from UCSC. 2. Read in a 3-column pileup subset file that's been awk-ed from a SAMtools pileup (WTSS). 3. For each transcript on a chromosome, accumulate the pileup profile for the exons. Remember the transcript length. 4. Generate and write out normalized (0.0 to 1.0) profile for the transcript. geneID transcriptID nbrOfExons transcriptLen meanCoverage profileList... 5. Deal with +/- strand. Initially just use +-strand transcripts. 6. Read in the file and plot coverage quantile profiles. May want to generate a union exon set...and work with that. [grobertson@xhost09 MM0472_WTSS_liver_adult]$ head MM0472.merged_7_lanes_withJunctionsOnGenome_dupsFlagged.pileup chrom coord seq ht 1 3041046 A 1 ^8. B 1 3041047 C 1 A B 1 3041048 A 1 . B 1 3041049 C 1 . @ ... RefFlat ------------------------------------------------------------------------ Ube1y1 NM_011667 chrY_random + 43172477 43197989 43174585 43197287 26 43172477,43174585,43174816,43174956,43175214,43175910,43176223,43176908,43179212,43179406,43179663,43179942,43180135,43180526,43182530,43182905,43185023,43185339,43186088,43186268,43187939,43195732,43195944,43196688,43196898,43197151, 43172579,43174702,43174872,43175125,43175349,43176017,43176314,43177041,43179310,43179553,43179840,43180047,43180216,43180682,43182696,43183102,43185088,43185535,43186163,43186461,43188028,43195825,43196136,43196790,43196999,43197989, CREATE TABLE `refFlat` ( 1 `geneName` varchar(255) NOT NULL default '', 2 `name` varchar(255) NOT NULL default '', 3 `chrom` varchar(255) NOT NULL default '', 4 `strand` char(1) NOT NULL default '', 5 `txStart` int(10) unsigned NOT NULL default '0', 6 `txEnd` int(10) unsigned NOT NULL default '0', 7 `cdsStart` int(10) unsigned NOT NULL default '0', 8 `cdsEnd` int(10) unsigned NOT NULL default '0', 9 `exonCount` int(10) unsigned NOT NULL default '0', 10 `exonStarts` longblob NOT NULL, 11 `exonEnds` longblob NOT NULL, KEY `chrom` (`chrom`,`txStart`), KEY `name` (`name`), KEY `geneName` (`geneName`(10)) ) ENGINE=MyISAM DEFAULT CHARSET=latin1; =end #===================== # But we should pre-select a chr's worth of data before loading a pileup file tgt_chr = "chr1" tgt_strand = "+" #===================== #refflat = "/Users/grobertson/GENEREG/Data/UCSC/mm9/RefSeq/refFlat.20091107.txt" refflat = "/projects/remc_bigdata/MORGEN/MM0472_WTSS_liver_adult/bias/refFlat.20091107.txt" begin rf_rh = File.open(refflat) puts "Opened RefFlat file" rescue raise "Could not open RefFlat at: #{refflat}" end #pileup_data_for_a_chr = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/WTSS/Coverage/test.1to4.100k_recs.pileup" pileup_data_for_a_chr = "/projects/remc_bigdata/MORGEN/MM0472_WTSS_liver_adult/bias/MM0472.merged_7_lanes_withJunctionsOnGenome_dupsFlagged.1to4.chr10.pileup" #pileup_data_for_a_chr = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/WTSS/MM0472/assembly2/abyss-1.0.13/pileup/MM0472.merged_7_lanes_withJunctionsOnGenome_dupsFlagged.1to4.chr10.pileup" begin p_rh = File.open(pileup_data_for_a_chr) puts "Opened pileup file (cols 1 to 4 only)" rescue raise "Could not open pileup_data_for_a_chr at: #{pileup_data_for_a_chr}" end exon_coords_aoa = Array.new ################################################################################ def read_refflat_data( rf_rh, tgt_chr, tgt_strand ) transcript_ids_and_lens_h = Hash.new exon_start_coords_a = Array.new exon_end_coords_a = Array.new exon_start_recs_hoa = Hash.new exon_end_recs_hoa = Hash.new n_recs = 0 n_tgt_recs = 0 begin while line = rf_rh.readline.chomp if line =~ /^(\S+)\t(\S+)\t(chr\S+)\t([+|-])\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\S+)\t(\S+)/ then symbol = $1 transcript_id = $2 chrom = $3 strand = $4 txStart = $5.to_i txEnd = $6.to_i cdsEnd = $7.to_i cdsEnd = $8.to_i exonCount = $9.to_i exonStartList = $10 exonEndList = $11 n_recs += 1; if (n_recs % 5000 == 0) then puts " #{n_recs}" end if (chrom == tgt_chr and strand == tgt_strand) then n_tgt_recs += 1 exon_starts_a = exonStartList.split(",") exon_ends_a = exonEndList.split(",") # Check! if exon_starts_a.length != exonCount or exon_ends_a.length != exonCount then raise "line 98: wrong exon count!" end (0..exonCount-1).each do |i| exon_start_coords_a[i] = exon_starts_a[i].to_i exon_end_coords_a[i] = exon_ends_a[i].to_i end # Total the exon lengths total_exon_len = 0 (0..exonCount-1).each do |i| this_exons_len = exon_end_coords_a[i] - exon_start_coords_a[i] total_exon_len =+ this_exons_len end transcript_ids_and_lens_h[transcript_id] = total_exon_len #if transcript_ids_a.length == 0 then transcript_ids_a = [transcript_id] #else transcript_ids_a << transcript_id #end exon_start_recs_hoa[transcript_id] = [ symbol, total_exon_len, exonCount, exon_start_coords_a ] exon_end_recs_hoa[transcript_id] = [ symbol, total_exon_len, exonCount, exon_end_coords_a ] end else #Skip this record: wrong chr or strand end end rescue EOFError rf_rh.close #transcript_ids_and_lens_h.sort! puts " read #{n_recs} transcript records" puts " retained #{transcript_ids_and_lens_h.length} transcript IDs on #{tgt_chr}" return transcript_ids_and_lens_h, exon_start_recs_hoa, exon_end_recs_hoa end end ################################################################################ =begin Test file 1 3041046 A 1 1 3041047 C 1 1 3041048 A 1 1 3041049 C 1 1 3041050 A 1 1 3041051 C 1 1 3041052 A 1 1 3041053 C 1 1 3041054 A 1 1 3041055 C 1 We want a hash: blah_h[3041046] = 1 =end def get_pileup_data_for_a_chr( p_rh ) pileup_coord_ht_h = Hash.new recs_read = 0 begin while line = p_rh.readline.chomp if line =~ /^(\S+)\s+(\d+)\s+(\S)\s+(\d+)/ then chrom = $1 coord = $2.to_i base = $3 height = $4.to_i pileup_coord_ht_h[coord] = height recs_read += 1; if (recs_read % 500000 == 0) then puts " #{recs_read}" end end end rescue EOFError p_rh.close puts " read #{recs_read} pileup records" return pileup_coord_ht_h end end ################################################################################ ### puts "1. Read the transcript information from a UCSC table" #---------------------------------------------------------------------------------------------------------- #transcript_ids_and_lens_h, exon_start_recs_hoa, exon_end_recs_hoa = read_refflat_data( rf_rh, tgt_chr, tgt_strand ) #---------------------------------------------------------------------------------------------------------- ### # We should create a hash in which the key is the coordinate and the value is the pileup height puts "\n2. Read in the pileup file." #------------------------------------------------------ pileup_coord_ht_h = get_pileup_data_for_a_chr( p_rh ) #------------------------------------------------------ ### puts "\n3.Process the transcript exon information and the pileup file" transcript_ids_a = transcript_ids_and_lens_h.keys.sort transcript_ids_a.each do |id| xy_aoa = Array.new start_recs_for_this_id_a = exon_start_recs_hoa[id] end_recs_for_this_id_a = exon_end_recs_hoa[id] gene_symbol = start_recs_for_this_id_a[0] nbr_exons_in_transcript = start_recs_for_this_id_a[1] exon_start_coords_for_an_id_a = start_recs_for_this_id_a[2] exon_end_coords_for_an_id_a = end_recs_for_this_id_a[2] # For each exon, get the pileup profile # Then combine these into a single profile in absolute coords # Then combine them into a single profile in relative coords # Then convert into a 100-bin standard profile =begin 1 4883498 G 121 1 4883499 A 121 1 4883500 A 121 1 4883501 A 121 1 4883502 T 121 1 4883503 G 121 1 4883504 G 115 1 4883505 C 110 1 4883506 T 106 1 4883507 A 101 =end exon_start = 4883498 exon_end = 4883507 (exon_start..exon_end).each do |coord| ht = pileup_coord_ht_h[coord].to_i xy_aoa << [coord,ht] end puts "loaded xy... for #{id}" end