Before exploring the expression data, I want to obtain a list of mutation features that help predict COO. Here, I define mutation features as being a mutated hotspot or gene. This information will help evaluate downstream clustering solutions.
Two methods will be compared below. Both will be using the Lymphp2Cx COO labels as an approximation of ground truth. The first approach will use Fisher’s exact test to identify mutation features that are significantly associated with COO. The second approach will leverage a random forest model to rank the importance of mutation features.
Before ranking mutation features, we need to identify mutation hotspots, because some are known to be associated with COO (e.g. MYD88 and EZH2 hotspot mutations). For this, we are identifying recurrently altered codons, i.e. those that are present in at least 5% of patients.
hotspots <-
muts %>%
dplyr::group_by(gene, codon) %>%
dplyr::mutate(num_affected = dplyr::n_distinct(patient),
hotspot = num_affected > length(patients_rnaseq) * min_recur,
hotspot = ifelse(type != "missense", FALSE, hotspot),
feature = ifelse(hotspot, paste0(gene, "_", codon), paste0(gene, "_other"))) %>%
dplyr::ungroup()
mut_features <-
hotspots %>%
dplyr::select(patient, feature) %>%
dplyr::distinct() %>%
tidyr::spread(patient, patient) %>%
tidyr::gather(patient, status, -feature) %>%
dplyr::mutate(mutated = !is.na(status),
status = NULL) %>%
dplyr::left_join(coo, by = "patient")
heatmap_mut_annot <-
coo_annot %>%
tibble::rownames_to_column("patient") %>%
dplyr::filter(coo_ns %in% c("ABC", "GCB"),
patient %in% mut_features$patient) %>%
tibble::column_to_rownames("patient")
mut_feat_df <-
mut_features %>%
dplyr::mutate(mutated = ifelse(mutated, 1, 0)) %>%
tidyr::spread(feature, mutated)
mut_feat_matrix_all <-
mut_feat_df %>%
dplyr::select(-coo_ns) %>%
as.data.frame() %>%
set_rownames(c()) %>%
tibble::column_to_rownames("patient") %>%
as.matrix() %>%
t() %>%
extract(, order(coo %>% dplyr::filter(patient %in% mut_features$patient) %$% coo_ns))
mut_feat_matrix <-
mut_feat_matrix_all %>%
extract(, colnames(.) %in% rownames(heatmap_mut_annot))
hotspots %>%
dplyr::filter(hotspot) %>%
dplyr::select(gene, codon, num_affected) %>%
dplyr::distinct() %>%
dplyr::arrange(desc(num_affected)) %>%
knitr::kable(caption = "Recurrently altered gene codons (hotspots)")
gene | codon | num_affected |
---|---|---|
EZH2 | 646 | 40 |
BCL10 | 213 | 38 |
MYD88 | 273 | 37 |
MEF2B | 83 | 20 |
CD79B | 197 | 20 |
KMT2C | 1685 | 18 |
A simple approach to ranking mutation features is to statistically test for an association between a feature and COO status. Because this is count data, the Fisher’s exact test can be used. After performing BH multiple test correction, we filter for features with a minimum q-value of 10%.
mut_fisher <-
mut_features %>%
dplyr::filter(coo_ns %in% c("ABC", "GCB")) %>%
dplyr::group_by(feature) %>%
tidyr::nest() %>%
dplyr::mutate(fisher = purrr::map(data, ~fisher.test(.$mutated, .$coo_ns)),
p_val = purrr::map_dbl(fisher, "p.value"),
q_val = p.adjust(p_val, method = p_adjust_method)) %>%
dplyr::select(-data, -fisher) %>%
dplyr::arrange(q_val) %>%
dplyr::filter(q_val < qval_cutoff)
knitr::kable(mut_fisher)
feature | p_val | q_val |
---|---|---|
BCL2_other | 0.0000000 | 0.0000000 |
TNFRSF14_other | 0.0000000 | 0.0000000 |
GNA13_other | 0.0000000 | 0.0000002 |
MYD88_273 | 0.0000000 | 0.0000004 |
SOCS1_other | 0.0000018 | 0.0000274 |
CD79B_197 | 0.0000027 | 0.0000347 |
EZH2_646 | 0.0000061 | 0.0000684 |
CREBBP_other | 0.0000091 | 0.0000884 |
P2RY8_other | 0.0000454 | 0.0003936 |
MPEG1_other | 0.0000764 | 0.0005956 |
CD79B_other | 0.0001144 | 0.0007437 |
TMSB4X_other | 0.0001068 | 0.0007437 |
PIM1_other | 0.0001265 | 0.0007590 |
MEF2B_83 | 0.0002507 | 0.0013968 |
DDX3X_other | 0.0030408 | 0.0154782 |
IRF8_other | 0.0031750 | 0.0154782 |
ARID1A_other | 0.0044460 | 0.0203992 |
IL4R_other | 0.0087501 | 0.0359213 |
IRF4_other | 0.0085899 | 0.0359213 |
BCL10_other | 0.0125765 | 0.0490482 |
SGK1_other | 0.0160632 | 0.0596632 |
HIST1H1E_other | 0.0207389 | 0.0729165 |
S1PR2_other | 0.0224359 | 0.0729165 |
SPEN_other | 0.0217264 | 0.0729165 |
TBL1XR1_other | 0.0234801 | 0.0732581 |
heatmap_mut_fisher <-
mut_feat_matrix %>%
extract(mut_fisher$feature, ) %>%
pheatmap::pheatmap(cluster_rows = FALSE, cluster_cols = FALSE, show_colnames = FALSE,
color = colours$mutated, legend = FALSE,
annotation_col = heatmap_mut_annot,
annotation_colors = colours,
gaps_col = sum(heatmap_mut_annot$coo_ns == "ABC"))
Mutation features can also be ranked using machine learning. Here, we are training a random forest model to predict COO from the mutation features. A side-product from training this model is the feature importance, which we can use as a ranking.
# Remove any features with near-zero variance
nz_feats <- caret::nearZeroVar(mut_feat_df)
abc_gcb_rows <- mut_feat_df$coo_ns %in% c("ABC", "GCB")
mut_rf_patients <- mut_feat_df[abc_gcb_rows, 1]
mut_feat_df_no_nz <- mut_feat_df[abc_gcb_rows, -c(1, nz_feats)]
mut_ctrl <- caret::trainControl(method = "cv", number = 10)
mut_model <-
caret::train(coo_ns ~ ., mut_feat_df_no_nz, method = "rf",
preProcess = "scale", trControl = mut_ctrl, tuneLength = 10)
mut_model
Random Forest
277 samples
63 predictor
2 classes: 'ABC', 'GCB'
Pre-processing: scaled (63)
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 250, 249, 250, 249, 250, 249, ...
Resampling results across tuning parameters:
mtry Accuracy Kappa
2 0.8089947 0.5438280
8 0.8271164 0.6110994
15 0.8234127 0.6019521
22 0.8162698 0.5854169
29 0.8161376 0.5873333
35 0.8052910 0.5618670
42 0.8052910 0.5618670
49 0.8089947 0.5751352
56 0.8089947 0.5726198
63 0.8125661 0.5821159
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 8.
mut_imp <- caret::varImp(mut_model, scale = FALSE)
mut_rf <-
mut_imp$importance %>%
tibble::rownames_to_column("feature") %>%
dplyr::rename(importance = Overall) %>%
dplyr::arrange(desc(importance)) %>%
head(nrow(mut_fisher))
knitr::kable(mut_rf)
feature | importance |
---|---|
BCL2_other | 10.227513 |
TNFRSF14_other | 7.529022 |
MYD88_273 | 6.400298 |
GNA13_other | 5.593403 |
SOCS1_other | 5.329206 |
CREBBP_other | 4.784726 |
P2RY8_other | 3.715943 |
CD79B_197 | 3.618199 |
TMSB4X_other | 3.443138 |
EZH2_646 | 2.811936 |
PIM1_other | 2.639072 |
MPEG1_other | 2.485901 |
HIST1H1E_other | 2.459930 |
CD79B_other | 2.111793 |
BCL10_213 | 1.951250 |
KMT2D_other | 1.917125 |
SGK1_other | 1.850612 |
ARID1A_other | 1.816322 |
IRF4_other | 1.801406 |
BCL10_other | 1.713638 |
SPEN_other | 1.660783 |
BIRC6_other | 1.622855 |
TP53_other | 1.622377 |
IL4R_other | 1.571859 |
ADAMTS12_other | 1.539695 |
heatmap_mut_rf <-
mut_feat_matrix %>%
extract(mut_rf$feature, ) %>%
pheatmap::pheatmap(cluster_rows = FALSE, cluster_cols = FALSE, show_colnames = FALSE,
color = colours$mutated, legend = FALSE,
annotation_col = heatmap_mut_annot,
annotation_colors = colours,
gaps_col = sum(heatmap_mut_annot$coo_ns == "ABC"))
Unsurprisingly, both approaches share several highly ranked features. A hybrid approach here considers the overlap between both methods, which should select the most informative genes.
When I consider the overlap, there are several genes that visually do not seem to discriminate well between ABC and GCB cases. One issue is that we are using the Lymph2Cx labels, which we guess are not perfect. Therefore, some cases might be on the right side of the heatmap, so to speak.
mut_hybrid_df <-
mut_feat_df %>%
tidyr::gather(feature, mutated, -patient, -coo_ns) %>%
dplyr::filter(feature %in% intersect(mut_fisher$feature, mut_rf$feature)) %>%
dplyr::group_by(feature, coo_ns) %>%
dplyr::summarise(num_mutated = sum(mutated)) %>%
tidyr::spread(coo_ns, num_mutated) %>%
dplyr::mutate(ratio = ABC / GCB) %>%
dplyr::arrange(ratio)
mut_hybrid_all <- mut_hybrid_df$feature
heatmap_mut_hybrid <-
mut_feat_matrix %>%
extract(mut_hybrid_all, ) %>%
pheatmap::pheatmap(cluster_rows = FALSE, cluster_cols = FALSE, show_colnames = FALSE,
color = colours$mutated, legend = FALSE,
annotation_col = heatmap_mut_annot,
annotation_colors = colours,
gaps_col = sum(heatmap_mut_annot$coo_ns == "ABC"))
Here, I’m only considering genes with the most extreme ratio of ABC-mutated cases versus GCB-mutated cases. This set of more amenable to simple counting of the number of ABC/GCB-associated mutation features.
mut_hybrid_df_top <-
mut_hybrid_df %>%
dplyr::filter(ratio < 0.1 | ratio > 2)
heatmap_mut_hybrid_top <-
mut_feat_matrix %>%
extract(mut_hybrid_df_top$feature, ) %>%
pheatmap::pheatmap(cluster_rows = FALSE, cluster_cols = FALSE, show_colnames = FALSE,
color = colours$mutated, legend = FALSE,
annotation_col = heatmap_mut_annot,
annotation_colors = colours,
gaps_col = sum(heatmap_mut_annot$coo_ns == "ABC"))
gcb_features <- mut_hybrid_df_top %>% dplyr::filter(ratio < 1) %$% feature
abc_features <- mut_hybrid_df_top %>% dplyr::filter(ratio > 1) %$% feature
discretize <- function(x) {
dplyr::recode(x, `0` = "0", `1` = "1", `2` = "2", .default = "3+")
}
num_feats <-
mut_features %>%
dplyr::filter(mutated) %>%
dplyr::group_by(patient, coo_ns) %>%
dplyr::summarise(gcb_num_feats = discretize(sum(feature %in% gcb_features)),
abc_num_feats = discretize(sum(feature %in% abc_features)))
heatmap_mut_hybrid_num_feats_annot <-
num_feats %>%
as.data.frame() %>%
tibble::column_to_rownames("patient") %>%
dplyr::ungroup()
barplot_mut_hybrid_num_feats <-
heatmap_mut_hybrid_num_feats_annot %>%
dplyr::select(-coo_ns,
`Number of ABC features` = abc_num_feats,
`Number of GCB features` = gcb_num_feats) %>%
tidyr::gather(type, count) %>%
dplyr::count(type, count) %>%
ggplot(aes(x = count, y = n)) +
geom_bar(stat = "identity") +
facet_grid(~type) +
labs(title = "Number of patients per mutation feature count",
x = "Number of ABC/GCB features", y = "Number of patients")
barplot_mut_hybrid_num_feats
# discretize_probs <- function(x) {
# cut(x, breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1),
# labels = c("0-20%", "20-40%", "40-60%", "60-80%", "80-100%")) %>%
# as.character()
# }
discretize_diff <- function(x) {
cut(x, breaks = c(-1.1, -0.6, -0.2, 0.2, 0.6, 1.1),
labels = c("GCB 60-100%", "GCB 20-60%", "Ambiguous", "ABC 20-60%", "ABC 60-100%")) %>%
as.character()
}
# mut_rf_prob_df <-
# predict(mut_model, mut_feat_df_no_nz, "prob") %>%
# dplyr::transmute(abc_rf_prob = discretize_probs(ABC),
# gcb_rf_prob = discretize_probs(GCB)) %>%
# dplyr::bind_cols(mut_rf_patients)
mut_rf_prob_df <-
predict(mut_model, mut_feat_df, "prob") %>%
dplyr::transmute(abc_gcb_diff = discretize_diff(ABC - GCB)) %>%
dplyr::bind_cols(mut_feat_df[,"patient"])
heatmap_mut_rf_prob_annot <-
heatmap_mut_hybrid_num_feats_annot %>%
tibble::rownames_to_column("patient") %>%
dplyr::left_join(mut_rf_prob_df) %>%
tibble::column_to_rownames("patient") %>%
extract(, c("coo_ns", "gcb_num_feats", "abc_num_feats", "abc_gcb_diff"))
barplot_mut_rf_prob_data <-
heatmap_mut_rf_prob_annot %>%
dplyr::select(-dplyr::ends_with("_num_feats"),
`P(ABC) - P(GCB)` = abc_gcb_diff) %>%
tidyr::gather(type, prob, -coo_ns) %>%
dplyr::count(coo_ns, type, prob) %>%
dplyr::mutate(prob = factor(prob, names(colours$abc_gcb_diff)[1:5]))
barplot_mut_rf_prob <-
barplot_mut_rf_prob_data %>%
ggplot(aes(x = prob, y = n, fill = coo_ns)) +
geom_bar(stat = "identity") +
scale_x_discrete(drop = FALSE) +
scale_fill_manual(values = colours$coo_ns) +
place_legend(c(1, 1)) +
labs(title = "Number of patients per RF probability range",
x = "P(ABC) - P(GCB)", y = "Number of patients", fill = "Lymph2Cx COO")
barplot_mut_rf_prob
patient_no_mut <-
mut_feat_df %>%
dplyr::transmute(patient, total = rowSums(.[, -(1:2)])) %>%
dplyr::filter(total < 3) %$%
patient
heatmap_mut_rf_prob_annot_no_mut <-
heatmap_mut_rf_prob_annot %>%
tibble::rownames_to_column("patient") %>%
dplyr::filter(patient %in% patient_no_mut) %>%
dplyr::select(-dplyr::ends_with("_num_feats"),
`P(ABC) - P(GCB)` = abc_gcb_diff,
-patient) %>%
tidyr::gather(type, prob, -coo_ns) %>%
dplyr::count(coo_ns, type, prob) %>%
dplyr::mutate(prob = factor(prob, names(colours$abc_gcb_diff)[1:5]))
barplot_mut_rf_prob_no_mut <-
barplot_mut_rf_prob %+%
heatmap_mut_rf_prob_annot_no_mut +
labs(title = paste0(barplot_mut_rf_prob$labels$title, "\n(Patients with <3 mutations)")) +
place_legend(c(0, 1))
barplot_mut_rf_prob_no_mut
n_patients <- ncol(mut_feat_matrix_all)
lineplot_mut_rf_prob <-
predict(mut_model, mut_feat_df, "prob") %>%
dplyr::bind_cols(mut_feat_df[,c("patient", "coo_ns")]) %>%
tibble::rownames_to_column("id") %>%
dplyr::ungroup() %>%
dplyr::mutate(id = forcats::fct_reorder(id, ABC),
prob = GCB) %>%
ggplot(aes(x = id, y = prob, colour = coo)) +
annotate("rect", xmin = 0, xmax = n_patients, ymin = 0, ymax = 0.5,
fill = colours$coo["ABC"], alpha = 0.1) +
annotate("rect", xmin = 0, xmax = n_patients, ymin = 0.5, ymax = 1,
fill = colours$coo["GCB"], alpha = 0.1) +
geom_point(aes(colour = coo_ns), size = 0.5) +
geom_hline(yintercept = 0.5) +
scale_color_manual(values = colours$coo) +
place_legend(c(0, 0)) +
scale_x_discrete(labels = NULL) +
scale_y_continuous(limits = c(0, 1), expand = c(0, 0)) +
theme(axis.ticks.x = element_blank()) +
labs(title = "Random forest probabilities for GCB",
x = "Patient", y = "Probability for GCB", colour = "Lymph2Cx COO")
lineplot_mut_rf_prob