renv::init()
## * Using Bioconductor version '3.15'.
renv::restore()
## * The library is already synchronized with the lockfile.
knitr::opts_chunk$set(echo=TRUE)
knitr::opts_chunk$set(warning=FALSE, message=FALSE)
# Load libraries
library(tidyverse)
library(reshape2)
library(stringr)
library(SingleCellExperiment)
library(zellkonverter)
library(DT)
library(cowplot)
library(ggpattern)
library(ggplot2)
library(ggsci)
library(gridGraphics)
library(circlize)
library(ComplexHeatmap)
library(cytomapper)
# Plot setup
# Set fontsizes used throughout this script
title.fontsize <- 10
axis_title.fontsize <- 8
# Specify which patters are available for plotting
possible_patterns <- c("none", "pch", "stripe", "circle", "crosshatch")
# Set general input paths to all analysis files
analysis.path <- snakemake@params[["out_path"]]
# Set path to masks
masks.path <- snakemake@params[["masks_path"]]
# Specify names of parameters used to do the parameter sweep
parameters <- c("k", "d", "m")
# Specify image used for testing
test.img <- snakemake@params[["test_names"]]
## General helper fnc.
# Function to read in any of the outputs of CISI training
# file: path to file to read in
# type: res (results.txt files with correlations), x (X_*.h5ad files with anndata
# object), a (version_*.txt with A/Phi experiment design matrix) or u (gene_modules.csv
# containing the learned dictionary)
read_results <- function(file, type){
# Make sure file is not empty
if (file.size(file)!=0L){
# Read out parameter types of parameter sweep and the values used to create
# particular output 'file'
params <- gsub("/[^/]+$", "", gsub(".*parameter_sweep/", "", file))
params <- str_split(str_split(params, "/")[[1]], "_")
# Define what type of output of CISI is read in
if (type=="res"){
# If type="res" file containing correlation results is read in
# Column gets added to show if results comes from noisy or no noise simulations
res <- read_tsv(file, show_col_types=FALSE) %>%
mutate(simulation=ifelse((grepl("composite", file) | grepl("no_noise", file)),
"no_noise", "noisy"),
folder=gsub("/[^/]+$", "", file)) %>%
relocate("Gene average", .before=1)
} else if (type=="x") {
# If type="x", then anndata object gets read into SCE containing either
# ground truth or simulated/decomposed expressions (normalized and subsetted
# according to what was specified/used in the CISI run)
res <- readH5AD(file)
metadata(res) <- list(ground_truth=ifelse(grepl("X_test", file),
"ground_truth",
"simulated"))
} else if (type=="a"){
# If type="a" file containing experiment matrix is read in specifying
# which proteins are measured in the same channel
res <- read.table(file, row.names=1, sep="\t") %>%
dplyr::rename_all(~ (stringr::str_replace_all( ., "\\.", "-" ))) %>%
rownames_to_column(var="channel")
} else if (type=="u"){
# If type="u" file containing dictionary is read in
res <- read.csv(file) %>%
dplyr::rename(protein=X) %>%
dplyr::rename_with(~gsub("X", "", .x, fixed=TRUE)) %>%
melt(id="protein") %>%
dplyr::rename(module=variable, membership=value)
}
# Add information about which type of parameters were evaluated in the parameter
# sweep and what the values were for particular file
for (p in params){
if (type=="x"){
metadata(res)[[p[1]]] <- p[2]
} else {
res <- res %>%
mutate(!!as.symbol(p[1]):=p[2], .before=1)
}
}
res
}
}
## Plot fnc.
# Plot results from CISI for data: grouped by group on x-axis, cor is used as
# y-value, fill variable is used for coloring and pattern for the patterns in the
# barplot
plot_results <- function(df, cor, group, fill, pattern){
# Create barplot
barplot <- ggplot(df, aes(x=!!sym(group), y=!!sym(cor),
fill=!!sym(fill), pattern=!!sym(pattern))) +
geom_bar_pattern(stat="identity",
position=position_dodge(preserve="single"),
width=0.6,
color="black",
pattern_fill="black",
pattern_angle=45,
pattern_density=0.3,
pattern_spacing=0.01,
pattern_key_scale_factor=0.6) +
scale_y_continuous(limits=c(0, 1)) +
scale_fill_npg() +
scale_pattern_manual(values=possible_patterns[1:length(unique(df[[pattern]]))],
labels=levels(df[[pattern]])) +
labs(x=group, y=cor, pattern=pattern) +
guides(pattern=guide_legend(override.aes=list(fill="white")),
fill=guide_legend(override.aes=list(pattern="none"))) +
theme_cowplot(title.fontsize)
barplot
}
# Plot results from CISI for data: line plots of results for param in a grid
# for all other parameters (which are fixed then)
plot_results_grid <- function(df, param){
# Create grid of lineplots
lineplot <- ggplot(df, aes(!!sym(param), `Gene average`)) +
facet_grid(vars(!!sym(parameters[parameters!=param][1])), vars(!!sym(parameters[parameters!=param][2])),
labeller=label_both) +
theme_cowplot(title.fontsize) +
geom_point(colour=pal_npg("nrc")("1")) +
panel_border()
lineplot
}
# Plot barplot of correlations per protein
plot_protein_cor <- function(X.cor){
# Create barplot with proteins on y-axis and their correlations on the y-axis
protein.plot <- ggplot(X.cor %>%
dplyr::select(protein, correlation) %>%
ungroup() %>%
distinct(),
aes(x=reorder(protein, correlation), y=correlation, fill=protein)) +
geom_bar(stat="identity") +
theme_cowplot(title.fontsize) +
scale_fill_igv() +
guides(fill=FALSE) +
xlab("protein") +
coord_flip()
protein.plot
}
# Plot results of expression values in "layer" of CISI vs ground truth for
# protein of interest poi and segmented according to masks.list
plot_cells <- function(sce.list, masks.list, poi, layer="exprs"){
# Set colours for poi
colour.cells <- list(el=c("#FFFFFF", pal_npg("nrc")("8")[8]))
names(colour.cells) <- poi
# Since all the images and legends are returned individually, we have to
# subset and reorder them such that the simulated and ground truth are next to
# each other and only one legend is shown
img.list <- list()
idx <- c()
for (el in sce.list){
# Select if normal counts or transformed counts should be plotted
if (layer=="none"){
p <- plotCells(mask=masks.list, object=el,
cell_id="ObjectNumber", img_id="sample_id", colour_by=poi,
return_plot=TRUE, image_title=list(cex=1),
colour=colour.cells, display="single",
scale_bar=list(cex=1, lwidth=5),
legend=list(colour_by.title.cex=fontsize(title.fontsize),
colour_by.labels.cex=fontsize(axis_title.fontsize)))
} else {
p <- plotCells(mask=masks.list, object=el,
cell_id="ObjectNumber", img_id="sample_id", colour_by=poi,
return_plot=TRUE, image_title=list(cex=1),
colour=colour.cells, display="single",
scale_bar=list(cex=1, lwidth=5),
legend=list(colour_by.title.cex=fontsize(title.fontsize),
colour_by.labels.cex=fontsize(axis_title.fontsize)),
exprs_values=layer)
}
# Add plot to image list
p.gg <- lapply(p$plot, function(x){ggdraw(x, clip="on")})
img.list <- append(img.list, p.gg) %>%
map(~ .x + theme(plot.margin=margin(l=0, r=0, t=0.01, b=0.01, unit="cm")))
idx <- c(idx, seq_along(p.gg))
}
# Reorder image list
names(img.list) <- make.unique(names(img.list))
idx <- order(idx)
img.list[idx]
}
# Plot heatmap of U with specified "title"
plot_U <- function(u.temp, title, thresh=0.9){
# Pivot dataframe to long format
u.temp <- u.temp %>%
dplyr::select(-parameters) %>%
pivot_wider(names_from=module, values_from=membership) %>%
column_to_rownames(var="protein") %>%
as.matrix()
# Compute module membership for all proteins
u.membership <- apply(u.temp, 2, function(u.col){
cs <- cumsum(-sort(-u.col^2))
cs <- cs / cs[length(cs)]
cs <- ifelse(cs<=thresh, 1, 0)
cs[which(cs==0)[1]] <- 1
cs <- cs[order(match(names(cs), names(u.col)))]
cs
})
# Create heatmap of module membership
u.heatmap <- Heatmap(u.membership,
column_title=title,
col=structure(c("grey", pal_npg("nrc")("1")[1]),
names=c("0", "1")),
show_heatmap_legend=FALSE,
show_row_dend=FALSE,
row_names_gp=gpar(fontsize=axis_title.fontsize),
column_names_gp=gpar(fontsize=axis_title.fontsize),
column_title_gp=gpar(fontsize=title.fontsize),
rect_gp=gpar(col="white", lwd=1))
u.heatmap
}
# Plot heatmap of A with specified "title"
plot_A <- function(a, title){
# Transform into matrix with channels as rownames
a.matrix <- a %>%
dplyr::select(-parameters) %>%
column_to_rownames("channel") %>%
as.matrix()
a.heatmap <- Heatmap(a.matrix,
column_title=title,
col=colorRamp2(c(0, 1), colors=c("grey", pal_npg("nrc")("1"))),
show_heatmap_legend=FALSE,
show_row_dend=FALSE,
row_names_gp=gpar(fontsize=axis_title.fontsize),
column_names_gp=gpar(fontsize=axis_title.fontsize),
column_title_gp=gpar(fontsize=title.fontsize),
rect_gp=gpar(col="white", lwd=1))
a.heatmap
}
First, we read in the results from the CISI parameter sweep.
## Read results
# Read in all results (correlations) into one dataframe
results.files <- list.files(analysis.path, 'simulation_results.txt',
full.names=TRUE, recursive=TRUE)
results.df <- lapply(results.files, read_results, type="res") %>%
bind_rows() %>%
arrange(desc(`Gene average`))
## Specify best run
# (if show_results parameters are fixed, then the best is selected considering
# only the specified parameters)
best_run.df <- results.df
for (p in names(snakemake@params[['show_results']])){
if (!stringi::stri_isempty(snakemake@params[['show_results']][[p]])){
best_run.df <- best_run.df %>%
dplyr::filter(!!as.symbol(p)==snakemake@params[['show_results']][[p]])
}
}
best_run.df <- best_run.df[1, ]
## Read X_test and X_simulated
# Specify X_test and X_simulated files from best run
X.files <- list.files(best_run.df$folder[1], "X_", full.names=TRUE, recursive=TRUE)
# Read in sce experiments from saved anndata
X.list <- lapply(X.files, read_results, type="x")
X.list <- lapply(X.list, function(sce.temp){
assay(sce.temp, "exprs") <- asinh(counts(sce.temp)/1)
sce.temp
})
## Read U
# Specify U file from best run
u.file <- list.files(best_run.df$folder[1], "gene_modules.csv",
full.names=TRUE, recursive=TRUE)
u <- read_results(u.file, type="u")
## Read A
# Specify A/Phi file from best run
a.file <- list.files(best_run.df$folder[1], "version_*",
full.names=TRUE, recursive=TRUE)
a.file <- a.file[grepl(paste0("version_", best_run.df$version[1], ".txt"), a.file)]
a <- read_results(a.file, type="a")
## Read in masks for data
masks <- loadImages(masks.path, as.is=TRUE)
mcols(masks) <- DataFrame(sample_id=names(masks))
## Subset test images
# Chose first image from test images for plotting in this script (only if multiple
# test images are given)
if (class(test.img)=="list"){
test.img <- test.img[[1]]
}
The compressed markers in this run were: SMA, CD20, CD3, CD11c,
Ki-67, CD8a, CD4, panCK.
The tested parameters were:
k = [1, 2, 3, 4], d = [5, 10, 20, 50, 80] and m = [4, 5, 6].
Out of 60 runs (tested parameter combinations), 60 run successfully.
The best parameter combination is: k = 4, d = 10, m = 4 (path:
/mnt/bb_dqbm_volume/analysis/Tonsil_th152/training/subset/proof_of_concept_max_worst_protein/parameter_sweep/k_4/d_10/m_4)
with mean protein correlation (pearson) of 82%.
The table underneath shows the training results for all parameter combinations in the parameter sweep.
# Print datatable of results with some nice search fncs. as an overview
datatable(results.df, rownames=FALSE, filter="top",
options=list(pageLength=5, scrollX=T))
For each parameter combination from the parameter sweep, plot the average protein correlations. This used to determine the best parameter combination.
## For each different measurement of training results, plot barplot
# Melt dataframe for plotting
average_protein_cor <- results.df %>%
filter(simulation=="no_noise") %>%
dplyr::select(c("Gene average", parameters)) %>%
mutate(!!as.symbol(parameters[1]):=factor(!!as.symbol(parameters[1]),
levels=sort(as.integer(unique(results.df[[parameters[1]]])))),
!!as.symbol(parameters[2]):=factor(!!as.symbol(parameters[2]),
levels=sort(as.integer(unique(results.df[[parameters[2]]])))),
!!as.symbol(parameters[3]):=factor(!!as.symbol(parameters[3]),
levels=sort(as.integer(unique(results.df[[parameters[3]]])))))
# Create barplot
results.barplot <- plot_results(average_protein_cor, "Gene average", parameters[1],
parameters[2], parameters[3])
print(results.barplot)
To analyse the effect of a parameter while keeping all other parameters fixed, we also create grids of line plots, where the fixed parameters are the rows and columns.
# Loop through parameter for which to plot line plot while keeping all other
# parameters fixed in a grid
for (param in parameters) {
cat("####", param, "\n")
# Create plot
results.gridplot <- plot_results_grid(average_protein_cor, param)
print(results.gridplot)
cat("\n\n")
}
Correlation between ground truth and decomposed results per protein
for
asinh transformed counts of the best run.
aoi <- "exprs"
# Calculate correlations between ground truth and simulated data for each protein
X.cor <- lapply(X.list, function(sce){
counts.long <- as.data.frame(assays(sce)[[aoi]]) %>%
mutate(protein=rownames(.)) %>%
melt() %>%
dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=value,
cell=variable)
}) %>% bind_rows() %>%
group_by(protein, cell) %>%
summarise_all(na.omit) %>%
group_by(protein) %>%
mutate(correlation=cor(ground_truth, simulated))
# Plot correlations for each protei
protein.plot <- plot_protein_cor(X.cor) + ylim(0, 1)
print(protein.plot)
Expression values for cells in test image for the worst and best performing protein.
# Find worst and best performing protein
pois <- c(X.cor[which.min(X.cor$correlation), "protein"],
X.cor[which.max(X.cor$correlation), "protein"])
names(pois) <- c("Worst", "Best")
# Call plot_cells to get individual plots for test roi for decomposed and true
# results
for (n in names(pois)){
cat("####", n, "\n")
img <- plot_cells(X.list, masks, pois[[n]])
# Plot decomposed vs true results for test roi (002)
img <- plot_grid(plotlist=append(img[grepl(test.img,
names(img))],
img[grepl("legend",
names(img))]), ncol=2,
labels=unlist(lapply(X.list, function(sce){metadata(sce)$ground_truth})),
label_size=15, hjust=c(-2, -1.5),
vjust=1, scale=0.9)
print(img)
cat("\n\n")
}
# Plot U for best run
u.plot <- plot_U(u, paste0("U\n(dictionary size: ", length(unique(u$module)), ")"))
print(u.plot)
# Plot A for best run
a.plot <- plot_A(a, "A")
print(a.plot)