Parameter Sweep Evaluation

Setup

renv::init()

## * Using Bioconductor version '3.15'.

renv::restore()

## * The library is already synchronized with the lockfile.

knitr::opts_chunk$set(echo=TRUE)
knitr::opts_chunk$set(warning=FALSE, message=FALSE)

# Load libraries
library(tidyverse)
library(reshape2)
library(stringr)
library(SingleCellExperiment)
library(zellkonverter)
library(DT)
library(cowplot)
library(ggpattern)
library(ggplot2)
library(ggsci)
library(gridGraphics)
library(circlize)
library(ComplexHeatmap)
library(cytomapper)

# Plot setup
# Set fontsizes used throughout this script
title.fontsize <- 10
axis_title.fontsize <- 8

# Specify which patters are available for plotting
possible_patterns <- c("none", "pch", "stripe", "circle", "crosshatch")

# Set general input paths to all analysis files
analysis.path <- snakemake@params[["out_path"]]

# Set path to masks
masks.path <- snakemake@params[["masks_path"]]

# Specify names of parameters used to do the parameter sweep
parameters <- c("k", "d", "m")

# Specify image used for testing
test.img <- snakemake@params[["test_names"]]

## General helper fnc.

# Function to read in any of the outputs of CISI training
# file: path to file to read in 
# type: res (results.txt files with correlations), x (X_*.h5ad files with anndata
# object), a (version_*.txt with A/Phi experiment design matrix) or u (gene_modules.csv
# containing the learned dictionary)
read_results <- function(file, type){
  # Make sure file is not empty
  if (file.size(file)!=0L){
    # Read out parameter types of parameter sweep and the values used to create
    # particular output 'file'
    params <- gsub("/[^/]+$", "", gsub(".*parameter_sweep/", "", file))
    params <- str_split(str_split(params, "/")[[1]], "_")
    # Define what type of output of CISI is read in
    if (type=="res"){
      # If type="res" file containing correlation results is read in
      # Column gets added to show if results comes from noisy or no noise simulations
      res <- read_tsv(file, show_col_types=FALSE) %>%
        mutate(simulation=ifelse((grepl("composite", file) | grepl("no_noise", file)), 
                                 "no_noise", "noisy"),
               folder=gsub("/[^/]+$", "", file)) %>%
        relocate("Gene average", .before=1)
    } else if (type=="x") {
      # If type="x", then anndata object gets read into SCE containing either
      # ground truth or simulated/decomposed expressions (normalized and subsetted
      # according to what was specified/used in the CISI run)
      res <- readH5AD(file)
      metadata(res) <- list(ground_truth=ifelse(grepl("X_test", file), 
                                                "ground_truth", 
                                                "simulated"))
    } else if (type=="a"){
      # If type="a" file containing experiment matrix is read in specifying
      # which proteins are measured in the same channel
      res <- read.table(file, row.names=1, sep="\t") %>%
        dplyr::rename_all(~ (stringr::str_replace_all( ., "\\.", "-" ))) %>%
        rownames_to_column(var="channel")
    } else if (type=="u"){
      # If type="u" file containing dictionary is read in
      res <- read.csv(file) %>%
        dplyr::rename(protein=X) %>%
        dplyr::rename_with(~gsub("X", "", .x, fixed=TRUE)) %>%
        melt(id="protein") %>%
        dplyr::rename(module=variable, membership=value)
    }
    
    # Add information about which type of parameters were evaluated in the parameter
    # sweep and what the values were for particular file
    for (p in params){
      if (type=="x"){
        metadata(res)[[p[1]]] <- p[2]
      } else {
        res <- res %>%
          mutate(!!as.symbol(p[1]):=p[2], .before=1)
      }
    }
    
    res
  }
}


## Plot fnc.

# Plot results from CISI for data: grouped by group on x-axis, cor is used as
# y-value, fill variable is used for coloring and pattern for the patterns in the
# barplot
plot_results <- function(df, cor, group, fill, pattern){
  # Create barplot
  barplot <- ggplot(df, aes(x=!!sym(group), y=!!sym(cor),
                            fill=!!sym(fill), pattern=!!sym(pattern))) +
    geom_bar_pattern(stat="identity",
                     position=position_dodge(preserve="single"),
                     width=0.6,
                     color="black", 
                     pattern_fill="black",
                     pattern_angle=45,
                     pattern_density=0.3,
                     pattern_spacing=0.01,
                     pattern_key_scale_factor=0.6) +
    scale_y_continuous(limits=c(0, 1)) +
    scale_fill_npg() +
    scale_pattern_manual(values=possible_patterns[1:length(unique(df[[pattern]]))], 
                         labels=levels(df[[pattern]])) +
    labs(x=group, y=cor, pattern=pattern) + 
    guides(pattern=guide_legend(override.aes=list(fill="white")),
           fill=guide_legend(override.aes=list(pattern="none"))) +
    theme_cowplot(title.fontsize)
  
  barplot
}

# Plot results from CISI for data: line plots of results for param in a grid
# for all other parameters (which are fixed then)
plot_results_grid <- function(df, param){
  # Create grid of lineplots
  lineplot <- ggplot(df, aes(!!sym(param), `Gene average`)) +
    facet_grid(vars(!!sym(parameters[parameters!=param][1])), vars(!!sym(parameters[parameters!=param][2])),
               labeller=label_both) +
    theme_cowplot(title.fontsize) +
    geom_point(colour=pal_npg("nrc")("1")) +
    panel_border()
    
  lineplot
}

# Plot barplot of correlations per protein
plot_protein_cor <- function(X.cor){
  # Create barplot with proteins on y-axis and their correlations on the y-axis
  protein.plot <- ggplot(X.cor %>%
           dplyr::select(protein, correlation) %>%
           ungroup() %>%
           distinct(),
         aes(x=reorder(protein, correlation), y=correlation, fill=protein)) +
    geom_bar(stat="identity") +
    theme_cowplot(title.fontsize) +
    scale_fill_igv() +
    guides(fill=FALSE) +
    xlab("protein") +
    coord_flip()
  
  protein.plot
}

# Plot results of expression values in "layer" of CISI vs ground truth for 
# protein of interest poi and segmented according to masks.list
plot_cells <- function(sce.list, masks.list, poi, layer="exprs"){
  
  # Set colours for poi
  colour.cells <- list(el=c("#FFFFFF", pal_npg("nrc")("8")[8]))
  names(colour.cells) <- poi
  
  # Since all the images and legends are returned individually, we have to
  # subset and reorder them such that the simulated and ground truth are next to
  # each other and only one legend is shown
  img.list <- list()
  idx <- c()
  for (el in sce.list){
    # Select if normal counts or transformed counts should be plotted
    if (layer=="none"){
      p <- plotCells(mask=masks.list, object=el,
                     cell_id="ObjectNumber", img_id="sample_id", colour_by=poi,
                     return_plot=TRUE, image_title=list(cex=1),
                     colour=colour.cells, display="single",
                     scale_bar=list(cex=1, lwidth=5),
                     legend=list(colour_by.title.cex=fontsize(title.fontsize), 
                                 colour_by.labels.cex=fontsize(axis_title.fontsize)))
    } else {
      p <- plotCells(mask=masks.list, object=el,
                     cell_id="ObjectNumber", img_id="sample_id", colour_by=poi,
                     return_plot=TRUE, image_title=list(cex=1),
                     colour=colour.cells, display="single",
                     scale_bar=list(cex=1, lwidth=5),
                     legend=list(colour_by.title.cex=fontsize(title.fontsize), 
                                 colour_by.labels.cex=fontsize(axis_title.fontsize)),
                     exprs_values=layer)
    }
    # Add plot to image list
    p.gg <- lapply(p$plot, function(x){ggdraw(x, clip="on")})
    img.list <- append(img.list, p.gg) %>% 
      map(~ .x + theme(plot.margin=margin(l=0, r=0, t=0.01, b=0.01, unit="cm"))) 
    idx <- c(idx, seq_along(p.gg))
  }
  
  # Reorder image list
  names(img.list) <- make.unique(names(img.list))
  idx <- order(idx)
  img.list[idx]
}

# Plot heatmap of U with specified "title" 
plot_U <- function(u.temp, title, thresh=0.9){
  # Pivot dataframe to long format
  u.temp <- u.temp %>%
    dplyr::select(-parameters) %>%
    pivot_wider(names_from=module, values_from=membership) %>%
    column_to_rownames(var="protein") %>%
    as.matrix()
  
  # Compute module membership for all proteins
  u.membership <- apply(u.temp, 2, function(u.col){
    cs <- cumsum(-sort(-u.col^2))
    cs <- cs / cs[length(cs)]
    cs <- ifelse(cs<=thresh, 1, 0)
    cs[which(cs==0)[1]] <- 1
    cs <- cs[order(match(names(cs), names(u.col)))]
    
    cs
  })
  
  # Create heatmap of module membership
  u.heatmap <- Heatmap(u.membership, 
                         column_title=title,
                         col=structure(c("grey", pal_npg("nrc")("1")[1]), 
                                       names=c("0", "1")), 
                         show_heatmap_legend=FALSE, 
                         show_row_dend=FALSE, 
                         row_names_gp=gpar(fontsize=axis_title.fontsize),
                         column_names_gp=gpar(fontsize=axis_title.fontsize),
                         column_title_gp=gpar(fontsize=title.fontsize),
                         rect_gp=gpar(col="white", lwd=1))
  
  u.heatmap
}

# Plot heatmap of A with specified "title" 
plot_A <- function(a, title){
  # Transform into matrix with channels as rownames
  a.matrix <- a %>%
    dplyr::select(-parameters) %>%
    column_to_rownames("channel") %>%
    as.matrix()
  
  a.heatmap <- Heatmap(a.matrix, 
                       column_title=title,
                       col=colorRamp2(c(0, 1), colors=c("grey", pal_npg("nrc")("1"))), 
                       show_heatmap_legend=FALSE, 
                       show_row_dend=FALSE, 
                       row_names_gp=gpar(fontsize=axis_title.fontsize),
                       column_names_gp=gpar(fontsize=axis_title.fontsize),
                       column_title_gp=gpar(fontsize=title.fontsize),
                       rect_gp=gpar(col="white", lwd=1))
  
  a.heatmap
}

Read Inputs

First, we read in the results from the CISI parameter sweep.

## Read results
# Read in all results (correlations) into one dataframe
results.files <- list.files(analysis.path, 'simulation_results.txt',
                            full.names=TRUE, recursive=TRUE)

results.df <- lapply(results.files, read_results, type="res") %>% 
  bind_rows() %>%
  arrange(desc(`Gene average`))


## Specify best run 
# (if show_results parameters are fixed, then the best is selected considering
# only the specified parameters)
best_run.df <- results.df
for (p in names(snakemake@params[['show_results']])){
  if (!stringi::stri_isempty(snakemake@params[['show_results']][[p]])){
    best_run.df <- best_run.df %>%
      dplyr::filter(!!as.symbol(p)==snakemake@params[['show_results']][[p]])
  }
}
best_run.df <- best_run.df[1, ]


## Read X_test and X_simulated
# Specify X_test and X_simulated files from best run 
X.files <- list.files(best_run.df$folder[1], "X_", full.names=TRUE, recursive=TRUE)

# Read in sce experiments from saved anndata 
X.list <- lapply(X.files, read_results, type="x")
X.list <- lapply(X.list, function(sce.temp){
  assay(sce.temp, "exprs") <- asinh(counts(sce.temp)/1)
  
  sce.temp
})


## Read U
# Specify U file from best run 
u.file <- list.files(best_run.df$folder[1], "gene_modules.csv",
                      full.names=TRUE, recursive=TRUE)
u <- read_results(u.file, type="u")


## Read A
# Specify A/Phi file from best run 
a.file <- list.files(best_run.df$folder[1], "version_*",
                      full.names=TRUE, recursive=TRUE)
a.file <- a.file[grepl(paste0("version_", best_run.df$version[1], ".txt"), a.file)]
a <- read_results(a.file, type="a") 


## Read in masks for data
masks <- loadImages(masks.path, as.is=TRUE)
mcols(masks) <- DataFrame(sample_id=names(masks))


## Subset test images
# Chose first image from test images for plotting in this script (only if multiple 
# test images are given)
if (class(test.img)=="list"){
  test.img <- test.img[[1]]
}

The compressed markers in this run were: SMA, CD20, CD3, CD11c, Ki-67, CD8a, CD4, panCK.
The tested parameters were:
k = [1, 2, 3, 4], d = [5, 10, 20, 50, 80] and m = [4, 5, 6].
Out of 60 runs (tested parameter combinations), 60 run successfully.
The best parameter combination is: k = 4, d = 10, m = 4 (path: /mnt/bb_dqbm_volume/analysis/Tonsil_th152/training/subset/proof_of_concept_max_worst_protein/parameter_sweep/k_4/d_10/m_4) with mean protein correlation (pearson) of 82%.

Results

Overview

The table underneath shows the training results for all parameter combinations in the parameter sweep.

# Print datatable of results with some nice search fncs. as an overview
datatable(results.df, rownames=FALSE, filter="top", 
          options=list(pageLength=5, scrollX=T))

Barplot of Average Protein Correlation)

For each parameter combination from the parameter sweep, plot the average protein correlations. This used to determine the best parameter combination.

## For each different measurement of training results, plot barplot
# Melt dataframe for plotting
average_protein_cor <- results.df %>%
  filter(simulation=="no_noise") %>%
  dplyr::select(c("Gene average", parameters)) %>%
  mutate(!!as.symbol(parameters[1]):=factor(!!as.symbol(parameters[1]), 
                                    levels=sort(as.integer(unique(results.df[[parameters[1]]])))),
         !!as.symbol(parameters[2]):=factor(!!as.symbol(parameters[2]), 
                                   levels=sort(as.integer(unique(results.df[[parameters[2]]])))),
         !!as.symbol(parameters[3]):=factor(!!as.symbol(parameters[3]), 
                                      levels=sort(as.integer(unique(results.df[[parameters[3]]])))))

# Create barplot
results.barplot <- plot_results(average_protein_cor, "Gene average", parameters[1], 
                                parameters[2], parameters[3])
print(results.barplot)

Effects of Different Parameters

To analyse the effect of a parameter while keeping all other parameters fixed, we also create grids of line plots, where the fixed parameters are the rows and columns.

# Loop through parameter for which to plot line plot while keeping all other
# parameters fixed in a grid
for (param in parameters) {
  cat("####", param, "\n")
  
  # Create plot
  results.gridplot <- plot_results_grid(average_protein_cor, param)
  
  print(results.gridplot)
  cat("\n\n")
}

k

d

m

Best Parameters

Correlations per Protein (arcsinh)

Correlation between ground truth and decomposed results per protein for
asinh transformed counts of the best run.

aoi <- "exprs"

# Calculate correlations between ground truth and simulated data for each protein
X.cor <- lapply(X.list, function(sce){
  counts.long <- as.data.frame(assays(sce)[[aoi]]) %>%
    mutate(protein=rownames(.)) %>%
    melt() %>%
    dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=value,
                  cell=variable) 
  
}) %>% bind_rows() %>%
  group_by(protein, cell) %>%
  summarise_all(na.omit) %>%
  group_by(protein) %>%
  mutate(correlation=cor(ground_truth, simulated))

# Plot correlations for each protei
protein.plot <- plot_protein_cor(X.cor) + ylim(0, 1)
print(protein.plot)

Image results

Espression Values for Worst and Best Proteins (arcsinh)

Expression values for cells in test image for the worst and best performing protein.

# Find worst and best performing protein
pois <- c(X.cor[which.min(X.cor$correlation), "protein"],
          X.cor[which.max(X.cor$correlation), "protein"])
names(pois) <- c("Worst", "Best")

# Call plot_cells to get individual plots for test roi for decomposed and true
# results
for (n in names(pois)){
  cat("####", n, "\n")
  
  img <- plot_cells(X.list, masks, pois[[n]])
  # Plot decomposed vs true results for test roi (002)
  img <- plot_grid(plotlist=append(img[grepl(test.img,
                                             names(img))],
                                   img[grepl("legend",
                                             names(img))]), ncol=2, 
                   labels=unlist(lapply(X.list, function(sce){metadata(sce)$ground_truth})),
                   label_size=15, hjust=c(-2, -1.5), 
                   vjust=1, scale=0.9)
  print(img)
  cat("\n\n")
}

Worst

Best

Matrix designs

U

# Plot U for best run
u.plot <- plot_U(u, paste0("U\n(dictionary size: ", length(unique(u$module)), ")"))
print(u.plot)

A

# Plot A for best run
a.plot <- plot_A(a, "A")
print(a.plot)