Setup

knitr::opts_chunk$set(echo=TRUE)
knitr::opts_chunk$set(warning=FALSE, message=FALSE)

# Load libraries
library(scater)
library(igraph)
library(dittoSeq)
library(Rphenograph)
library(aricode)
library(viridis)
library(ggrepel)
library(LaplacesDemon)
library(DescTools)
library(RColorBrewer)
library(mclust)
# Set general input paths to all analysis files
analysis.path <- "/mnt/bb_dqbm_volume/analysis"

# Set path to masks
tonsil.path <- "/mnt/bb_dqbm_volume/data/Tonsil_th152"
lung.path <- "/mnt/bb_dqbm_volume/data/Immucan_lung"
# Import helper fncs
source("../analysis/helpers/helper_fnc.R")

Read Inputs

First, we look at U computed using different training datasets. For this, we read in the results from CISI training on the Tonsil th152 and the Immucan lung dataset (created by tonsil_vs_lung.py). (Note: Since the training was done on normalized counts using the original normalization used by CISI, e.g. protein expressions are normalized accross cells, all counts shown in this script are actually normalized. If arcsinh is applied, then this is applied on top of the before mentioned normalization.)

## Read results
# Read in all results for tonsil and lung comparison into one dataframe
files <- list.files(analysis.path, "simulation_results.txt",
                    full.names=TRUE, recursive=TRUE)
# TODO: change this line if more files come along and need to exclude
files <- files[grepl(
  "tonsil_vs_lung|Tonsil_th152/training/full/k_|Immucan_lung/training/subset", files)]
res <- lapply(files, read_results, type="res") %>% 
  bind_rows() 

# Harmonize all dataset names
patterns <- unique(unlist(lapply(res$training, function(name){
  if (length(str_split(name, "_")[[1]])==1){
    name
  }
})))

res$training <- unlist(lapply(res$training, function(pat){
  patterns[str_detect(pat, fixed(patterns, ignore_case=TRUE))]
}))
res$dataset <- unlist(lapply(res$dataset, function(pat){
  patterns[str_detect(pat, fixed(patterns, ignore_case=TRUE))]
}))

## Read U
u.files <- list.files(analysis.path, "gene_modules.csv",
                      full.names=TRUE, recursive=TRUE)
# TODO: change this line if more files come along and need to exclude
u.files <- u.files[grepl(
  "tonsil_vs_lung|Tonsil_th152/training/full/k_|Immucan_lung/training/subset", u.files)]
u <- lapply(u.files, read_U, type="training") %>% 
  bind_rows() %>%
  dplyr::rename(dataset=rep)

## Read X_test and X_simulated
# Specify X_test and X_simulated files to be read in
X.files <- list.files(analysis.path, "X_", full.names=TRUE, recursive=TRUE)
# TODO: change this line if more files come along and need to exclude
X.files <- X.files[grepl(
  "tonsil_vs_lung|Tonsil_th152/training/full/k_|Immucan_lung/training/subset", X.files)]
X.files <- X.files[!grepl("_processed", X.files)]

# Read processed SCE to avoid computing UMAP, TSNE, ... all the time since it is computationally
# expensive
# X.files <- X.files[grepl("_processed", X.files)]

# Read in all sce experiments from saved anndata and save additional info in metadata
# (e.g. which dataset is used in training and testing, which k was used and if it
# is the ground truth or simulated X)
# Add assay with transformed counts
X.list <- lapply(X.files, read_results, type="x")
X.list <- lapply(X.list, function(sce.temp){
  pat <- metadata(sce.temp)
  metadata(sce.temp)$dataset <- patterns[str_detect(pat$dataset, 
                                                    fixed(patterns, ignore_case=TRUE))]
  metadata(sce.temp)$training <- patterns[str_detect(pat$training,
                                                     fixed(patterns, ignore_case=TRUE))]
  assay.name <- names(assays(sce.temp))
  for (a in assay.name[-1]) {
    assay(sce.temp, a) <- NULL
  }
  assay(sce.temp, "exprs") <- asinh(counts(sce.temp)/1)
  
  sce.temp
})

# Add reduced dimensions
set.seed(220225)
X.list <- lapply(X.list, function(sce){
  sce <- runUMAP(sce, exprs_values="exprs") 
  sce <- runTSNE(sce, exprs_values="exprs")
  sce
})

# Save SCE to avoid computing UMAP, TSNE, ... all the time since it is computationally
# expensive
# lapply(1:(length(X.list)), function(i){saveRDS(X.list[i], 
#                                                paste0(gsub("\\..*", "", X.files[i]),
#                                                       "_processed.rds"))})

## Read masks
# Read in masks for tonsil data
masks.tonsil <- loadImages(file.path(tonsil.path, "steinbock/masks_deepcell"), as.is=TRUE)
mcols(masks.tonsil) <- DataFrame(sample_id=names(masks.tonsil))

# Read in masks for lung data
masks.lung <- loadImages(file.path(lung.path, "masks"), as.is=TRUE)
mcols(masks.lung) <- DataFrame(sample_id=names(masks.lung))

## Read original H5ad SCE objects
tonsil.original <- readH5AD(file.path(tonsil.path, "preprocessed_data/spe.h5ad"))
lung.original <- readH5AD(file.path(lung.path, "Lung_sce.h5ad"))
original.list <- list(tonsil.original, lung.original)
names(original.list) <- c("tonsil_tonsil", "lung_lung")

Plot defaults

Next, we set some defaults for consisistend plotting accross the script.

# Define celltype colours used by the rest of the script
celltype.col <- c(brewer.pal(12, "Paired"),
                           brewer.pal(8, "Pastel2")[-c(3,5,8)],
                           brewer.pal(12, "Set3")[-c(2,3,8,9,11,12)])
celltype.col <- c(celltype.col[1:length(unique(colData(lung.original)$celltype))],
                  "black")
names(celltype.col) <- c(as.character(unique(colData(lung.original)$celltype)), NA)
celltype.col <-  celltype.col[colData(lung.original) %>% 
                                as.data.frame() %>%
                                dplyr::pull(celltype) %>% 
                                levels]
# Delete objects no longer needed (since there are a lot of big objects in this RMD)
rm(tonsil.original)
rm(lung.original)

CISI results

In the next part, we have a look at the result statistics and outputs produced by the CISI training function.

Results

Here the correlation result statistics produced by CISI are shown as barplots.

# For each different measurement of training results, plot a barplot comparing
# diff. k-sparsities, simulation types and training-test set combinations
for (measure in names(res)[2:10]) {
  cat("##### ", measure, "\n")
  
  # Subset dataframe for plotting and add column to combine training and test dataset name
  data_temp <- res %>%
    dplyr::select(dataset, training, simulation, k, !!measure) %>%
    mutate(group=paste(training, dataset, sep="_"))
  
  # Create barplot
  results.barplot <- plot_cisi_results(data_temp, "group", measure, "k")
  
  print(results.barplot)
  cat("\n\n")
}
Overall pearson

Overall spearman

Gene average

Sample average

Sample dist pearson

Sample dist spearman

Gene dist pearson

Gene dist spearman

Matrix coherence (90th ptile)

U

Next, we have a look at the dictionaries U created for the tonsil and lung datasets. For this, the active proteins in a module are defined as the proteins explaining 90% of the cumulative sum of the whole module. These are then coloured accordingly.

Heatmaps of U

# Plot U as heatmaps for the different datasets and k's
cor <- plot_U(u, "k", "dataset")
k = 1

k = 3

Module Comparisons

To illustrate the overlap of modules between datasets, both U’s are shown in the same heatmap and shared modules are coloured accordingly.

# PLot the heatmap of the combined U's for both datasets to show overlap for all k's
plot_U_membership(u, "k", "dataset")
k = 1

k = 3

Mantel Test between U’s

To compute the similarity of U’s between datasets, the mantel correlations is calculated.

# Calculate mantel test between U's of tonsil and lung for all k's
mantel <- lapply(cor, function(l){
  mantel_test(l[[1]], l[[2]])$mantel_r
  }) %>%
  as.data.frame(col.names=names(cor), check.names=FALSE)

knitr::kable(mantel, digits=2,
             col.names=paste0("k=", names(mantel)))
k=1 k=3
0.66 0.51

Image results

To have a better intuition on how CISI performs, we now look at the expression levels of individual cells for the testing image.

Test Image (Tonsil th152, k=1)

# Subset to objects trained and tested on tonsil data
X.tonsil1 <- keep(X.list, function(x){
  metadata(x)$training=="tonsil" & metadata(x)$dataset=="tonsil" & metadata(x)$k=="1"})
names(X.tonsil1) <- lapply(X.tonsil1, 
                            function(x){metadata(x)$ground_truth})

# Call plot_cells to get individual plots for each roi for decomposed and true
# results
pois <- c("MPO", "SMA")

for (p in pois){
  cat("##### ", p, "\n")
  
  X.imgList <- plot_cells(X.tonsil1, masks.tonsil, p)
  # Plot decomposed vs true results for test roi (002)
  X.img <- plot_grid(plotlist=append(X.imgList[grepl("20220520_TsH_th152_cisi1_002",
                                                     names(X.imgList))],
                                     X.imgList[grepl("legend",
                                                     names(X.imgList))]),
                     ncol=2, labels=names(X.tonsil1),
                     label_size=title.fontsize, hjust=c(-2, -1.5), 
                     vjust=1, scale=0.9)
  print(X.img)
  cat("\n\n")
}
MPO

SMA

CD20 of Test Image Overlayed (Tonsil th152, k=1)

To make it easier to see differences in simulated and ground truth images, we overlay them next using opposite colors. Cells which have similar expressions in both simulated and ground truth data should therefore have an orange colour.

# Rename list of tonsil data for nicer titles in plotting
X.tonsil1Renamed <- lapply(names(X.tonsil1), function(n){
  rownames(X.tonsil1[[n]]) <- paste(rownames(X.tonsil1[[n]]),
                                     n, sep="\n")
  reducedDims(X.tonsil1[[n]]) <- NULL
  X.tonsil1[[n]]
})
names(X.tonsil1Renamed) <- names(X.tonsil1)
# Add decomposed and true dataset of tonsil into one SCE
X.overlayed <- do.call("rbind", X.tonsil1Renamed)

# Delete objects no longer needed (since there are a lot of big objects in this RMD)
rm(X.tonsil1)

# Define protein of interest
pois <- c("CD20\nsimulated", "CD20\nground_truth")

# Plot cells coloured according to decomposed and true poi values 
# (if similar in both should have a mixture of colours, e.g. orange) 
X.imgDiff <- plotCells(mask=masks.tonsil[unique(colData(X.overlayed)$sample_id)], 
                       object=X.overlayed,
                       cell_id="ObjectNumber", img_id="sample_id", colour_by=pois,
                       return_plot=TRUE,  image_title=list(cex=1),
                       scale_bar=list(cex=fontsize(axis_title.fontsize), lwidth=5),
                       legend=list(colour_by.title.cex=fontsize(axis_title.fontsize), 
                                   colour_by.labels.cex=fontsize(axis_title.fontsize)))

# Delete objects no longer needed (since there are a lot of big objects in this RMD)
rm(X.tonsil1Renamed)

# Draw plot
ggdraw(X.imgDiff$plot)

Counts results

In the next part, we look at the counts of individual cells for different protein combinations.

Scatterplots of Counts (Immucan Lung)

Plot of counts coloured by defined celltype.

# Subset to dataset trained and tested on the Immucan lung dataset
X.lung_lung <- keep(X.list, function(x){
  metadata(x)$training=="lung" & metadata(x)$dataset=="lung"})
names(X.lung_lung) <- lapply(X.lung_lung,
                             function(x){metadata(x)$ground_truth})
k_s <- unique(unlist(lapply(X.lung_lung, function(sce_temp){metadata(sce_temp)$k})))

for (k in k_s) {
  cat("##### k =", k, "\n")
  
  # Subset for k of interest
  X.lung_lungK <- keep(X.lung_lung, function(x){metadata(x)$k==k})
  
  # Plot counts of proteins of interest of decomposed and true 
  # datasets coloured by different celltypes
  X.Exprs <- plot_grid(plot_exprs(X.lung_lungK, "B", "CD3", "CD20", layer="counts"),
                       plot_exprs(X.lung_lungK, "BnT", "CD3", "CD20", layer="counts"),
                       plot_exprs(X.lung_lungK, "Neutro", "MPO", "CD15", layer="counts"),
                       plot_exprs(X.lung_lungK, "Neutro", "CD3", "MPO", layer="counts"),
                       plot_exprs(X.lung_lungK, "DC", "CD11c", "CD68", layer="counts"),
                       ncol=1)
  print(X.Exprs)
  cat("\n\n")
}
k = 1

k = 3

Per protein results

To have an idea on how individual proteins perform using CISI and not just an overview, we also look at individual protein performances.

Results per Protein (k=1, arcsinh)

Scatterplot of ground truth vs decomposed results per protein for k=1 and asinh transformed counts.

# Add all X_test arrays-of-interested (aoi) together into one dataframe for easier plotting
aoi <- "exprs"
X.df <- lapply(X.list, function(sce){
  counts.long <- as.data.frame(assays(sce)[[aoi]]) %>%
    mutate(protein=rownames(.)) %>%
    melt() %>%
    dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=value,
                  cell=variable) %>%
    mutate(k=metadata(sce)$k,
           dataset=paste(metadata(sce)$training, metadata(sce)$dataset, sep="_"))
  
}) %>% bind_rows() %>%
  group_by(protein, cell, k, dataset) %>%
  summarise_all(na.omit)

# For each test/training dataset combination plot for each true vs decomposed
# results a scatter plot and add diagonal and regression line, as well as
# R (pearson correlation coefficient)
for (i in unique(X.df$dataset)) {
  cat("#####", i, "\n")
  
  proteinPlot <- plot_protein_scatterplot(X.df %>%
                                            dplyr::filter(k=="1" & dataset==i))
  
  print(proteinPlot)
  cat("\n\n")
}
lung_lung

tonsil_lung

lung_tonsil

tonsil_tonsil

# Delete objects no longer needed (since there are a lot of big objects in this RMD)
rm(X.df)

Correlations per Protein (k=1, arcsinh)

Correlation between ground truth and decomposed results per protein for k=1 and asinh transformed counts.

aoi <- "exprs"
# Calculate correlations between ground truth and simulated data for each protein
# using the assay 'aoi'
X.cor <- lapply(keep(X.list, function(sce){
  metadata(sce)$dataset==metadata(sce)$training
}), function(sce){
  counts.long <- as.data.frame(assays(sce)[[aoi]]) %>%
    mutate(protein=rownames(.)) %>%
    melt() %>%
    dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=value,
                  cell=variable) %>%
    mutate(k=metadata(sce)$k,
           dataset=paste(metadata(sce)$training, metadata(sce)$dataset, sep="_"))
  
}) %>% bind_rows() %>%
  group_by(protein, cell, k, dataset) %>%
  summarise_all(na.omit) %>%
  group_by(k, dataset, protein) %>%
  summarize(correlation=cor(ground_truth, simulated)) %>%
  dplyr::filter(k=="1") %>%
  dplyr::select(-k)

# Set seed for reproducibility when using Mclust
set.seed(220224)

# Read in original SCE with all samples and proteins and without paper normalization
# of counts/exprs and compute statistics like mean, median, var and SNR on these
# Add this to above correlation dataframe
aoi.original <- "exprs"
X.cor <- lapply(names(original.list), function(n){
  sce <- original.list[[n]]
  counts.long <- as.data.frame(assays(sce)[[aoi.original]]) %>%
    mutate(protein=rownames(.)) %>%
    melt() %>%
    dplyr::rename(ground_truth=value,
                  cell=variable) %>%
    dplyr::filter(protein %in% unique(X.cor$protein)) %>%
    group_by(protein) %>%
    mutate(median=median(ground_truth),
           mean=mean(ground_truth),
           var=var(ground_truth),
           dataset=n)
  
  # Add signal to noise ratio and mean of positive class per protein
  mat <- apply(assays(sce)[[aoi.original]], 1, function(x){
    cur_model <- Mclust(x, G = 2)
    mean1 <- mean(x[cur_model$classification == 1])
    mean2 <- mean(x[cur_model$classification == 2])
    
    signal <- ifelse(mean1 > mean2, mean1, mean2)
    noise <- ifelse(mean1 > mean2, mean2, mean1)
    
    return(c(snr=signal/noise, ps=signal))
  }) 
  counts.long <- counts.long %>%
    merge(t(mat) %>% 
            as.data.frame() %>% 
            mutate(protein=colnames(mat)))

  counts.long
}) %>% bind_rows() %>%
  merge(X.cor) %>%
  ungroup()

# Plot correlations for k=1 for each test/training dataset combination
for (i in unique(X.cor$dataset)) {
  cat("#####", i, "\n")
  
  proteinCorr <- plot_protein_cor(X.cor %>% 
                                    dplyr::filter(dataset==i))
  
  print(proteinCorr)
  cat("\n\n")
}
lung_lung

tonsil_tonsil

Protein distributions coloured by correlation results (k=1, arcsinh)

For a manual inspection of different protein distribution characteristics, we plot the boxplots of the original protein expressions coloured by correlation results. These show: Interquartile range (IQR), 1st and 3rd quantiles, median, mean, max, min and outliers.

# # Have a look at dittoRidgePlots of some good/bad performing proteins for lung and
# # tonsil
# tonsil.originalSubset <- tonsil.original
# colData(tonsil.originalSubset)$dummy <- "1"
# for (p in c("MPO", "Ki-67", "CD15", "CD8a", "CD45RO", "CD303", "PD-L1", "SMA", 
#             "TCF1/TCF7", "CD27", "CD14")){
#   print(dittoRidgePlot(tonsil.originalSubset, var=p, group.by="dummy", assay="exprs") +
#           ggtitle(p) +
#           coord_cartesian(xlim=c(0, 6.2), expand=TRUE))
# }
# lung.originalSubset <- lung.original
# colData(lung.originalSubset)$dummy <- "1"
# for (p in c("MPO", "Ki-67", "CD20", "PD-L1", "PD-1", "CD14")){
#   print(dittoRidgePlot(lung.originalSubset, var=p, group.by="dummy", assay="exprs") +
#           ggtitle(p))
# }

# Plot boxplots of protein expr per dataset coloured by protein correlation results
# (performance)
proteinDist <- plot_protein_dist(X.cor)

print(proteinDist)

Correlations per Protein vs Protein Characteristics (k=1, arcsinh)

Correlation between ground truth and decomposed results per protein for k=1 and asinh transformed counts compared to protein characteristics of ground truth: Median, var, mean and mode_pos.

# Remove indidvidual cell levels to make data frame smaller bc of storage
X.cor <- X.cor %>%
  dplyr::select(-c("cell", "ground_truth")) %>%
  ungroup() %>%
  distinct()

# Define characteristics of interest
coi <- c("median", "var", "mean", "snr")

# Plot correlations for k=1 for each test/training dataset combination
for (i in coi) {
  cat("#####", i, "\n")
  
  proteinChar <- ggplot(X.cor, 
                        aes(x=!!sym(i), y=correlation, label=protein)) +
    geom_point(color=pal_npg("nrc")("1")) +
    facet_wrap(~ dataset, scales="free", ncol=2) +
    stat_cor(size=2.8,
             label.x.npc="center",
             label.y.npc="bottom") +
    stat_smooth(method="lm",
                formula=y ~ log(x),
                se=FALSE,
                color=pal_npg("nrc")("4")[4], size=2) +
    geom_text_repel(size=2.8)
  
  print(proteinChar)
  cat("\n\n")
}
median

var

mean

snr

# Delete objects no longer needed (since there are a lot of big objects in this RMD)
rm(X.cor)

Cumulative sum of expression values (k=1)

Another way to analyse the protein-wise results instead of the correlation is shown in the subsequent part. In this case, cells are ranked according to their expression in the ground truth data for each protein and we compute the cumulative sum. Then, we plot this against the cumulative sum of the simulated data where the cell order is determined by the expression values in the ground truth data. The results are shown for k=1.

# Collect all counts for each dataset, rank according to the counts and calculate
# the cumulative sum
cumsum.df <- lapply(X.list, function(sce){
  counts.long <- as.data.frame(assays(sce)[["counts"]]) %>%
    mutate(protein=rownames(.)) %>%
    melt() %>%
    group_by(protein) %>%
    dplyr::arrange(desc(value)) %>%
    mutate(cumsum=cumsum(value),
           x=(0:(length(cumsum)-1)),
           cumsum=cumsum/max(cumsum)) %>%
    dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=value,
                  cell=variable,
                  !!as.symbol(paste(metadata(sce)$ground_truth, "cumsum", sep="_")):=cumsum,
                  !!as.symbol(paste(metadata(sce)$ground_truth, "x", sep="_")):=x) %>%
    mutate(k=metadata(sce)$k,
           dataset=paste(metadata(sce)$training, metadata(sce)$dataset, sep="_"))
  
  counts.long
}) %>% bind_rows() %>%
  group_by(protein, cell, k, dataset) %>%
  summarise_all(na.omit) 


# Plot cumulative sums for k=1 for each test/training dataset combination
for (i in unique(cumsum.df$dataset)) {
  cat("#####", i, "\n")
  
  protein.cumsum <- ggplot(cumsum.df %>% 
                          dplyr::filter(k=="1" & dataset==i)) +
    geom_step(aes(x=simulated_x, y=simulated_cumsum, color="simulated")) +
    geom_step(aes(x=ground_truth_x, y=ground_truth_cumsum, color="ground_truth")) +
    facet_wrap(~protein, scales="free") +
    scale_x_continuous(guide=guide_axis(n.dodge=2)) +
    ylab("Cumulative sum of counts") +
    xlab("Ordered cells") +
    scale_colour_manual("", 
                        breaks=c("simulated", "ground_truth"),
                        values=c(simulated=pal_npg("nrc")("1"),
                                 ground_truth=pal_npg("nrc")("2")[2]))
  
  print(protein.cumsum)
  cat("\n\n")
}
lung_lung

tonsil_lung

lung_tonsil

tonsil_tonsil

# Delete objects no longer needed (since there are a lot of big objects in this RMD)
rm(cumsum.df)

Reduced dimensions and clustering results

Reduced Dimension Plots (k=1, Immucan_lung, arcsinh)

Below the UMAP and TSNE of the arcsinh counts for the Immucan lung dataset are shown using k=1. The cells are coloured by their celltype (predicted by Daniel and present in the training data already).

# Plot UMAP and TSNE for k=1 and lung data
for (sce in keep(X.lung_lung, function(x){metadata(x)$k=="1"})){
  cat("#####", metadata(sce)$ground_truth, "\n")
  
  # Create empty list for plots
  X.redDimlist <- list()
  # Extract for simulated and true results for UMAP and TSNE and plot and
  # colour by celltype
  for (method in c("UMAP", "TSNE")){
    X.redDimlist <- append(X.redDimlist,
                           list(dittoDimPlot(sce,
                                             var="celltype",
                                             reduction.use=method,
                                             size=0.2,
                                             color.panel=celltype.col,
                                             main="") +
                                  theme(legend.position="bottom") +
                                  guides(color=guide_legend(nrow=4, 
                                                            byrow=TRUE,
                                                            override.aes=list(size=3)))))
    
  }
  # Plot simulated and true reduced dim on top of each other
  X.redDimPlot <- plot_grid(plotlist=X.redDimlist,
                            ncol=2, labels=c("UMAP", "TSNE"),
                            label_size=title.fontsize, hjust=c(-2, -1.5), vjust=1, scale=0.9)
  
  print(X.redDimPlot)
  cat("\n\n")
}
simulated

ground_truth

Cluster overlap between ground truth and simulated data (arcsinh)

To investigate the clustering results of ground truth and simulated data a bit more, we show the overlap of clusters computed using Rphenograph (run with k=100 clusters) individually and compare which clusters overlap using a heatmap.

# Subset to data tested on lung
X.lung <- keep(X.list, function(x){
  metadata(x)$dataset=="lung"})
names(X.lung) <- lapply(X.lung, function(sce){
  paste(metadata(sce)$training, metadata(sce)$dataset,
        metadata(sce)$k, metadata(sce)$ground_truth, sep="_")
})

# Compute clusters for each dataset using Rphenograph (for ground truth data,
# clusters are computed twice and an ARI score between them is calculated as a 
# reference)
all.clusters <- lapply(X.lung, 
  function(sce){
  mat <- t(assay(sce, "exprs"))
  out1 <- Rphenograph(mat, k=100)
  clusters <- as.vector(membership(out1[[2]])) %>%
    as.data.frame() %>%
    dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=".") %>%
    mutate(dataset=paste(metadata(sce)$training, metadata(sce)$dataset, sep="_"),
           k=metadata(sce)$k,
           celltype=colData(sce)$celltype,
           cell=colData(sce)$cell_id) 
  
  if (metadata(sce)$ground_truth=="simulated"){
    out2 <- Rphenograph(mat, k=100)
    max_ari <- ARI(factor(membership(out1[[2]])), factor(membership(out2[[2]])))
    clusters <- clusters %>%
      mutate(max_ari=max_ari)
  } 
  
  clusters
}) %>% bind_rows() %>%
  group_by(k, dataset, cell, celltype) %>%
  summarise_all(na.omit) %>%
  ungroup()


# Plot heatmaps showing overlap of clusters for each dataset and k
for (i in unique(all.clusters$k)){
  cat("##### k =", i, "\n")
  
  plot.new()
  ht_list <- NULL
  for (d in unique(all.clusters$dataset)){
    # Get table of number of cells being in simulated cluster i and ground truth
    # cluster j
    temp.matrix <- table(paste("sim", all.clusters %>% 
                                 dplyr::filter(k==i & dataset==d) %>% 
                                 pull(simulated)), 
                         paste("gt", all.clusters %>% 
                                 dplyr::filter(k==i & dataset==d) %>% 
                                 pull(ground_truth))) %>% 
      as.matrix()
    temp.matrix <- log10(temp.matrix + 10)
    
    # Get barplot annotations of celltypes
    anno.list <- get_anno_clusters(all.clusters %>% 
                                     dplyr::filter(k==i & dataset==d),
                                   celltype.col=celltype.col)
    col_anno <- anno.list[[1]]
    row_anno <- anno.list[[2]]
    
    # Create heatmap
    cluster.overlap <- Heatmap(temp.matrix,
                               show_row_dend=FALSE,
                               show_column_dend=FALSE,
                               top_annotation=col_anno,
                               left_annotation=row_anno,
                               col=magma(100),
                               column_title=d,
                               row_names_gp=gpar(fontsize=axis_title.fontsize+6),
                               column_names_gp=gpar(fontsize=axis_title.fontsize+6),
                               column_title_gp=gpar(fontsize=title.fontsize+6,
                                                    fontface="bold"),
                               rect_gp=gpar(col="white", lwd=1),
                               heatmap_legend_param=list(labels_gp=gpar(fontsize=axis_title.fontsize+6),
                                                         title_gp=gpar(fontsize=axis_title.fontsize+6, 
                                                                       fontface="bold"))) 
    
    # Add heatmap to list
    ht_list <- append(ht_list, 
                      list(grid.grabExpr(draw(cluster.overlap,
                                              heatmap_legend_list=list(
                                                Legend(labels=names(celltype.col), 
                                                       title="Celltypes",
                                                       legend_gp=gpar(fill=celltype.col),
                                                       labels_gp=gpar(fontsize=axis_title.fontsize+6),
                                                       title_gp=gpar(fontsize=axis_title.fontsize+6, 
                                                                     fontface="bold")))))))
  }
  
  print(plot_grid(plotlist=ht_list, nrow=1))
  cat("\n\n")
}
k = 1

k = 3

Kullback-Leibner divergence of clusters (arcsinh)

We also compare the clusters of the ground truth data to the clusters of the simulated data using the Kullback-Leibner divergence of the celltypes.

# Plot heatmaps showing overlap of clusters for each dataset and k
for (i in unique(all.clusters$k)){
  cat("##### k =", i, "\n")
  
  # Start new plot 
  plot.new()
  # New empty heatmap list
  ht_list.kl <- NULL
  for (d in unique(all.clusters$dataset)){
    # Compute Kullback-Leibner divergence
    clusters.filtered <- all.clusters %>%
      dplyr::filter(k==i & dataset==d) %>%
      dplyr::select(-c("k", "dataset", "max_ari"))
    clusters_sim.kl <- table(clusters.filtered %>% 
                               dplyr::pull(celltype), 
                             paste0("sim_", clusters.filtered %>% 
                               dplyr::pull(simulated))) %>%
      scale(center=FALSE, scale=colSums(.))
    clusters_gt.kl <- table(clusters.filtered %>% 
                               dplyr::pull(celltype), 
                             paste0("gt_", clusters.filtered %>% 
                               dplyr::pull(ground_truth))) %>%
      scale(center=FALSE, scale=colSums(.))
    
    clusters.kl <- apply(clusters_sim.kl, 2, function(x){
      apply(clusters_gt.kl, 2, function(y){
        KLD(x, y)$sum.KLD.py.px})
    })
    
    # Get barplot annotations of celltypes
    anno.list <- get_anno_clusters(all.clusters %>% 
                                     dplyr::filter(k==i & dataset==d),
                                   celltype.col=celltype.col)
    col_anno <- anno.list[[1]]
    row_anno <- anno.list[[2]]

    # Create heatmap
    clusters.klPlot <- Heatmap(t(clusters.kl),
                           show_row_dend=FALSE,
                           show_column_dend=FALSE,
                           top_annotation=col_anno,
                           left_annotation=row_anno,
                           col=colorRamp2(c(0, 0.1, 5), rev(magma(3))), #rev(magma(100)),
                           column_title=d,
                           row_names_gp=gpar(fontsize=axis_title.fontsize+6),
                           column_names_gp=gpar(fontsize=axis_title.fontsize+6),
                           column_title_gp=gpar(fontsize=title.fontsize+6,
                                                fontface="bold"),
                           rect_gp=gpar(col="white", lwd=1),
                           heatmap_legend_param=list(labels_gp=gpar(fontsize=axis_title.fontsize+6),
                                                     title_gp=gpar(fontsize=axis_title.fontsize+6, 
                                                                   fontface="bold"))) 
    
    # Add heatmap to list
    ht_list.kl <- append(ht_list.kl, 
                         list(grid.grabExpr(draw(clusters.klPlot,
                                                 heatmap_legend_list=list(
                                                   Legend(labels=names(celltype.col), 
                                                          title="Celltypes",
                                                          legend_gp=gpar(fill=celltype.col),
                                                          labels_gp=gpar(fontsize=axis_title.fontsize+6),
                                                          title_gp=gpar(fontsize=axis_title.fontsize+6, 
                                                                        fontface="bold")))))))
  }
  
  print(plot_grid(plotlist=ht_list.kl, nrow=1))
  cat("\n\n")
}
k = 1

k = 3

Cluster vs mean expression (arcsinh)

Next, we assess clustering by looking at the median marker expression of each protein in each cluster and then compare it to the celltypes of each cluster. Ideally, each cluster has a median protein expression pattern that fits its main celltype.

for (i in unique(all.clusters$k)){
  cat("##### k =", i, "\n")
  
  # Start new plot 
  plot.new()
  # New empty heatmap list
  ht_list.exprs <- NULL
  for (d in unique(all.clusters$dataset)){
    # Extract arcsinh transformed counts for specified dataset d for ground truth 
    # and simulated data
    cluster_gt.exprs <- assay(X.lung[grepl(i, names(X.lung)) &
                                       grepl(d, names(X.lung)) &
                                       grepl("ground_truth", names(X.lung))][[1]], "exprs") %>%
      as.data.frame() %>%
      mutate(protein=rownames(.)) %>%
      melt() %>%
      dplyr::rename(cell=variable, expr=value)
    cluster_gt.exprs <- all.clusters %>% 
      dplyr::filter(k==i & dataset==d) %>%
      merge(cluster_gt.exprs) %>%
      dplyr::select(-c("k", "max_ari", "dataset", "celltype"))
    cluster_gt.exprs <- cluster_gt.exprs %>%
      mutate(ground_truth=paste("gt", ground_truth)) %>%
      group_by(ground_truth, protein) %>%
      summarise(mean_expr=mean(expr)) %>%
      pivot_wider(names_from=ground_truth, values_from=mean_expr) %>%
      column_to_rownames("protein") %>%
      as.matrix
    
    cluster_sim.exprs <- assay(X.lung[grepl(i, names(X.lung)) &
                                       grepl(d, names(X.lung)) &
                                       grepl("simulated", names(X.lung))][[1]], "exprs") %>%
      as.data.frame() %>%
      mutate(protein=rownames(.)) %>%
      melt() %>%
      dplyr::rename(cell=variable, expr=value)
    cluster_sim.exprs <- all.clusters %>% 
      dplyr::filter(k==i & dataset==d) %>%
      merge(cluster_sim.exprs) %>%
      dplyr::select(-c("k", "max_ari", "dataset", "celltype"))
    cluster_sim.exprs <- cluster_sim.exprs %>%
      mutate(simulated=paste("sim", simulated)) %>%
      group_by(simulated, protein) %>%
      summarise(mean_expr=mean(expr)) %>%
      pivot_wider(names_from=simulated, values_from=mean_expr) %>%
      column_to_rownames("protein") %>%
      as.matrix
    
    # Get barplot annotations of celltypes
    anno.list <- get_anno_clusters(all.clusters %>% 
                                     dplyr::filter(k==i & dataset==d),
                                   celltype.col=celltype.col, direction="normal", 
                                   which_sim="column")
    col_anno <- anno.list[[1]]
    row_anno <- anno.list[[2]]
    
    # Create heatmap
    cluster.exprsPlotGt <- Heatmap(cluster_gt.exprs,
                                 show_row_dend=FALSE,
                                 show_column_dend=FALSE,
                                 top_annotation=col_anno,
                                 col=magma(100),
                                 column_title="Ground truth",
                                 row_names_gp=gpar(fontsize=axis_title.fontsize+6),
                                 column_names_gp=gpar(fontsize=axis_title.fontsize+4),
                                 column_title_gp=gpar(fontsize=title.fontsize+4,
                                                      fontface="bold"),
                                 rect_gp=gpar(col="white", lwd=1),
                                 heatmap_legend_param=list(title="Ground truth",
                                                           labels_gp=gpar(fontsize=axis_title.fontsize+4),
                                                           title_gp=gpar(fontsize=axis_title.fontsize+4, 
                                                                         fontface="bold")))
    cluster.exprsPlotSim <- Heatmap(cluster_sim.exprs,
                                 show_row_dend=FALSE,
                                 show_column_dend=FALSE,
                                 top_annotation=row_anno,
                                 col=magma(100),
                                 column_title="Simulated",
                                 row_names_gp=gpar(fontsize=axis_title.fontsize+4),
                                 column_names_gp=gpar(fontsize=axis_title.fontsize+4),
                                 column_title_gp=gpar(fontsize=title.fontsize+4,
                                                      fontface="bold"),
                                 rect_gp=gpar(col="white", lwd=1),
                                 heatmap_legend_param=list(title="Simulated",
                                                           labels_gp=gpar(fontsize=axis_title.fontsize+4),
                                                           title_gp=gpar(fontsize=axis_title.fontsize+4, 
                                                                         fontface="bold")))
    
    # Add heatmap to list
    ht_list.exprs <- append(ht_list.exprs, 
                            list(grid.grabExpr(draw(cluster.exprsPlotGt + cluster.exprsPlotSim,
                                                    heatmap_legend_list=list(
                                                Legend(labels=names(celltype.col), 
                                                       title="Celltypes",
                                                       legend_gp=gpar(fill=celltype.col),
                                                       labels_gp=gpar(fontsize=axis_title.fontsize+4),
                                                       title_gp=gpar(fontsize=axis_title.fontsize+4, 
                                                                     fontface="bold"))),
                                                column_title=d,
                                                column_title_gp=gpar(fontsize=title.fontsize+4,
                                                                     fontface="bold")))))
  }
  
  print(plot_grid(plotlist=ht_list.exprs, ncol=1))
  cat("\n\n")
}
k = 1

k = 3

Adjusted random index (ARI) between clusters (arcsinh)

Finally, we compute the adjusted random index (ARI) between the clustering of the ground truth and simulated data to assess the overlap.

# Add position for labels to dataframe for easier annotation in ggplot
all.clusters <- all.clusters %>%
  ungroup() %>%
  mutate(point.x=ifelse(k=="3", -0.25, 0.25) + as.numeric(as.factor(dataset)))

# Create barplot for ARI (cluster overlap)
ari.plot <- ggplot(all.clusters %>%
                    group_by(dataset, k) %>%
                    summarize(ARI=ARI(simulated, ground_truth),
                              point.x=point.x,
                              max_ari=max_ari)) +
  geom_bar(aes(x=dataset, y=ARI, fill=k), position="dodge", stat="identity") +
  geom_point(aes(x=point.x, y=max_ari)) +
  scale_fill_npg() +
  coord_flip() +
  ylim(0, 1)

print(ari.plot)

Cluster overlap (arcsinh)

Since the ARI seems to be pretty bad in all cases, we have a look at another metric regarding cluster overlap between ground truth and simulated clusters. For that, we find the closest cluster in the simulated data for each ground truth cluster and compute the Jaccard index of the two.

# Calculate celltype probabilites per cluster
cluster.jaccard <- all.clusters %>%
  group_by(k, dataset, ground_truth, simulated) %>%
  summarise(count=n()) %>%
  group_by(k, dataset, ground_truth) %>%
  dplyr::filter(count==max(count)) %>%
  dplyr::select(-count) %>%
  ungroup()

# Calculate jaccard index between clusters using celltype probabilities
cluster.jaccard$jaccard <- lapply(1:nrow(cluster.jaccard), function(i){
    df.temp <- all.clusters %>%
      dplyr::filter(k==cluster.jaccard$k[i] & 
               dataset==cluster.jaccard$dataset[i] &
               (ground_truth==cluster.jaccard$ground_truth[i] |
                  simulated==cluster.jaccard$simulated[i])) %>%
      mutate(ground_truth=ifelse(ground_truth==cluster.jaccard$ground_truth[i], 1, 0),
             simulated=ifelse(simulated==cluster.jaccard$simulated[i], 1, 0))

    jaccard(df.temp %>%
              pull(ground_truth), 
            df.temp %>%
              pull(simulated))

  }) %>% unlist()
# Plot jaccard index distribution per dataset and k
for (i in unique(cluster.jaccard$k)) {
  cat("##### k =", i, "\n")
  
  # Create empty plot list to append all dataset jaccard distribution plots to
  plot.list <- list()
  for (d in unique(cluster.jaccard$dataset)){
    # Subset to dataset and k of interest
    temp.data <- cluster.jaccard %>%
      dplyr::filter(k==i & dataset==d)
    # Create plot
    cluster.overlapPlot <- ggplot(temp.data, 
                                  aes(x=jaccard)) +
      geom_density(fill=pal_npg("nrc")("1")) +
      geom_vline(aes(xintercept=mean(jaccard)),
                     linetype="dashed") +
      geom_text(aes(x=mean(jaccard), label=round(mean(jaccard), 2)), 
             y=Inf, vjust=2, hjust=-2) +
      xlim(0, 1)
    
    plot.list <- append(plot.list, list(cluster.overlapPlot))
  }
  
  # Print plot
  print(plot_grid(plotlist=plot.list, nrow=1,
                  labels=unique(cluster.jaccard$dataset),
                  label_size=title.fontsize, hjust=c(-2, -1.5), vjust=1))
  cat("\n\n")
}
k = 1

k = 3