knitr::opts_chunk$set(echo=TRUE)
knitr::opts_chunk$set(warning=FALSE, message=FALSE)
# Load libraries
library(RColorBrewer)
# Set general input paths to all analysis files
analysis.path <- "/mnt/bb_dqbm_volume/analysis/Immucan_lung/training/full"
## Input data
data.path <- "/mnt/bb_dqbm_volume/data/Immucan_lung"
# Set path to masks
masks.path <- file.path(data.path, "masks")
# Set path to pre-trained random forest classifier
rf.path <- file.path(data.path, "rffit_v7_all_markers.rds")
# Import helper fncs
source("../analysis/helpers/helper_fnc.R")
## Read results
# Read in all results for lung data into one dataframe
results.files <- list.files(analysis.path, 'simulation_results.txt',
full.names=TRUE, recursive=TRUE)
results.files <- results.files[grepl("normalized", results.files)]
results.df <- lapply(results.files, read_results, type="res", voi="norm") %>%
bind_rows() %>%
mutate(norm=ifelse(grepl("combi", norm), "combi", norm))
## Read X_test and X_simulated
# Specify X_test and X_simulated files to be read in
X.files <- list.files(analysis.path, "X_", full.names=TRUE, recursive=TRUE)
X.files <- X.files[grepl("normalized", X.files)]
# Read in all sce experiments from saved anndata and save additional info in metadata
# (e.g. which dataset is used in training and testing, if data was normalized before
# training and if it is the ground truth or simulated X)
X.list <- lapply(X.files, read_results, type="x", voi="norm")
X.list <- lapply(X.list, function(sce.temp){
if (grepl("combi", metadata(sce.temp)$norm)){
metadata(sce.temp)$norm <- "combi"
}
assay(sce.temp, "exprs") <- asinh(counts(sce.temp)/1)
sce.temp
})
# Read in masks for lung data
masks <- loadImages(masks.path, as.is=TRUE)
mcols(masks) <- DataFrame(sample_id=names(masks))
## Read random forest classifier
# Read RF classifier for the lung dataset
rffit.lung <- readRDS(rf.path)
For each different measurement used to test training results, a barplot is shown comparing CISI results using the original normalization across cells (used in training, simulating and to what it is compared to) and not using it at all.
# For each different measurement of training results, plot barplot comparing
# normalized und unnormalized results
# Melt dataframe for plotting
data_temp <- results.df %>%
dplyr::select(-c("dataset", "training", "datasize", "version", "Best crossvalidation fold")) %>%
pivot_longer(!c("norm", "simulation"), names_to="measure", values_to="value")
# Create barplot
results.barplot <- plot_cisi_results(data_temp, "measure", "value", "norm")
print(results.barplot)
The following plots show scatterplots of different proteins of interest where cells are coloured according to celltypes of interest for Immucan lung dataset once trained on normalized and once trained on unnormalized datasets.
names(X.list) <- lapply(X.list, function(x){metadata(x)$ground_truth})
X.list <- append(X.list[-c(1:2)], X.list[1:2])
# List of combinations of proteins of interest
poi <- list(c("CD20", "CD3", "B"), c("VISTA", "CD3", ""))
for (p_vec in poi) {
cat("###", p_vec[1], "vs", p_vec[2], "\n")
# Plot arcsinh transformed counts of proteins of interest of decomposed data trained
# with and without normalization
X.spillover <- plot_grid(plot_exprs(X.list[1:2], p_vec[3], p_vec[1], p_vec[2]),
plot_exprs(X.list[3:4], p_vec[3], p_vec[1], p_vec[2]),
plot_exprs(X.list[5:6], p_vec[3], p_vec[1], p_vec[2]),
ncol=1, labels=unique(unlist(lapply(X.list, function(sce){
metadata(sce)$norm}))),
label_size=title.fontsize, hjust=c(-4.5, -3, -7.5),
vjust=0.9, rel_widths = c(1, 2)) #, scale=0.9)
print(X.spillover)
cat("\n\n")
}
Next, we use the gating labels of a pre-trained random forest classifier for the Immucan lung dataset to newly train a random forest classifier on our decomposed arcsinh transformed data and predict all celltypes.
We then compare these results to the ‘ground truth’ celltypes (celltypes predicted from ground truth data) by calculating the precision, sensitivity and specififity.
## Fix colours for plotting
celltype.col <- c(brewer.pal(12, "Paired"),
brewer.pal(8, "Pastel2")[-c(3,5,8)],
brewer.pal(12, "Set3")[-c(2,3,8,9,11,12)])
celltype.col <- c(celltype.col[1:length(unique(colData(X.list[[1]])$celltype))],
"black")
names(celltype.col) <- c(levels(colData(X.list[[1]])$celltype), NA)
# Predict cell type using pretrained classifier on decomposed results and
# compute confusion matrix using celltypes of true data
# Extract decomposted datasets
lungSim.list <- keep(X.list, function(x){
metadata(x)$ground_truth=="simulated"})
# Train RF classifier
# rffit.sim <- lapply(lungSim.list,
# train_rf, rffit.original=rffit.lung)
# saveRDS(rffit.sim, file.path(analysis.path, "rffit_sim_data_norm.rds"))
rffit.sim <- readRDS(file.path(analysis.path, "rffit_sim_data_norm.rds"))
# Calculate confusion matrix
lungSim.cm <- lapply(as.list(1:length(lungSim.list)), function(i){
compute_confusionMatrix(lungSim.list[[i]], rffit.sim[[i]])})
# Plot prediction results/confusion matrix as shown in IMCDataAnalysis script
i <- 1
for (cm in lungSim.cm){
cat("###", metadata(lungSim.list[[i]])$norm, "\n")
pred.plot <- data.frame(cm$byClass) %>%
mutate(class=sub("Class: ", "", rownames(cm$byClass))) %>%
ggplot() +
geom_point(aes(1 - Specificity, Sensitivity,
size=Detection.Rate,
fill=class),
shape=21) +
scale_fill_manual(values=celltype.col) +
theme_cowplot(title.fontsize) +
ylab("Sensitivity (TPR)") +
xlab("1 - Specificity (FPR)")
i <- i+1
print(pred.plot)
cat("\n\n")
}
Next, we have a look at which celltypes the RF predicts for wrongly identified cells using the classification probabilities per celltype class.
# Predict celltype probabilities
lungSim.prob <- lapply(as.list(1:length(lungSim.list)), function(i){
compute_celltypeProb(lungSim.list[[i]], rffit.sim[[i]])})
# Plot prediction results/confusion matrix as shown in IMCDataAnalysis script
i <- 1
for (p in lungSim.prob){
cat("###", metadata(lungSim.list[[i]])$norm, "\n")
pred_prob.plot <- p %>%
pivot_longer(cols=B:Tumor_Ki67) %>%
ggplot() +
geom_boxplot(aes(x=name, y=value, fill=name), outlier.size=0.5) +
facet_wrap(.~truth, ncol=1) +
scale_fill_manual(values=celltype.col) +
scale_x_discrete(guide=guide_axis(n.dodge=2)) +
theme(panel.background=element_blank(),
axis.text.x=element_text(angle=45, hjust=1)) +
theme_cowplot(title.fontsize) +
labs(x="probability", y="celltype", fill="celltype")
i <- i+1
print(pred_prob.plot)
cat("\n\n")
}
We also have a look at images coloured by celltypes identified using the ground truth and decomposed/simulated data.
lungSim.list <- lapply(as.list(1:length(lungSim.list)), function(i){
sce <- lungSim.list[[i]]
cell_class <- data.frame(cell_id=rownames(lungSim.prob[[i]]),
celltype_pred=factor(colnames(lungSim.prob[[i]])[max.col(
lungSim.prob[[i]] %>%
dplyr::select(-truth), ties.method="first")]))
cell_class[rowMax(as.matrix(lungSim.prob[[i]] %>%
dplyr::select(-truth))) < 0.4, "celltype_pred"] <- "uncertain"
# Store labels in SpatialExperiment onject
colData(sce) <- merge(colData(sce), cell_class, all.x=TRUE)
colData(sce)$celltype_NA <- colData(sce)$celltype
colData(sce)$celltype_NA[is.na(colData(sce)$celltype_pred)] <- NA
sce
})
# Specify image index of interest
im <- 1
# Print image im of ground truth celltypes vs decomposed celltypes
for (sce in lungSim.list){
cat("####", metadata(sce)$norm, "\n")
images <- plot_celltype_images(sce, masks, im, celltype.col)
print(images)
cat("\n\n")
}
# Specify image index of interest
im <- 10
# Print image im of ground truth celltypes vs decomposed celltypes
for (sce in lungSim.list){
cat("####", metadata(sce)$norm, "\n")
images.2 <- plot_celltype_images(sce, masks, im, celltype.col)
print(images.2)
cat("\n\n")
}
# Specify image index of interest
im <- 100
# Print image im of ground truth celltypes vs decomposed celltypes
for (sce in lungSim.list){
cat("####", metadata(sce)$norm, "\n")
images.3 <- plot_celltype_images(sce, masks, im, celltype.col)
print(images.3)
cat("\n\n")
}