Initialize project

First, use the R package renv to create a new R environment, where all R packages are installed (for reproducibiltiy).

# Only needs to be called once, when project is started
#renv::init()

Then, load necessary libraries for downstream analysis.

library(imcRtools)
library(cytomapper)
library(tidyverse)
library(dittoSeq)
library(RColorBrewer)
library(testit)
library(viridis)
library(ggrepel)
library(EBImage)
library(scuttle)
library(mclust)
library(zellkonverter)
library(CATALYST)
library(BiocParallel)
library(tiff)

Inputs

First, we specify the input paths to the outputs of steinbock, including the paths to the TIFF images and masks created by deepcell and the output paths for all the results of this preprocessing script.

# Specify input paths
steinbock.path <- "/mnt/bb_dqbm_volume/data/Tonsil_th152/steinbock"
images.path <- paste0(steinbock.path, "/img")
masks.path <- paste0(steinbock.path, "/masks_deepcell")
sm.path <- "/mnt/bb_dqbm_volume/data/sm.rds"

# Specify output path
out.path <- "/mnt/bb_dqbm_volume/data/Tonsil_th152/preprocessed_data"
# Create output directory if id doesn't exist
if (!dir.exists(out.path)) {dir.create(out.path, recursive=TRUE)}

Using imcRtools read in tonsil data generated by the steinbock pipeline.

spe <- read_steinbock(steinbock.path)
spe
## class: SpatialExperiment 
## dim: 43 162624 
## metadata(0):
## assays(1): counts
## rownames(43): MPO Histone H3 ... Vimentin CD15
## rowData names(7): channel name ... Tube.Number Metal
## colnames: NULL
## colData names(8): sample_id ObjectNumber ... width_px height_px
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
## spatialCoords names(2) : Pos_X Pos_Y
## imgData names(1): sample_id

Preprocessing

Next, we perform preprocessing according to IMCPipeline tutorial. For that we add normalized counts (asinh/log normalized) to the single cell experiments and correct for channel spillover as shown in the IMCPipeline tutorial (for both images and single cell experiments).

Single Cell experiment preprocessing

# Add colnames to the spe experiment and add shortened ROI names
colnames(spe) <- paste0(spe$sample_id, "_", spe$ObjectNumber)
spe$ROI <- as.vector(str_extract_all(spe$sample_id, "00[1-5]", simplify = TRUE))

# Plot counts before normalization
dittoRidgePlot(spe, var = "CD3", group.by = "ROI", assay = "counts") +
    ggtitle("CD3 - before transformation")
## Picking joint bandwidth of 0.118

# Normalize counts
assay(spe, "exprs") <- asinh(counts(spe)/1)
# Test log transformation
assay(spe, "log_exprs") <- log(counts(spe) + 0.1)

# Plot counts after normalization
dittoRidgePlot(spe, var = "CD3", group.by = "ROI", assay = "exprs") +
    ggtitle("CD3 - after transformation")
## Picking joint bandwidth of 0.0654

# Plot counts after (log) normalization
dittoRidgePlot(spe, var = "CD3", group.by = "ROI", assay = "log_exprs") +
    ggtitle("CD3 - after (log) transformation")
## Picking joint bandwidth of 0.0922

# Define channels of interest (so far all channels are kept)
# rowData(spe)$use_channel <- !grepl("DNA|Histone", rownames(spe))
rowData(spe)$use_channel <- rep(TRUE, length(rownames(spe)))

Next, we read in the spillover matrix and do spillover correction.

# Read in spillover matrix 
sm <- readRDS(sm.path)
# Read in isotope_list from CATALYST needed for spillover correction
isotope_list <- CATALYST::isotope_list
isotope_list$Ar <- 80

# Create column in rowData named channel name carrying channel names according to
# CATALYST standards
rowData(spe)$channel_name <- paste0(rowData(spe)$channel, "Di")

# Adapt spillover matrix to only contain isotopes used in our experiment
adapted_sm <- adaptSpillmat(sm, rowData(spe)$channel_name, 
                            isotope_list=isotope_list)
## Compensation is likely to be inaccurate.
## Spill values for the following interactions
## have not been estimated:
## La139Di -> Gd155Di
## Ir191Di -> Ir193Di
## Ir193Di -> Ir191Di, Bi209Di
# Add to our SPE
metadata(spe)$spillover_matrix <- adapted_sm

# Perform spillover correction for counts and asinh transformed counts
spe <- compCytof(spe, adapted_sm, transform=FALSE, isotope_list=isotope_list, 
                 overwrite=FALSE, cofactor=1)
spe <- compCytof(spe, adapted_sm, transform=TRUE, cofactor=1,
                 isotope_list=isotope_list, overwrite=FALSE)

Before we continue with the image preprocessing, we save the SpatialExperiment as a python Anndata object (.h5ad).

# Save RDS files
# saveRDS(spe, file.path(out.path, "spe.rds"))
# saveRDS(images, file.path(out.path, "images.rds"))
# saveRDS(masks, file.path(out.path, "masks.rds"))

# Write SpatialExperiment to Anndata for use in python
writeH5AD(SingleCellExperiment(list(counts=assay(spe, "counts"),
                                    exprs=assay(spe, "exprs"),
                                    log_exprs=assay(spe, "log_exprs"),
                                    compcounts=assay(spe, "compcounts"),
                                    compexprs=assay(spe, "compexprs")),
    colData=colData(spe),
    rowData=rowData(spe)), file=file.path(out.path, "spe.h5ad"))
## ℹ Using the 'counts' assay as the X matrix

Image preprocessing

Next, we load the images and masks generated by steinbock and do the image preprocessing by correcting for channel spillover.

# Load TIFF images and masks
images <- loadImages(images.path)
## All files in the provided location will be read in.
masks <- loadImages(masks.path, as.is = TRUE)
## All files in the provided location will be read in.
# Add channel names to the images
channelNames(images) <- rownames(spe)
images
## CytoImageList containing 5 image(s)
## names(5): 20220520_TsH_th152_cisi1_001 20220520_TsH_th152_cisi1_002 20220520_TsH_th152_cisi1_003 20220520_TsH_th152_cisi1_004 20220520_TsH_th152_cisi1_005 
## Each image contains 43 channel(s)
## channelNames(43): MPO Histone H3 SMA E-Cad/P-Cad CCR7 CD38 HLA-DR CD27 CD303 CD68 CD45RA CD20 CD11b CD56 GranzymeB CD3 DC-LAMP CD11c PD-1 GITR MMP9 CD14 PD-L1 TCF1/TCF7 CD45RO FOXP3 ICOS Ki-67 CD8a Tim-3 IRF4 VISTA CD1c CD4 CD31 CXCL12 CCL21 panCK CXCL13 Ir191 Ir193 Vimentin CD15
# Check that images and masks have the same name and are in the same order
# for downstream analysis
assert("Check that images and masks have the same name and are in the same order.", 
       all.equal(names(images), names(masks)))

# Add metadata to the images
ROI <- str_extract_all(names(images), "00[1-5]", simplify = TRUE)
mcols(images) <- mcols(masks) <- DataFrame(sample_id = names(images),
                                           ROI = ROI)

Next, we compute spillover correction on images.

# Change channel names to match CATALYST 
channelNames(images) <- rowData(spe)$channel_name

# Compute channel spillover on images
images_comp <- compImage(images, adapted_sm, BPPARAM=MulticoreParam())
# Rename channels to original names
channelNames(images_comp) <- rownames(spe)

And we save the corrected images in the “comp_img” folder.

## Warning in dir.create(file.path(out.path, "comp_img")): '/mnt/bb_dqbm_volume/
## data/Tonsil_th152/preprocessed_data/comp_img' already exists
## [[1]]
## [1] "/mnt/bb_dqbm_volume/data/Tonsil_th152/preprocessed_data/comp_img/20220520_TsH_th152_cisi1_001.tiff"
## 
## [[2]]
## [1] "/mnt/bb_dqbm_volume/data/Tonsil_th152/preprocessed_data/comp_img/20220520_TsH_th152_cisi1_002.tiff"
## 
## [[3]]
## [1] "/mnt/bb_dqbm_volume/data/Tonsil_th152/preprocessed_data/comp_img/20220520_TsH_th152_cisi1_003.tiff"
## 
## [[4]]
## [1] "/mnt/bb_dqbm_volume/data/Tonsil_th152/preprocessed_data/comp_img/20220520_TsH_th152_cisi1_004.tiff"
## 
## [[5]]
## [1] "/mnt/bb_dqbm_volume/data/Tonsil_th152/preprocessed_data/comp_img/20220520_TsH_th152_cisi1_005.tiff"

Quality Control (code not run in HTML)

The next part will go over some QC as shown in the IMCPipeline tutorial. This code will not be run in the HTML output script since it creates a lot of plots, which should be interactively looked at once before continuing with any further analysis.

Colour scheme

First, we define a colour scheme to be used in plots.

# Create list which contains colour schemes for ROI and sample_id (the same in
# our case)
color_vectors <- list()

# Create named colour schemes
ROI <- setNames(brewer.pal(length(unique(spe$ROI)), name = "BrBG"), 
                unique(spe$ROI))
sample_id <- setNames(dittoColors(reps = 1)[seq_along(unique(spe$sample_id))], 
                unique(spe$sample_id))

# Add colour schemes to empty color_vectors list
color_vectors$ROI <- ROI
color_vectors$sample_id <- sample_id

# Add colour scheme as metadata of our spe experiment
metadata(spe)$color_vectors <- color_vectors

Image QC

Next, we perform image- and channel-wise normalization and do some QC checks on the images.

# Set a seed for reproducibility
set.seed(20220118)

# Get image indexes (originally only some of the images are selected, but here we
# do select all of them )
# img_ids <- sample(seq_len(length(images)), 3)
img_ids <- seq_len(length(images))

# Normalize and clip images
cur_images <- images[img_ids]
cur_images <- normalize(cur_images, separateImages = TRUE)
cur_images <- normalize(cur_images, inputRange = c(0, 0.2))

# Overlay segmentation masks on images and displaying channels that were used 
# for segmentation (only for the first image here) for a QC check
plotPixels(cur_images[1],
           mask = masks[1],
           img_id = "sample_id",
           missing_colour = "white",
           colour_by = c("CD20", "CD3", "E-Cad/P-Cad", "Ir191"),
           colour = list(CD20 = c("black", "red"),
                         CD3 = c("black", "green"),
                         "E-Cad/P-Cad" = c("black", "cyan"),
                         Ir191 = c("black", "blue")),
           image_title = NULL,
           legend = list(colour_by.title.cex = 0.7,
                         colour_by.labels.cex = 0.7))

# Sample 2000 cells for another QC check
cur_cells <- sample(seq_len(ncol(spe)), 2000)

# Visualize single-cell expression in form of a heatmap for another QC check
dittoHeatmap(spe[, cur_cells], genes = rownames(spe)[rowData(spe)$use_channel],
             assay = "exprs", cluster_cols = TRUE, scale = "none",
             heatmap.colors = viridis(100), 
             annot.by = "ROI",
             annotation_colors = list(ROI = metadata(spe)$color_vectors$ROI)
             )

Next, we perform image level QC by looking at the signal-to-noise ratio (SNR) for individual channels and markers, the image area covered by cells and mean marker expression per ROI.

# Calculate signal-to-noise ratio using the otsu thresholding approach
cur_snr <- lapply(images, function(img){
    mat <- apply(img, 3, function(ch){
        # Otsu threshold
        thres <- otsu(ch, range = c(min(ch), max(ch)))
        # Signal-to-noise ratio
        snr <- mean(ch[ch > thres]) / mean(ch[ch <= thres])
        # Signal intensity
        ps <- mean(ch[ch > thres])
        
        return(c(snr = snr, ps = ps))
    })
    t(mat) %>% as.data.frame() %>% 
        mutate(marker = colnames(mat)) %>% 
        pivot_longer(cols = c(snr, ps))
})

cur_snr <- do.call(rbind, cur_snr)

# Plot signal-to-noise-ratio vs signal intensity
cur_snr %>% 
    group_by(marker, name) %>%
    summarize(mean = mean(value),
              ci = qnorm(0.975)*sd(value)/sqrt(n())) %>%
    pivot_wider(names_from = name, values_from = c(mean, ci)) %>%
    ggplot() +
#    geom_errorbar(aes(y = log2(mean_snr), xmin = log2(mean_ps - ci_ps), 
#                      xmax = log2(mean_ps + ci_ps))) +
#    geom_errorbar(aes(x = log2(mean_ps), ymin = log2(mean_snr - ci_snr), 
#                      ymax = log2(mean_snr + ci_snr))) +
    geom_point(aes(log2(mean_ps), log2(mean_snr))) +
    geom_label_repel(aes(log2(mean_ps), log2(mean_snr), label = marker), max.overlaps=Inf) +
    theme_minimal(base_size = 15) + ylab("Signal-to-noise ratio") +
    xlab("Signal intensity")
# Plot image area covered by cells
colData(spe) %>%
    as.data.frame() %>%
    group_by(sample_id) %>%
    summarize(cell_area = sum(area),
           no_pixels = mean(width_px) * mean(height_px)) %>%
    mutate(covered_area = cell_area / no_pixels) %>%
    ggplot() +
        geom_point(aes(sample_id, covered_area)) + 
        theme_minimal(base_size = 15) +
        ylim(c(0, 1)) + 
        theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8)) +
        ylab("% covered area") + xlab("")
# Compute image/ROI mean and transform intensities
image_mean <- aggregateAcrossCells(spe, 
                                   ids = spe$sample_id, 
                                   statistics="mean",
                                   use.assay.type = "counts")
assay(image_mean, "exprs") <- asinh(counts(image_mean))

# Plot mean marker expression per ROI
dittoHeatmap(image_mean, genes = rownames(spe)[rowData(spe)$use_channel],
             assay = "exprs", cluster_cols = TRUE, scale = "none",
             heatmap.colors = viridis(100), 
             annot.by = c("ROI"),
             annotation_colors = list(ROI = metadata(spe)$color_vectors$ROI),
             show_colnames = TRUE)

Singel Cell QC

Now, we perform single cell level QC by looking at the signal-to-noise ratio (SNR) for individual channels and markers, cell sizes across images/ROIs, image area covered by cells, check for potential staining differences and mean marker expression per ROI.

# Set seed for reproducibility
set.seed(220224)

# To assess the snr, a two component Gaussian mixture model for each marker 
# is derived to find cells with positive and negative expression (signal/noise)
mat <- apply(assay(spe, "exprs"), 1, function(x){
    cur_model <- Mclust(x, G = 2)
    mean1 <- mean(x[cur_model$classification == 1])
    mean2 <- mean(x[cur_model$classification == 2])
    
    signal <- ifelse(mean1 > mean2, mean1, mean2)
    noise <- ifelse(mean1 > mean2, mean2, mean1)
    
    return(c(snr = signal/noise, ps = signal))
})
    
cur_snr <- t(mat) %>% as.data.frame() %>% 
        mutate(marker = colnames(mat))

# Plot signal-to-noise ratio vs signal intensity
cur_snr %>% ggplot() +
    geom_point(aes(log2(ps), log2(snr))) +
    geom_label_repel(aes(log2(ps), log2(snr), label = marker)) +
    theme_minimal(base_size = 15) + ylab("Signal-to-noise ratio") +
    xlab("Signal intensity")
# Plot distribution of cell sizes across images
colData(spe) %>%
    as.data.frame() %>%
    group_by(sample_id) %>%
    ggplot() +
        geom_boxplot(aes(sample_id, area)) +
        theme_minimal(base_size = 15) + 
        theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8)) +
        ylab("Cell area") + xlab("")
# Print summary of cell areas
summary(spe$area)

# Remove very small cells
# sum(spe$area < 5)
# spe <- spe[, spe$area >= 5]
# Plot percentage of image area covered by cells
colData(spe) %>%
    as.data.frame() %>%
    group_by(sample_id) %>%
    summarize(cell_count = n(),
           no_pixels = mean(width_px) * mean(height_px)) %>%
    mutate(cells_per_mm2 = cell_count/(no_pixels/1000000)) %>%
    ggplot() +
        geom_point(aes(sample_id, cells_per_mm2)) + 
        theme_minimal(base_size = 15)  + 
        theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8)) +
        ylab("Cells per mm2") + xlab("")
# Check for potential staining differences using ridgeline visualizations
multi_dittoPlot(spe, vars = rownames(spe)[rowData(spe)$use_channel],
               group.by = "ROI", plots = c("ridgeplot"), 
               assay = "exprs", 
               color.panel = metadata(spe)$color_vectors$ROI)