Preprocess Tonsil th179

Initialize project

First, use the R package renv to create a new R environment, where all R packages are installed (for reproducibiltiy).

# Only needs to be called once, when project is started
#renv::init()

Then, load necessary libraries for downstream analysis.

library(imcRtools)
library(cytomapper)
library(tidyverse)
library(dittoSeq)
library(RColorBrewer)
library(testit)
library(viridis)
library(ggrepel)
library(EBImage)
library(scuttle)
library(mclust)
library(zellkonverter)
library(CATALYST)
library(BiocParallel)
library(tiff)

Inputs

First, we specify the input paths to the outputs of steinbock, including the paths to the TIFF images and masks created by deepcell and the output paths for all the results of this preprocessing script.

# Specify input paths
# steinbock.path <- "/mnt/bb_dqbm_volume/data/Tonsil_th152/steinbock"
# steinbock.path <- "/mnt/bb_dqbm_volume/data/20221108_TsH_LSS_cisiabmix2_179/steinbock"
steinbock.path <- "/mnt/bb_dqbm_volume/data/20221108_TsH_LSS_cisiabmix1_179/steinbock"
images.path <- paste0(steinbock.path, "/img")
masks.path <- paste0(steinbock.path, "/masks_deepcell")
sm.path <- "/mnt/bb_dqbm_volume/data/sm.rds"

# Specify output path
out.path <- "/mnt/bb_dqbm_volume/data/20221108_TsH_LSS_cisiabmix1_179/preprocessed_data"
# Create output directory if id doesn't exist
if (!dir.exists(out.path)) {dir.create(out.path, recursive=TRUE)}

## Specify if spillover correction should be applied
correct_spillover <- FALSE

Using imcRtools read in data generated by the steinbock pipeline.

spe <- read_steinbock(steinbock.path)
spe

## class: SpatialExperiment 
## dim: 14 87005 
## metadata(0):
## assays(1): counts
## rownames(14): SMA CD68 ... Ir193 Ki-67
## rowData names(5): channel name keep ilastik deepcell
## colnames: NULL
## colData names(8): sample_id ObjectNumber ... width_px height_px
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
## spatialCoords names(2) : Pos_X Pos_Y
## imgData names(1): sample_id

Preprocessing

Next, we perform preprocessing according to IMCPipeline tutorial. For that we add normalized counts (asinh/log normalized) to the single cell experiments and correct for channel spillover if parameter is set as shown in the IMCPipeline tutorial (for both images and single cell experiments).

Single Cell Experiment Preprocessing

# Add colnames to the spe experiment and add shortened ROI names
colnames(spe) <- paste0(spe$sample_id, "_", spe$ObjectNumber)
spe$ROI <- as.vector(str_extract_all(spe$sample_id, "00[1-5]", simplify=TRUE))

# Plot counts before normalization
dittoRidgePlot(spe, var="CD3", group.by="ROI", assay="counts") +
    ggtitle("CD3 - before transformation")

## Picking joint bandwidth of 0.0434

# Normalize counts
assay(spe, "exprs") <- asinh(counts(spe)/1)
# Test log transformation
assay(spe, "log_exprs") <- log(counts(spe) + 0.1)

# Plot counts after normalization
dittoRidgePlot(spe, var="CD3", group.by="ROI", assay="exprs") +
    ggtitle("CD3 - after transformation")

## Picking joint bandwidth of 0.036

# Plot counts after (log) normalization
dittoRidgePlot(spe, var="CD3", group.by="ROI", assay="log_exprs") +
    ggtitle("CD3 - after (log) transformation")

## Picking joint bandwidth of 0.0711

# Define channels of interest (so far all channels are kept)
# rowData(spe)$use_channel <- !grepl("DNA|Histone", rownames(spe))
rowData(spe)$use_channel <- rep(TRUE, length(rownames(spe)))

Next, if parameter correct_spillover is set to TRUE, we read in the spillover matrix and do spillover correction.

if (correct_spillover){
  # Read in spillover matrix 
  sm <- readRDS(sm.path)
  # Read in isotope_list from CATALYST needed for spillover correction
  isotope_list <- CATALYST::isotope_list
  isotope_list$Ar <- 80
  
  # Create column in rowData named channel name carrying channel names according to
  # CATALYST standards
  rowData(spe)$channel_name <- paste0(rowData(spe)$channel, "Di")
  
  # Adapt spillover matrix to only contain isotopes used in our experiment
  adapted_sm <- adaptSpillmat(sm, rowData(spe)$channel_name, 
                              isotope_list=isotope_list)
  # Add to our SPE
  metadata(spe)$spillover_matrix <- adapted_sm
  
  # Perform spillover correction for counts and asinh transformed counts
  spe <- compCytof(spe, adapted_sm, transform=FALSE, isotope_list=isotope_list, 
                   overwrite=FALSE, cofactor=1)
  spe <- compCytof(spe, adapted_sm, transform=TRUE, cofactor=1,
                   isotope_list=isotope_list, overwrite=FALSE)
}

Before we continue with the image preprocessing, we save the SpatialExperiment as a python Anndata object (.h5ad).

# Save RDS files
# saveRDS(spe, file.path(out.path, "spe.rds"))
# saveRDS(images, file.path(out.path, "images.rds"))
# saveRDS(masks, file.path(out.path, "masks.rds"))

if (correct_spillover){
  # Write SpatialExperiment to Anndata for use in python
  writeH5AD(SingleCellExperiment(list(counts=assay(spe, "counts"),
                                      exprs=assay(spe, "exprs"),
                                      log_exprs=assay(spe, "log_exprs"),
                                      compcounts=assay(spe, "compcounts"),
                                      compexprs=assay(spe, "compexprs")),
                                 colData=colData(spe),
                                 rowData=rowData(spe)), file=file.path(out.path, "spe.h5ad"))
} else {
  # Write SpatialExperiment to Anndata for use in python
  writeH5AD(SingleCellExperiment(list(counts=assay(spe, "counts"),
                                      exprs=assay(spe, "exprs"),
                                      log_exprs=assay(spe, "log_exprs")),
                                 colData=colData(spe),
                                 rowData=rowData(spe)), file=file.path(out.path, "composite_measurements.h5ad"))
}

## ℹ Using the 'counts' assay as the X matrix

Image preprocessing

Next, we load the images and masks generated by steinbock and do the image preprocessing by correcting for channel spillover if the corresponding parameter is set.

# Load TIFF images and masks
images <- loadImages(images.path)

## All files in the provided location will be read in.

masks <- loadImages(masks.path, as.is=TRUE)

## All files in the provided location will be read in.

# Add channel names to the images
channelNames(images) <- rownames(spe)
images

## CytoImageList containing 5 image(s)
## names(5): 20221108_TsH_LSS_cisiabmix1_179_001 20221108_TsH_LSS_cisiabmix1_179_002 20221108_TsH_LSS_cisiabmix1_179_003 20221108_TsH_LSS_cisiabmix1_179_004 20221108_TsH_LSS_cisiabmix1_179_005 
## Each image contains 14 channel(s)
## channelNames(14): SMA CD68 CD3 CD20 CC0_SMA_CD68_CD3 CC1_CD11c CD45RO CC2_CD68_CD20_CD45RO CC3_CD8a_Ki-67 CD8a CD11c Ir191 Ir193 Ki-67

# Check that images and masks have the same name and are in the same order
# for downstream analysis
assert("Check that images and masks have the same name and are in the same order.", 
       all.equal(names(images), names(masks)))

# Add metadata to the images
ROI <- str_extract_all(names(images), "00[1-5]", simplify=TRUE)
mcols(images) <- mcols(masks) <- DataFrame(sample_id=names(images),
                                           ROI=ROI)

Next, we compute spillover correction on images.

if (correct_spillover){
  # Change channel names to match CATALYST 
  channelNames(images) <- rowData(spe)$channel_name
  
  # Compute channel spillover on images
  images_comp <- compImage(images, adapted_sm, BPPARAM=MulticoreParam())
  # Rename channels to original names
  channelNames(images_comp) <- rownames(spe)
}

And we save the corrected images in the “comp_img” folder (again only if corresponding parameter is set).

if (correct_spillover){
  # Create output directory for images
  dir.create(file.path(out.path, "comp_img"))
  
  # Save spillover corrected images to above defined output directory
  lapply(names(images_comp), function(x){
    writeImage(as.array(images_comp[[x]])/(2^16 - 1), 
               paste0(out.path, "/comp_img/", x, ".tiff"),
               bits.per.sample=16)
  })
}

Quality Control (code not run in HTML)

The next part will go over some QC as shown in the IMCPipeline tutorial. This code will not be run in the HTML output script since it creates a lot of plots, which should be interactively looked at once before continuing with any further analysis.

Colour scheme

First, we define a colour scheme to be used in plots.

# Create list which contains colour schemes for ROI and sample_id (the same in
# our case)
color_vectors <- list()

# Create named colour schemes
ROI <- setNames(brewer.pal(length(unique(spe$ROI)), name="BrBG"), 
                unique(spe$ROI))
sample_id <- setNames(dittoColors(reps=1)[seq_along(unique(spe$sample_id))], 
                unique(spe$sample_id))

# Add colour schemes to empty color_vectors list
color_vectors$ROI <- ROI
color_vectors$sample_id <- sample_id

# Add colour scheme as metadata of our spe experiment
metadata(spe)$color_vectors <- color_vectors

Image QC

Next, we perform image- and channel-wise normalization and do some QC checks on the images.

# Set a seed for reproducibility
set.seed(20220118)

# Get image indexes (originally only some of the images are selected, but here we
# do select all of them )
# img_ids <- sample(seq_len(length(images)), 3)
img_ids <- seq_len(length(images))

# Normalize and clip images
cur_images <- images[img_ids]
cur_images <- normalize(cur_images, separateImages=TRUE)
cur_images <- normalize(cur_images, inputRange=c(0, 0.2))

# Overlay segmentation masks on images and displaying channels that were used 
# for segmentation (only for the first image here) for a QC check
plotPixels(cur_images,
           mask=masks,
           img_id="sample_id",
           missing_colour="white",
           colour_by=c("CD20", "Ki-67", "SMA", "191Ir"),
           colour=list(CD20=c("black", "red"),
                       "Ki-67"=c("black", "green"),
                       SMA=c("black", "cyan"),
                       "191Ir"=c("black", "blue")),
           image_title=NULL,
           legend=list(colour_by.title.cex=0.7,
                       colour_by.labels.cex=0.7))

# Sample 2000 cells for another QC check
cur_cells <- sample(seq_len(ncol(spe)), 2000)

# Visualize single-cell expression in form of a heatmap for another QC check
dittoHeatmap(spe[, cur_cells], genes=rownames(spe)[rowData(spe)$use_channel],
             assay="exprs", cluster_cols=TRUE, scale="none",
             heatmap.colors=viridis(100), 
             annot.by="ROI",
             annotation_colors=list(ROI=metadata(spe)$color_vectors$ROI)
             )

Next, we perform image level QC by looking at the signal-to-noise ratio (SNR) for individual channels and markers, the image area covered by cells and mean marker expression per ROI.

# Calculate signal-to-noise ratio using the otsu thresholding approach
cur_snr <- lapply(images, function(img){
    mat <- apply(img, 3, function(ch){
        # Otsu threshold
        thres <- otsu(ch, range=c(min(ch), max(ch)))
        # Signal-to-noise ratio
        snr <- mean(ch[ch > thres]) / mean(ch[ch <= thres])
        # Signal intensity
        ps <- mean(ch[ch > thres])
        
        return(c(snr=snr, ps=ps))
    })
    t(mat) %>% as.data.frame() %>% 
        mutate(marker=colnames(mat)) %>% 
        pivot_longer(cols=c(snr, ps))
})

cur_snr <- do.call(rbind, cur_snr)

# Plot signal-to-noise-ratio vs signal intensity
cur_snr %>% 
    group_by(marker, name) %>%
    summarize(mean=mean(value),
              ci=qnorm(0.975)*sd(value)/sqrt(n())) %>%
    pivot_wider(names_from=name, values_from=c(mean, ci)) %>%
    ggplot() +
#    geom_errorbar(aes(y=log2(mean_snr), xmin=log2(mean_ps - ci_ps), 
#                      xmax=log2(mean_ps + ci_ps))) +
#    geom_errorbar(aes(x=log2(mean_ps), ymin=log2(mean_snr - ci_snr), 
#                      ymax=log2(mean_snr + ci_snr))) +
    geom_point(aes(log2(mean_ps), log2(mean_snr))) +
    geom_label_repel(aes(log2(mean_ps), log2(mean_snr), label=marker), max.overlaps=Inf) +
    theme_minimal(base_size=15) + ylab("Signal-to-noise ratio") +
    xlab("Signal intensity")

# Plot image area covered by cells
colData(spe) %>%
    as.data.frame() %>%
    group_by(sample_id) %>%
    summarize(cell_area=sum(area),
           no_pixels=mean(width_px) * mean(height_px)) %>%
    mutate(covered_area=cell_area / no_pixels) %>%
    ggplot() +
        geom_point(aes(sample_id, covered_area)) + 
        theme_minimal(base_size=15) +
        ylim(c(0, 1)) + 
        theme(axis.text.x=element_text(angle=90, hjust=1, size=8)) +
        ylab("% covered area") + xlab("")

# Compute image/ROI mean and transform intensities
image_mean <- aggregateAcrossCells(spe, 
                                   ids=spe$sample_id, 
                                   statistics="mean",
                                   use.assay.type="counts")
assay(image_mean, "exprs") <- asinh(counts(image_mean))

# Plot mean marker expression per ROI
dittoHeatmap(image_mean, genes=rownames(spe)[rowData(spe)$use_channel],
             assay="exprs", cluster_cols=TRUE, scale="none",
             heatmap.colors=viridis(100), 
             annot.by=c("ROI"),
             annotation_colors=list(ROI=metadata(spe)$color_vectors$ROI),
             show_colnames=TRUE)

Singel Cell QC

Now, we perform single cell level QC by looking at the signal-to-noise ratio (SNR) for individual channels and markers, cell sizes across images/ROIs, image area covered by cells, check for potential staining differences and mean marker expression per ROI.

# Set seed for reproducibility
set.seed(220224)

# To assess the snr, a two component Gaussian mixture model for each marker 
# is derived to find cells with positive and negative expression (signal/noise)
mat <- apply(assay(spe, "exprs"), 1, function(x){
    cur_model <- Mclust(x, G=2)
    mean1 <- mean(x[cur_model$classification == 1])
    mean2 <- mean(x[cur_model$classification == 2])
    
    signal <- ifelse(mean1 > mean2, mean1, mean2)
    noise <- ifelse(mean1 > mean2, mean2, mean1)
    
    return(c(snr=signal/noise, ps=signal))
})
    
cur_snr <- t(mat) %>% as.data.frame() %>% 
        mutate(marker=colnames(mat))

# Plot signal-to-noise ratio vs signal intensity
cur_snr %>% ggplot() +
    geom_point(aes(log2(ps), log2(snr))) +
    geom_label_repel(aes(log2(ps), log2(snr), label=marker)) +
    theme_minimal(base_size=15) + ylab("Signal-to-noise ratio") +
    xlab("Signal intensity")

# Plot distribution of cell sizes across images
colData(spe) %>%
    as.data.frame() %>%
    group_by(sample_id) %>%
    ggplot() +
        geom_boxplot(aes(sample_id, area)) +
        theme_minimal(base_size=15) + 
        theme(axis.text.x=element_text(angle=90, hjust=1, size=8)) +
        ylab("Cell area") + xlab("")

# Print summary of cell areas
summary(spe$area)

# Remove very small cells
# sum(spe$area < 5)
# spe <- spe[, spe$area >= 5]

# Plot percentage of image area covered by cells
colData(spe) %>%
    as.data.frame() %>%
    group_by(sample_id) %>%
    summarize(cell_count=n(),
           no_pixels=mean(width_px) * mean(height_px)) %>%
    mutate(cells_per_mm2=cell_count/(no_pixels/1000000)) %>%
    ggplot() +
        geom_point(aes(sample_id, cells_per_mm2)) + 
        theme_minimal(base_size=15)  + 
        theme(axis.text.x=element_text(angle=90, hjust=1, size=8)) +
        ylab("Cells per mm2") + xlab("")

# Check for potential staining differences using ridgeline visualizations
multi_dittoPlot(spe, vars=rownames(spe)[rowData(spe)$use_channel],
               group.by="ROI", plots=c("ridgeplot"), 
               assay="exprs", 
               color.panel=metadata(spe)$color_vectors$ROI,
               ncol=5)