Quantify gene imputation accuracy

Gene Expression Evaluation

The module also provides gene expression evaluation metrics with rasterization support for both 2D and 3D data.

Gene Expression Metrics

Masked MSE/MAE: Mean Squared/Absolute Error on regions with expression
Cosine Similarity: Mean cosine similarity (masked and unmasked)
Spearman Correlation: Mean Spearman correlation (masked and unmasked)
Pearson Correlation: Mean Pearson correlation (masked and unmasked)
IoU Mask: Intersection over Union of expression masks

import unist
import scanpy as sc

2D

## read in data

adata_path = '/Users/shuilan/Library/CloudStorage/OneDrive-InsideMDAnderson/SUICA/data/preprocessed_data/slice_440.h5ad'
gt = sc.read(adata_path)

gt

AnnData object with n_obs × n_vars = 8382 × 17649
    obs: 'area', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_counts'
    obsm: 'X_pca', 'bbox', 'spatial'

This result was obtained from Gene Imputation Module:

https://unist-tutorial.readthedocs.io/en/latest/gene_imputation.html

res = sc.read('/Users/shuilan/Library/CloudStorage/OneDrive-InsideMDAnderson/SUICA/data/preprocessed_data/reconstructed-original.h5ad')
res

AnnData object with n_obs × n_vars = 16764 × 1
    obsm: 'fitted_embd', 'reconstructed_raw', 'spatial', 'spatial_normalized'

### put the gene expression data into res.X

import anndata as ad
from scipy import sparse

res_new = ad.AnnData(
    X=sparse.csr_matrix(res.obsm["reconstructed_raw"]),
    obs=res.obs.copy(),
    obsm=res.obsm.copy()
)
res_new.var_names = gt.var_names.copy()

res_new

AnnData object with n_obs × n_vars = 16764 × 17649
    obsm: 'fitted_embd', 'reconstructed_raw', 'spatial', 'spatial_normalized'

from unist.metrics import evaluate_gene_expression_2d

10 micronmeters resolution

All genes

metrics_2d = evaluate_gene_expression_2d(
    gt, res_new,
    bin_size=20.0, # corresponding to 10 micronmeters resolution
    true_coords_key="spatial",
    pred_coords_key="spatial"
)

print(f"MSE (masked): {metrics_2d['MSE_masked']:.4f}")
print(f"Pearson (masked): {metrics_2d['Pearson_masked']:.4f}")

MSE (masked): 3.8159
Pearson (masked): 0.2770

metrics_2d

{'bins': 33741,
 'genes': 17649,
 'bin_size': 20.0,
 'IoU_mask': 0.3986876827615719,
 'MSE_masked': 3.8158515045623447,
 'MAE_masked': 1.234025159761077,
 'Cosine_unmasked': 0.19846487198672683,
 'Cosine_masked': 0.6738546518423421,
 'Spearman_unmasked': 0.1768471134755618,
 'Spearman_masked': 0.23622424796329358,
 'Pearson_unmasked': 0.18163100396300597,
 'Pearson_masked': 0.27697253250828124}

Specific genes

from unist.metrics import evaluate_specific_genes_2d

result = evaluate_specific_genes_2d(
    gt,
    res_new,
    gene_names=["Myl2", "Myl7", "Tnnt2"],   # or just "Myl2"
    bin_size=20.0,
    true_coords_key="spatial",
    pred_coords_key="spatial",
)

# overall metrics（for all designated genes）
overall = result["overall"]
print(overall["MAE_masked"], overall["Cosine_masked"], overall["Spearman_masked"])

# per_gene
if "per_gene" in result:
    for gene, metrics in result["per_gene"].items():
        print(gene, metrics["MAE_masked"])

1.7261713809819303 0.8778019861304599 -0.10784411082508727
Myl2 1.3379105355218053
Myl7 2.304951690893242
Tnnt2 0.9730411383810393

3D

This step is computationally intensive; we recommend running it on a machine with sufficient memory.

Here I test on a subset of the volume which contains 4 slices and 1 gene.

## read in OpenST data
gt_3D = sc.read('/Users/shuilan/Library/CloudStorage/OneDrive-InsideMDAnderson/3D spatial transcriptomics/data/3D/a_chopped.h5ad')
gt_3D

AnnData object with n_obs × n_vars = 624829 × 28943
    obs: 'cell_ID_mask', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_reads', 'reads_per_counts', 'n_joined', 'exact_entropy', 'theoretical_entropy', 'exact_compression', 'theoretical_compression', 'n_counts', 'annotation', 'annotation_key', 'n_section'
    var: 'n_cells_by_counts', 'mean_counts', 'total_counts', 'pct_dropout_by_counts'
    uns: 'log1p'
    obsm: 'spatial', 'spatial_3d_aligned', 'spatial_3d_aligned_rot-12'
    layers: 'raw'

import numpy as np

np.unique(gt_3D.obsm['spatial_3d_aligned_rot-12'][:, 2])

array([  57.97101449,   86.95652174,  115.94202899,  144.92753623,
        173.91304348,  202.89855072,  260.86956522,  318.84057971,
        492.75362319,  521.73913043,  550.72463768,  666.66666667,
        695.65217391,  724.63768116,  753.62318841,  811.5942029 ,
        956.52173913,  985.50724638, 1043.47826087])

coords = gt_3D.obsm["spatial_3d_aligned_rot-12"]
z_vals = coords[:, 2]
target_slices = np.sort(np.unique(z_vals))[:4]
mask = np.isin(z_vals, target_slices)

gt_3D_4slices = gt_3D[mask].copy()
gt_3D_4slices

AnnData object with n_obs × n_vars = 129203 × 28943
    obs: 'cell_ID_mask', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_reads', 'reads_per_counts', 'n_joined', 'exact_entropy', 'theoretical_entropy', 'exact_compression', 'theoretical_compression', 'n_counts', 'annotation', 'annotation_key', 'n_section'
    var: 'n_cells_by_counts', 'mean_counts', 'total_counts', 'pct_dropout_by_counts'
    uns: 'log1p'
    obsm: 'spatial', 'spatial_3d_aligned', 'spatial_3d_aligned_rot-12'
    layers: 'raw'

gt_3D_4slices = gt_3D_4slices[:, ["LYZ"]].copy()
gt_3D_4slices

AnnData object with n_obs × n_vars = 129203 × 1
    obs: 'cell_ID_mask', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_reads', 'reads_per_counts', 'n_joined', 'exact_entropy', 'theoretical_entropy', 'exact_compression', 'theoretical_compression', 'n_counts', 'annotation', 'annotation_key', 'n_section'
    var: 'n_cells_by_counts', 'mean_counts', 'total_counts', 'pct_dropout_by_counts'
    uns: 'log1p'
    obsm: 'spatial', 'spatial_3d_aligned', 'spatial_3d_aligned_rot-12'
    layers: 'raw'

## This is the reconstructed 3D data with one gene (LYZ)
res_3d = sc.read('/Users/shuilan/Library/CloudStorage/OneDrive-InsideMDAnderson/3D spatial transcriptomics/data/gene_expression/LYZ_3d-reconstructed-custom-3d-original-coords.h5ad')
res_3d

AnnData object with n_obs × n_vars = 831406 × 1
    obsm: 'fitted_embd', 'reconstructed_raw', 'spatial', 'spatial_normalized'

np.unique(res_3d.obsm['spatial'][:, 2]) ## densified z-axis

array([  57.97101449,   87.83488112,  117.69870859,  147.56257523,
        177.42644186,  207.29030849,  237.15415554,  267.01800259,
        296.88186922,  326.74571627,  356.6095829 ,  386.47342995,
        416.33728679,  446.20114363,  476.06499558,  505.92885242,
        535.79271048,  565.6565661 ,  595.52042294,  625.38427978,
        655.24813173,  685.11198857,  714.97584541,  744.83969246,
        774.70355909,  804.56740614,  834.43127277,  864.29511982,
        894.15896688,  924.02283351,  953.88670014,  983.75056677,
       1013.61439424, 1043.47826087])

coords = res_3d.obsm["spatial"]
z_vals = coords[:, 2]
target_slices = np.sort(np.unique(z_vals))[:4]
mask = np.isin(z_vals, target_slices)

res_3d_4slices = res_3d[mask].copy()
res_3d_4slices

AnnData object with n_obs × n_vars = 126919 × 1
    obsm: 'fitted_embd', 'reconstructed_raw', 'spatial', 'spatial_normalized'

from scipy import sparse
res_3d_4slices.X = sparse.csr_matrix(res_3d_4slices.obsm["reconstructed_raw"])

res_3d_4slices.var_names = ["LYZ"]

from unist.metrics import evaluate_specific_genes_3d

result = evaluate_specific_genes_3d(
    gt_3D_4slices,
    res_3d_4slices,
    gene_names=["LYZ"],
    bin_size=60.0, # corresponding to 20 micronmeters resolution to reduce computational cost
    true_coords_key="spatial_3d_aligned_rot-12",
    pred_coords_key="spatial",
)

overall = result["overall"]

print(f"MSE (masked): {overall['MSE_masked']:.4f}")
print(f"MAE (masked): {overall['MAE_masked']:.4f}")

MSE (masked): 0.1200
MAE (masked): 0.2656