Quantify gene imputation accuracy
Gene Expression Evaluation
The module also provides gene expression evaluation metrics with rasterization support for both 2D and 3D data.
Gene Expression Metrics
Masked MSE/MAE: Mean Squared/Absolute Error on regions with expression
Cosine Similarity: Mean cosine similarity (masked and unmasked)
Spearman Correlation: Mean Spearman correlation (masked and unmasked)
Pearson Correlation: Mean Pearson correlation (masked and unmasked)
IoU Mask: Intersection over Union of expression masks
import unist
import scanpy as sc
2D
## read in data
adata_path = '/Users/shuilan/Library/CloudStorage/OneDrive-InsideMDAnderson/SUICA/data/preprocessed_data/slice_440.h5ad'
gt = sc.read(adata_path)
gt
AnnData object with n_obs × n_vars = 8382 × 17649
obs: 'area', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_counts'
obsm: 'X_pca', 'bbox', 'spatial'
This result was obtained from Gene Imputation Module:
https://unist-tutorial.readthedocs.io/en/latest/gene_imputation.html
res = sc.read('/Users/shuilan/Library/CloudStorage/OneDrive-InsideMDAnderson/SUICA/data/preprocessed_data/reconstructed-original.h5ad')
res
AnnData object with n_obs × n_vars = 16764 × 1
obsm: 'fitted_embd', 'reconstructed_raw', 'spatial', 'spatial_normalized'
### put the gene expression data into res.X
import anndata as ad
from scipy import sparse
res_new = ad.AnnData(
X=sparse.csr_matrix(res.obsm["reconstructed_raw"]),
obs=res.obs.copy(),
obsm=res.obsm.copy()
)
res_new.var_names = gt.var_names.copy()
res_new
AnnData object with n_obs × n_vars = 16764 × 17649
obsm: 'fitted_embd', 'reconstructed_raw', 'spatial', 'spatial_normalized'
from unist.metrics import evaluate_gene_expression_2d
10 micronmeters resolution
All genes
metrics_2d = evaluate_gene_expression_2d(
gt, res_new,
bin_size=20.0, # corresponding to 10 micronmeters resolution
true_coords_key="spatial",
pred_coords_key="spatial"
)
print(f"MSE (masked): {metrics_2d['MSE_masked']:.4f}")
print(f"Pearson (masked): {metrics_2d['Pearson_masked']:.4f}")
MSE (masked): 3.8159
Pearson (masked): 0.2770
metrics_2d
{'bins': 33741,
'genes': 17649,
'bin_size': 20.0,
'IoU_mask': 0.3986876827615719,
'MSE_masked': 3.8158515045623447,
'MAE_masked': 1.234025159761077,
'Cosine_unmasked': 0.19846487198672683,
'Cosine_masked': 0.6738546518423421,
'Spearman_unmasked': 0.1768471134755618,
'Spearman_masked': 0.23622424796329358,
'Pearson_unmasked': 0.18163100396300597,
'Pearson_masked': 0.27697253250828124}
Specific genes
from unist.metrics import evaluate_specific_genes_2d
result = evaluate_specific_genes_2d(
gt,
res_new,
gene_names=["Myl2", "Myl7", "Tnnt2"], # or just "Myl2"
bin_size=20.0,
true_coords_key="spatial",
pred_coords_key="spatial",
)
# overall metrics(for all designated genes)
overall = result["overall"]
print(overall["MAE_masked"], overall["Cosine_masked"], overall["Spearman_masked"])
# per_gene
if "per_gene" in result:
for gene, metrics in result["per_gene"].items():
print(gene, metrics["MAE_masked"])
1.7261713809819303 0.8778019861304599 -0.10784411082508727
Myl2 1.3379105355218053
Myl7 2.304951690893242
Tnnt2 0.9730411383810393
3D
This step is computationally intensive; we recommend running it on a machine with sufficient memory.
Here I test on a subset of the volume which contains 4 slices and 1 gene.
## read in OpenST data
gt_3D = sc.read('/Users/shuilan/Library/CloudStorage/OneDrive-InsideMDAnderson/3D spatial transcriptomics/data/3D/a_chopped.h5ad')
gt_3D
AnnData object with n_obs × n_vars = 624829 × 28943
obs: 'cell_ID_mask', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_reads', 'reads_per_counts', 'n_joined', 'exact_entropy', 'theoretical_entropy', 'exact_compression', 'theoretical_compression', 'n_counts', 'annotation', 'annotation_key', 'n_section'
var: 'n_cells_by_counts', 'mean_counts', 'total_counts', 'pct_dropout_by_counts'
uns: 'log1p'
obsm: 'spatial', 'spatial_3d_aligned', 'spatial_3d_aligned_rot-12'
layers: 'raw'
import numpy as np
np.unique(gt_3D.obsm['spatial_3d_aligned_rot-12'][:, 2])
array([ 57.97101449, 86.95652174, 115.94202899, 144.92753623,
173.91304348, 202.89855072, 260.86956522, 318.84057971,
492.75362319, 521.73913043, 550.72463768, 666.66666667,
695.65217391, 724.63768116, 753.62318841, 811.5942029 ,
956.52173913, 985.50724638, 1043.47826087])
coords = gt_3D.obsm["spatial_3d_aligned_rot-12"]
z_vals = coords[:, 2]
target_slices = np.sort(np.unique(z_vals))[:4]
mask = np.isin(z_vals, target_slices)
gt_3D_4slices = gt_3D[mask].copy()
gt_3D_4slices
AnnData object with n_obs × n_vars = 129203 × 28943
obs: 'cell_ID_mask', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_reads', 'reads_per_counts', 'n_joined', 'exact_entropy', 'theoretical_entropy', 'exact_compression', 'theoretical_compression', 'n_counts', 'annotation', 'annotation_key', 'n_section'
var: 'n_cells_by_counts', 'mean_counts', 'total_counts', 'pct_dropout_by_counts'
uns: 'log1p'
obsm: 'spatial', 'spatial_3d_aligned', 'spatial_3d_aligned_rot-12'
layers: 'raw'
gt_3D_4slices = gt_3D_4slices[:, ["LYZ"]].copy()
gt_3D_4slices
AnnData object with n_obs × n_vars = 129203 × 1
obs: 'cell_ID_mask', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_reads', 'reads_per_counts', 'n_joined', 'exact_entropy', 'theoretical_entropy', 'exact_compression', 'theoretical_compression', 'n_counts', 'annotation', 'annotation_key', 'n_section'
var: 'n_cells_by_counts', 'mean_counts', 'total_counts', 'pct_dropout_by_counts'
uns: 'log1p'
obsm: 'spatial', 'spatial_3d_aligned', 'spatial_3d_aligned_rot-12'
layers: 'raw'
## This is the reconstructed 3D data with one gene (LYZ)
res_3d = sc.read('/Users/shuilan/Library/CloudStorage/OneDrive-InsideMDAnderson/3D spatial transcriptomics/data/gene_expression/LYZ_3d-reconstructed-custom-3d-original-coords.h5ad')
res_3d
AnnData object with n_obs × n_vars = 831406 × 1
obsm: 'fitted_embd', 'reconstructed_raw', 'spatial', 'spatial_normalized'
np.unique(res_3d.obsm['spatial'][:, 2]) ## densified z-axis
array([ 57.97101449, 87.83488112, 117.69870859, 147.56257523,
177.42644186, 207.29030849, 237.15415554, 267.01800259,
296.88186922, 326.74571627, 356.6095829 , 386.47342995,
416.33728679, 446.20114363, 476.06499558, 505.92885242,
535.79271048, 565.6565661 , 595.52042294, 625.38427978,
655.24813173, 685.11198857, 714.97584541, 744.83969246,
774.70355909, 804.56740614, 834.43127277, 864.29511982,
894.15896688, 924.02283351, 953.88670014, 983.75056677,
1013.61439424, 1043.47826087])
coords = res_3d.obsm["spatial"]
z_vals = coords[:, 2]
target_slices = np.sort(np.unique(z_vals))[:4]
mask = np.isin(z_vals, target_slices)
res_3d_4slices = res_3d[mask].copy()
res_3d_4slices
AnnData object with n_obs × n_vars = 126919 × 1
obsm: 'fitted_embd', 'reconstructed_raw', 'spatial', 'spatial_normalized'
from scipy import sparse
res_3d_4slices.X = sparse.csr_matrix(res_3d_4slices.obsm["reconstructed_raw"])
res_3d_4slices.var_names = ["LYZ"]
from unist.metrics import evaluate_specific_genes_3d
result = evaluate_specific_genes_3d(
gt_3D_4slices,
res_3d_4slices,
gene_names=["LYZ"],
bin_size=60.0, # corresponding to 20 micronmeters resolution to reduce computational cost
true_coords_key="spatial_3d_aligned_rot-12",
pred_coords_key="spatial",
)
overall = result["overall"]
print(f"MSE (masked): {overall['MSE_masked']:.4f}")
print(f"MAE (masked): {overall['MAE_masked']:.4f}")
MSE (masked): 0.1200
MAE (masked): 0.2656