skills/cellxgene-census/references/common_patterns.md
Use when exploring available data without loading expression matrices.
Pattern: Get unique cell types in a tissue
import cellxgene_census

# Metadata-only query: fetch just the cell_type column for primary brain cells.
BRAIN_PRIMARY = "tissue_general == 'brain' and is_primary_data == True"

with cellxgene_census.open_soma() as census:
    brain_obs = cellxgene_census.get_obs(
        census,
        "homo_sapiens",
        value_filter=BRAIN_PRIMARY,
        column_names=["cell_type"],
    )
    # Distinct cell type labels among the matching cells
    unique_cell_types = brain_obs["cell_type"].unique()
    print(f"Found {len(unique_cell_types)} unique cell types")
Pattern: Count cells by condition
# Fetch only the two grouping columns for every primary, non-normal cell.
disease_filter = "disease != 'normal' and is_primary_data == True"
cell_metadata = cellxgene_census.get_obs(
    census,
    "homo_sapiens",
    value_filter=disease_filter,
    column_names=["disease", "tissue_general"],
)

# Number of cells for each (disease, tissue_general) combination
grouped = cell_metadata.groupby(["disease", "tissue_general"])
counts = grouped.size()
Pattern: Explore dataset information
# Access datasets table
datasets = census["census_info"]["datasets"].read().concat().to_pandas()

# Filter for specific criteria.
# The datasets table describes datasets and collections (dataset_title,
# collection_name, dataset_id, ...); it has no per-cell fields such as
# "disease", so search the descriptive titles instead. To find datasets by a
# cell-level attribute, query obs for that attribute and collect the distinct
# dataset_id values.
mentions_covid = (
    datasets["dataset_title"].str.contains("COVID", case=False, na=False)
    | datasets["collection_name"].str.contains("COVID", case=False, na=False)
)
covid_datasets = datasets[mentions_covid]
Use get_anndata() when results fit in memory (typically < 100k cells).
Pattern: Tissue-specific cell type query
# Primary B cells from lung, with a reduced set of obs columns
b_cell_lung_filter = (
    "cell_type == 'B cell' "
    "and tissue_general == 'lung' "
    "and is_primary_data == True"
)
obs_columns = ["assay", "disease", "sex", "donor_id"]

adata = cellxgene_census.get_anndata(
    census=census,
    organism="Homo sapiens",
    obs_value_filter=b_cell_lung_filter,
    obs_column_names=obs_columns,
)
Pattern: Gene-specific query with multiple genes
marker_genes = ["CD4", "CD8A", "CD19", "FOXP3"]

# Step 1: look up the feature IDs that correspond to the gene symbols
symbol_filter = f"feature_name in {marker_genes}"
gene_metadata = cellxgene_census.get_var(
    census,
    "homo_sapiens",
    value_filter=symbol_filter,
    column_names=["feature_id", "feature_name"],
)
gene_ids = gene_metadata["feature_id"].to_list()

# Step 2: restrict the expression query to those features
feature_filter = f"feature_id in {gene_ids}"
adata = cellxgene_census.get_anndata(
    census=census,
    organism="Homo sapiens",
    var_value_filter=feature_filter,
    obs_value_filter="cell_type == 'T cell' and is_primary_data == True",
)
Pattern: Multi-tissue query
# Primary cells from any of three tissues, selected with an `in` list
multi_tissue_filter = (
    "tissue_general in ['lung', 'liver', 'kidney'] "
    "and is_primary_data == True"
)

adata = cellxgene_census.get_anndata(
    census=census,
    organism="Homo sapiens",
    obs_value_filter=multi_tissue_filter,
    obs_column_names=["cell_type", "tissue_general", "dataset_id"],
)
Pattern: Disease-specific query
# Primary lung cells annotated with COVID-19
covid_lung_filter = (
    "disease == 'COVID-19' "
    "and tissue_general == 'lung' "
    "and is_primary_data == True"
)

adata = cellxgene_census.get_anndata(
    census=census,
    organism="Homo sapiens",
    obs_value_filter=covid_lung_filter,
)
Use axis_query() for queries that exceed available RAM.
Pattern: Iterative processing
import tiledbsoma as soma

# Describe the cell (obs) and gene (var) selections up front
brain_cells = soma.AxisQuery(
    value_filter="tissue_general == 'brain' and is_primary_data == True"
)
target_genes = soma.AxisQuery(
    value_filter="feature_name in ['FOXP2', 'TBR1', 'SATB2']"
)

human = census["census_data"]["homo_sapiens"]
with human.axis_query(
    measurement_name="RNA",
    obs_query=brain_cells,
    var_query=target_genes,
) as query:
    # Stream the "raw" X layer chunk by chunk instead of loading it at once.
    # Each chunk is a pyarrow.Table with columns soma_data, soma_dim_0 and
    # soma_dim_1.
    for batch in query.X("raw").tables():
        process_batch(batch)
Pattern: Incremental statistics (mean/variance)
import numpy as np
import tiledbsoma as soma

# Streaming mean/variance using the batch-wise (Chan et al.) form of Welford's
# algorithm: each chunk is summarised with vectorised NumPy operations and then
# merged into the running totals, instead of looping over every value in
# Python.
# NOTE(review): X is read as sparse triples, so these statistics presumably
# cover only the stored entries, pooled over all selected genes; confirm this
# is the intended population before using the result.
n = 0        # number of values seen so far
mean = 0.0   # running mean
M2 = 0.0     # running sum of squared deviations from the mean

with census["census_data"]["homo_sapiens"].axis_query(
    measurement_name="RNA",
    obs_query=soma.AxisQuery(
        value_filter="tissue_general == 'brain' and is_primary_data == True"
    ),
    var_query=soma.AxisQuery(
        value_filter="feature_name in ['FOXP2', 'TBR1', 'SATB2']"
    ),
) as query:
    for batch in query.X("raw").tables():
        # Accumulate in float64 to limit rounding error over many values.
        values = batch["soma_data"].to_numpy().astype(np.float64, copy=False)
        batch_n = values.size
        if batch_n == 0:
            continue  # nothing to merge; also avoids mean() of an empty array

        batch_mean = values.mean()
        batch_M2 = np.square(values - batch_mean).sum()

        # Merge the chunk summary into the running totals.
        delta = batch_mean - mean
        total = n + batch_n
        mean += delta * batch_n / total
        M2 += batch_M2 + delta * delta * n * batch_n / total
        n = total

# Sample variance; undefined for fewer than two values, reported as 0.
variance = M2 / (n - 1) if n > 1 else 0
Use TileDB-SOMA-ML for training models. The former cellxgene_census.experimental.ml loaders are deprecated and scheduled for removal.
Pattern: Create training dataloader
import tiledbsoma as soma
from tiledbsoma_ml import ExperimentDataset, experiment_dataloader

# Cells to train on: primary liver cells
liver_cells = soma.AxisQuery(
    value_filter="tissue_general == 'liver' and is_primary_data == True"
)

with cellxgene_census.open_soma() as census:
    experiment = census["census_data"]["homo_sapiens"]
    with experiment.axis_query(
        measurement_name="RNA",
        obs_query=liver_cells,
    ) as query:
        # Batch size and shuffling are configured on the dataset itself
        dataset = ExperimentDataset(
            query=query,
            layer_name="raw",
            obs_column_names=["cell_type"],
            batch_size=128,
            shuffle=True,
        )
        dataloader = experiment_dataloader(dataset)

        for epoch in range(num_epochs):
            # Tell the dataset which epoch is starting before iterating
            dataset.set_epoch(epoch)
            for X, obs in dataloader:
                labels = obs["cell_type"]
                # Train model...
Pattern: Train/test split
# Split the dataset 80/20 with a fixed seed
train_dataset, test_dataset = dataset.random_split(0.8, 0.2, seed=42)

# Build one loader per partition
train_loader, test_loader = (
    experiment_dataloader(partition, num_workers=2)
    for partition in (train_dataset, test_dataset)
)
Set batch_size and shuffle on ExperimentDataset, not on the PyTorch DataLoader.
Use the cellxgene-census[spatial] extra and query the census_spatial_sequencing collection for Visium or Slide-seq V2 data.
import tiledbsoma as soma

# Select a single spatial dataset by its dataset_id
one_dataset = soma.AxisQuery(
    value_filter="dataset_id == '4cceac62-9513-42a4-90e5-2878dbb0192c'"
)

with cellxgene_census.open_soma(census_version="2025-11-08") as census:
    # Spatial experiments live in their own collection
    spatial_experiment = census["census_spatial_sequencing"]["homo_sapiens"]
    with spatial_experiment.axis_query(
        measurement_name="RNA",
        obs_query=one_dataset,
    ) as query:
        # Export the query result using the "raw" X layer
        sdata = query.to_spatialdata(X_name="raw")
Pattern: Scanpy integration
import scanpy as sc

# Load primary neurons into an in-memory AnnData
neuron_filter = "cell_type == 'neuron' and is_primary_data == True"
adata = cellxgene_census.get_anndata(
    census=census,
    organism="Homo sapiens",
    obs_value_filter=neuron_filter,
)

# Normalisation
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Feature selection and dimensionality reduction
sc.pp.highly_variable_genes(adata)
sc.pp.pca(adata)

# Neighbourhood graph and embedding
sc.pp.neighbors(adata)
sc.tl.umap(adata)

# Plot the embedding coloured by annotation
umap_colors = ["cell_type", "tissue_general"]
sc.pl.umap(adata, color=umap_colors)
Pattern: Multi-dataset integration
import scanpy as sc
import scanpy.external as sce

# Query multiple datasets separately
datasets_to_integrate = ["dataset_id_1", "dataset_id_2", "dataset_id_3"]
adatas = []
for dataset_id in datasets_to_integrate:
    adata = cellxgene_census.get_anndata(
        census=census,
        organism="Homo sapiens",
        obs_value_filter=f"dataset_id == '{dataset_id}' and is_primary_data == True",
    )
    adatas.append(adata)

# scanorama_integrate() works on ONE AnnData that has a batch column in .obs
# and a precomputed PCA; it does not accept a list of AnnData objects.
# Concatenating in query order keeps each dataset's cells contiguous, which
# Scanorama requires. index_unique avoids clashing obs names between queries,
# and merge="same" keeps var annotations that are identical across objects.
combined = sc.concat(adatas, index_unique="-", merge="same")

# Normalise and compute the PCA basis that Scanorama will correct
sc.pp.normalize_total(combined, target_sum=1e4)
sc.pp.log1p(combined)
sc.pp.pca(combined)

# Integrate using scanorama (harmony or other tools follow the same pattern).
# The corrected embedding is written to combined.obsm["X_scanorama"].
sce.pp.scanorama_integrate(combined, key="dataset_id")
Unless specifically analyzing duplicates, always include is_primary_data == True:
obs_value_filter="cell_type == 'B cell' and is_primary_data == True"
For reproducible analysis, always specify the Census version:
# Pin the release and still close the handle automatically on exit
with cellxgene_census.open_soma(census_version="2025-11-08") as census:
    ...  # Your code here
Always use the context manager to ensure proper cleanup:
with cellxgene_census.open_soma() as census:
# Your code here
Minimize data transfer by selecting only required metadata columns:
obs_column_names=["cell_type", "tissue_general", "disease"] # Not all columns
When analyzing specific genes, check which datasets measured them:
# get_presence_matrix() takes no filter arguments: it returns the full sparse
# datasets x genes matrix. Rows follow soma_joinid of the census_info datasets
# table; columns follow soma_joinid of var.
presence_matrix = cellxgene_census.get_presence_matrix(
    census,
    "homo_sapiens",
    measurement_name="RNA",
)

# Look up the var soma_joinid of the genes of interest...
genes = cellxgene_census.get_var(
    census,
    "homo_sapiens",
    value_filter="feature_name in ['CD4', 'CD8A']",
    column_names=["soma_joinid", "feature_name"],
)

# ...and keep only their columns: one row per dataset, one column per gene
presence = presence_matrix[:, genes["soma_joinid"].to_numpy()]
tissue_general provides coarser groupings than tissue, useful for cross-tissue analyses:
# Better for broad queries
obs_value_filter="tissue_general == 'immune system'"
# Use specific tissue when needed
obs_value_filter="tissue == 'peripheral blood mononuclear cell'"
First explore metadata to understand available data, then query expression:
# Step 1: Explore
# Use the same is_primary_data filter as Step 2 so the counts reflect the
# cells that will actually be loaded (no duplicated cells).
metadata = cellxgene_census.get_obs(
    census, "homo_sapiens",
    value_filter="disease == 'COVID-19' and is_primary_data == True",
    column_names=["cell_type", "tissue_general"],
)
print(metadata.value_counts())

# Step 2: Query based on findings
adata = cellxgene_census.get_anndata(
    census=census,
    organism="Homo sapiens",
    obs_value_filter="disease == 'COVID-19' and cell_type == 'T cell' and is_primary_data == True",
)
For large queries, check estimated size before loading:
# Count matching cells by fetching a single lightweight column
brain_filter = "tissue_general == 'brain' and is_primary_data == True"
metadata = cellxgene_census.get_obs(
    census,
    "homo_sapiens",
    value_filter=brain_filter,
    column_names=["soma_joinid"],
)

n_cells = metadata.shape[0]
print(f"Query will return {n_cells} cells")

# If too large, use out-of-core processing or further filtering
When possible, use ontology term IDs instead of free text:
# More reliable than cell_type == 'B cell' across datasets
obs_value_filter="cell_type_ontology_term_id == 'CL:0000236'"
For systematic analyses across multiple conditions:
tissues = ["lung", "liver", "kidney", "heart"]


def _load_tissue(tissue):
    """Fetch the primary cells of one tissue as an AnnData."""
    return cellxgene_census.get_anndata(
        census=census,
        organism="Homo sapiens",
        obs_value_filter=f"tissue_general == '{tissue}' and is_primary_data == True",
    )


# Run the same analysis once per tissue and collect the results by name
results = {}
for tissue in tissues:
    adata = _load_tissue(tissue)
    results[tissue] = analyze(adata)