Back to Claude Scientific Skills

Concatenating AnnData Objects

skills/anndata/references/concatenation.md

2.46.010.4 KB
Original Source

Concatenating AnnData Objects

Combine multiple AnnData objects along either observations or variables axis.

Basic Concatenation

Concatenate along observations (stack cells/samples)

python
import anndata as ad
import numpy as np

# Create multiple AnnData objects
adata1 = ad.AnnData(X=np.random.rand(100, 50))
adata2 = ad.AnnData(X=np.random.rand(150, 50))
adata3 = ad.AnnData(X=np.random.rand(200, 50))

# Concatenate along observations (axis=0, default)
adata_combined = ad.concat([adata1, adata2, adata3], axis=0)

print(adata_combined.shape)  # (450, 50)

Concatenate along variables (stack genes/features)

python
# Create objects with same observations, different variables
adata1 = ad.AnnData(X=np.random.rand(100, 50))
adata2 = ad.AnnData(X=np.random.rand(100, 30))
adata3 = ad.AnnData(X=np.random.rand(100, 70))

# Concatenate along variables (axis=1)
adata_combined = ad.concat([adata1, adata2, adata3], axis=1)

print(adata_combined.shape)  # (100, 150)

Join Types

Inner join (intersection)

Keep only variables/observations present in all objects.

python
import pandas as pd

# Create objects with different variables
adata1 = ad.AnnData(
    X=np.random.rand(100, 50),
    var=pd.DataFrame(index=[f'Gene_{i}' for i in range(50)])
)
adata2 = ad.AnnData(
    X=np.random.rand(150, 60),
    var=pd.DataFrame(index=[f'Gene_{i}' for i in range(10, 70)])
)

# Inner join: only genes 10-49 are kept (overlap)
adata_inner = ad.concat([adata1, adata2], join='inner')
print(adata_inner.n_vars)  # 40 genes (overlap)

Outer join (union)

Keep all variables/observations, filling missing values.

python
# Outer join: all genes are kept
adata_outer = ad.concat([adata1, adata2], join='outer')
print(adata_outer.n_vars)  # 70 genes (union)

# Missing values are filled with appropriate defaults:
# - 0 for sparse matrices
# - NaN for dense matrices

Fill values for outer joins

python
# Specify fill value for missing data
adata_filled = ad.concat([adata1, adata2], join='outer', fill_value=0)

Tracking Data Sources

Add batch labels

python
# Label which object each observation came from
adata_combined = ad.concat(
    [adata1, adata2, adata3],
    label='batch',  # Column name for labels
    keys=['batch1', 'batch2', 'batch3']  # Labels for each object
)

print(adata_combined.obs['batch'].value_counts())
# batch1    100
# batch2    150
# batch3    200

Automatic batch labels

python
# If keys not provided, uses integer indices
adata_combined = ad.concat(
    [adata1, adata2, adata3],
    label='dataset'
)
# dataset column contains: 0, 1, 2

Merge Strategies

Control how metadata from different objects is combined using the merge parameter.

merge=None (default for observations)

Exclude metadata on non-concatenation axis.

python
# When concatenating observations, var metadata must match
adata1.var['gene_type'] = 'protein_coding'
adata2.var['gene_type'] = 'protein_coding'

# var is kept only if identical across all objects
adata_combined = ad.concat([adata1, adata2], merge=None)

merge='same'

Keep metadata that is identical across all objects.

python
adata1.var['chromosome'] = ['chr1'] * 25 + ['chr2'] * 25
adata2.var['chromosome'] = ['chr1'] * 25 + ['chr2'] * 25
adata1.var['type'] = 'protein_coding'
adata2.var['type'] = 'lncRNA'  # Different

# 'chromosome' is kept (same), 'type' is excluded (different)
adata_combined = ad.concat([adata1, adata2], merge='same')

merge='unique'

Keep metadata columns where each key has exactly one value.

python
adata1.var['gene_id'] = [f'ENSG{i:05d}' for i in range(50)]
adata2.var['gene_id'] = [f'ENSG{i:05d}' for i in range(50)]

# gene_id is kept (unique values for each key)
adata_combined = ad.concat([adata1, adata2], merge='unique')

merge='first'

Take values from the first object containing each key.

python
adata1.var['description'] = ['Desc1'] * 50
adata2.var['description'] = ['Desc2'] * 50

# Uses descriptions from adata1
adata_combined = ad.concat([adata1, adata2], merge='first')

merge='only'

Keep metadata that appears in only one object.

python
adata1.var['adata1_specific'] = [1] * 50
adata2.var['adata2_specific'] = [2] * 50

# Both metadata columns are kept
adata_combined = ad.concat([adata1, adata2], merge='only')

Handling Index Conflicts

Make indices unique

python
import pandas as pd

# Create objects with overlapping observation names
adata1 = ad.AnnData(
    X=np.random.rand(3, 10),
    obs=pd.DataFrame(index=['cell_1', 'cell_2', 'cell_3'])
)
adata2 = ad.AnnData(
    X=np.random.rand(3, 10),
    obs=pd.DataFrame(index=['cell_1', 'cell_2', 'cell_3'])
)

# Make indices unique by appending batch keys
adata_combined = ad.concat(
    [adata1, adata2],
    label='batch',
    keys=['batch1', 'batch2'],
    index_unique='_'  # Separator for making indices unique
)

print(adata_combined.obs_names)
# ['cell_1_batch1', 'cell_2_batch1', 'cell_3_batch1',
#  'cell_1_batch2', 'cell_2_batch2', 'cell_3_batch2']

Concatenating Layers

python
# Objects with layers
adata1 = ad.AnnData(X=np.random.rand(100, 50))
adata1.layers['normalized'] = np.random.rand(100, 50)
adata1.layers['scaled'] = np.random.rand(100, 50)

adata2 = ad.AnnData(X=np.random.rand(150, 50))
adata2.layers['normalized'] = np.random.rand(150, 50)
adata2.layers['scaled'] = np.random.rand(150, 50)

# Layers are concatenated automatically if present in all objects
adata_combined = ad.concat([adata1, adata2])

print(adata_combined.layers.keys())
# dict_keys(['normalized', 'scaled'])

Concatenating Multi-dimensional Annotations

obsm/varm

python
# Objects with embeddings
adata1.obsm['X_pca'] = np.random.rand(100, 50)
adata2.obsm['X_pca'] = np.random.rand(150, 50)

# obsm is concatenated along observation axis
adata_combined = ad.concat([adata1, adata2])
print(adata_combined.obsm['X_pca'].shape)  # (250, 50)

obsp/varp (pairwise annotations)

python
from scipy.sparse import csr_matrix

# Pairwise matrices
adata1.obsp['connectivities'] = csr_matrix((100, 100))
adata2.obsp['connectivities'] = csr_matrix((150, 150))

# By default, obsp is NOT concatenated (set pairwise=True to include)
adata_combined = ad.concat([adata1, adata2])
# adata_combined.obsp is empty

# Include pairwise data (creates block diagonal matrix)
adata_combined = ad.concat([adata1, adata2], pairwise=True)
print(adata_combined.obsp['connectivities'].shape)  # (250, 250)

Concatenating uns (unstructured)

Unstructured metadata is merged recursively:

python
adata1.uns['experiment'] = {'date': '2025-01-01', 'batch': 'A'}
adata2.uns['experiment'] = {'date': '2025-01-01', 'batch': 'B'}

# Using merge='unique' for uns
adata_combined = ad.concat([adata1, adata2], uns_merge='unique')
# 'date' is kept (same value), 'batch' might be excluded (different values)

Lazy Concatenation (AnnCollection)

For very large datasets, use AnnCollection to lazily concatenate AnnData objects along the observation axis. This API is experimental; use backed AnnData objects when the inputs are stored in .h5ad files.

python
import anndata as ad
from anndata.experimental import AnnCollection

files = ['data1.h5ad', 'data2.h5ad', 'data3.h5ad']
backed_adatas = [ad.read_h5ad(path, backed='r') for path in files]

collection = AnnCollection(
    backed_adatas,
    join_obs='outer',
    join_vars='inner',
    label='dataset',
    keys=['dataset1', 'dataset2', 'dataset3']
)

# Access data lazily
print(collection.n_obs)  # Total observations
print(collection.obs.head())  # Metadata loaded, not X

# Convert to regular AnnData when needed (loads all data)
adata = collection.to_adata()

Working with AnnCollection

python
# Subset without loading data
subset = collection[collection.obs['cell_type'] == 'T cell']

# Iterate through datasets
for adata in collection:
    print(adata.shape)

# Access specific dataset
first_dataset = collection[0]

Concatenation on Disk

For datasets too large for memory, concatenate directly on disk:

python
import anndata as ad
from anndata.experimental import concat_on_disk

# Concatenate without loading into memory
concat_on_disk(
    ['data1.h5ad', 'data2.h5ad', 'data3.h5ad'],
    'combined.h5ad',
    join='outer'
)

# Load result in backed mode
adata = ad.read_h5ad('combined.h5ad', backed='r')

Common Concatenation Patterns

Combine technical replicates

python
# Multiple runs of the same samples
replicates = [adata_run1, adata_run2, adata_run3]
adata_combined = ad.concat(
    replicates,
    label='technical_replicate',
    keys=['rep1', 'rep2', 'rep3'],
    join='inner'  # Keep only genes measured in all runs
)

Combine batches from experiment

python
# Different experimental batches
batches = [adata_batch1, adata_batch2, adata_batch3]
adata_combined = ad.concat(
    batches,
    label='batch',
    keys=['batch1', 'batch2', 'batch3'],
    join='outer'  # Keep all genes
)

# Later: apply batch correction

Merge multi-modal data

python
# Different measurement modalities (e.g., RNA + protein)
adata_rna = ad.AnnData(X=np.random.rand(100, 2000))
adata_protein = ad.AnnData(X=np.random.rand(100, 50))

# Concatenate along variables to combine modalities
adata_multimodal = ad.concat([adata_rna, adata_protein], axis=1)

# Add labels to distinguish modalities
adata_multimodal.var['modality'] = ['RNA'] * 2000 + ['protein'] * 50

Best Practices

  1. Check compatibility before concatenating
python
# Verify shapes are compatible
print([adata.n_vars for adata in [adata1, adata2, adata3]])

# Check variable names match
print([set(adata.var_names) for adata in [adata1, adata2, adata3]])
  1. Use appropriate join type
  • inner: When you need the same features across all samples (most stringent)
  • outer: When you want to preserve all features (most inclusive)
  1. Track data sources Always use label and keys to track which observations came from which dataset.

  2. Consider memory usage

  • For large datasets, use AnnCollection or concat_on_disk
  • Consider backed mode for the result
  1. Handle batch effects Concatenation combines data but doesn't correct for batch effects. Apply batch correction after concatenation:
python
# After concatenation, apply batch correction
import scanpy as sc
sc.pp.combat(adata_combined, key='batch')
  1. Validate results
python
# Check dimensions
print(adata_combined.shape)

# Check batch distribution
print(adata_combined.obs['batch'].value_counts())

# Verify metadata integrity
print(adata_combined.var.head())
print(adata_combined.obs.head())