skills/pyopenms/references/feature_detection.md
Feature detection identifies persistent signals (chromatographic peaks) in LC-MS data. Feature linking combines features across multiple samples for quantitative comparison.
Ready-to-run scripts: The skill ships CLIs that implement these workflows end to end:
scripts/detect_features_metabo.py(metabolomics),scripts/detect_features_centroided.py(proteomics/centroided),scripts/align_link_quantify.py(alignment + linking + quant matrix), andscripts/detect_adducts.py(adduct grouping). Use them directly, or adapt the code below.
API note (pyOpenMS 3.5.0): The old
FeatureFinderclass and itsrun("centroided", ...)API were removed. Metabolomics now uses theMassTraceDetection->ElutionPeakDetection->FeatureFindingMetabopipeline, and centroided/proteomics data usesFeatureFinderAlgorithmPicked. The patterns below reflect the current API.
A feature represents a chromatographic peak characterized by:
For small molecules, run the three-stage pipeline that replaced the removed FeatureFinder: detect mass traces, split them into elution peaks, then assemble isotope-grouped features.
import pyopenms as ms
# Load centroided data
exp = ms.MSExperiment()
ms.MzMLFile().load("centroided.mzML", exp)
exp.sortSpectra(True)
# Stage 1: mass trace detection
mtd = ms.MassTraceDetection()
p = mtd.getDefaults()
p.setValue("mass_error_ppm", 10.0)
p.setValue("noise_threshold_int", 1000.0)
mtd.setParameters(p)
mass_traces = []
mtd.run(exp, mass_traces, 0)
# Stage 2: elution peak detection
epd = ms.ElutionPeakDetection()
p = epd.getDefaults()
p.setValue("width_filtering", "fixed")
epd.setParameters(p)
mt_split = []
epd.detectPeaks(mass_traces, mt_split)
# Stage 3: feature assembly with isotope grouping
ffm = ms.FeatureFindingMetabo()
p = ffm.getDefaults()
p.setValue("isotope_filtering_model", "metabolites (5% RMS)") # or "none"
p.setValue("remove_single_traces", "true")
p.setValue("charge_lower_bound", 1)
p.setValue("charge_upper_bound", 3)
ffm.setParameters(p)
features = ms.FeatureMap()
chrom_out = []
ffm.run(mt_split, features, chrom_out)
print(f"Detected {features.size()} features")
# Save features
ms.FeatureXMLFile().store("features.featureXML", features)
For centroided peptide data, use FeatureFinderAlgorithmPicked (replaces the removed FeatureFinder "centroided" workflow):
exp = ms.MSExperiment()
ms.MzMLFile().load("centroided.mzML", exp)
exp.sortSpectra(True)
exp.updateRanges()
ff = ms.FeatureFinderAlgorithmPicked()
params = ff.getDefaults()
params.setValue("isotopic_pattern:charge_low", 1)
params.setValue("isotopic_pattern:charge_high", 4)
features = ms.FeatureMap()
seeds = ms.FeatureMap()
# signature: run(input_map, output, param, seeds)
ff.run(exp, features, params, seeds)
print(f"Detected {features.size()} features")
ms.FeatureXMLFile().store("features.featureXML", features)
# Load features
feature_map = ms.FeatureMap()
ms.FeatureXMLFile().load("features.featureXML", feature_map)
# Access individual features
for feature in feature_map:
print(f"m/z: {feature.getMZ():.4f}")
print(f"RT: {feature.getRT():.2f}")
print(f"Intensity: {feature.getIntensity():.0f}")
print(f"Charge: {feature.getCharge()}")
print(f"Quality: {feature.getOverallQuality():.3f}")
print(f"Width (RT): {feature.getWidth():.2f}")
# Get convex hull
hull = feature.getConvexHull()
print(f"Hull points: {hull.getHullPoints().size()}")
# Access isotopic pattern
for feature in feature_map:
# Get subordinate features (isotopes)
subordinates = feature.getSubordinates()
if subordinates:
print(f"Main feature m/z: {feature.getMZ():.4f}")
for sub in subordinates:
print(f" Isotope m/z: {sub.getMZ():.4f}")
print(f" Isotope intensity: {sub.getIntensity():.0f}")
import pandas as pd
# Convert to DataFrame
df = feature_map.get_df()
print(df.columns)
# Columns are lowercase: rt, mz, intensity, charge, quality
# Analyze features
print(f"Mean intensity: {df['intensity'].mean()}")
print(f"RT range: {df['rt'].min():.1f} - {df['rt'].max():.1f}")
Align retention times before linking:
# Load multiple feature maps
fm1 = ms.FeatureMap()
fm2 = ms.FeatureMap()
ms.FeatureXMLFile().load("sample1.featureXML", fm1)
ms.FeatureXMLFile().load("sample2.featureXML", fm2)
feature_maps = [fm1, fm2]
# Pick the largest map as the alignment reference
aligner = ms.MapAlignmentAlgorithmPoseClustering()
ref_idx = max(range(len(feature_maps)), key=lambda i: feature_maps[i].size())
aligner.setReference(feature_maps[ref_idx])
# Align each non-reference map in place against the reference
transformer = ms.MapAlignmentTransformer()
for i, fm in enumerate(feature_maps):
if i == ref_idx:
continue
trafo = ms.TransformationDescription()
aligner.align(fm, trafo)
transformer.transformRetentionTimes(fm, trafo, True)
Link features across samples:
# Create feature grouping algorithm
grouper = ms.FeatureGroupingAlgorithmQT()
# Configure parameters
params = grouper.getParameters()
params.setValue("distance_RT:max_difference", 30.0) # Max RT difference (s)
params.setValue("distance_MZ:max_difference", 10.0) # Max m/z difference (ppm)
params.setValue("distance_MZ:unit", "ppm")
grouper.setParameters(params)
# Prepare feature maps
feature_maps = [fm1, fm2, fm3]
# Create consensus map
consensus_map = ms.ConsensusMap()
# Link features (feature_maps is a list of FeatureMap)
grouper.group(feature_maps, consensus_map)
# Assign unique IDs before storing
consensus_map.setUniqueIds()
print(f"Created {consensus_map.size()} consensus features")
# Save consensus map
ms.ConsensusXMLFile().store("consensus.consensusXML", consensus_map)
# Load consensus map
consensus_map = ms.ConsensusMap()
ms.ConsensusXMLFile().load("consensus.consensusXML", consensus_map)
# Iterate through consensus features
for cons_feature in consensus_map:
print(f"Consensus m/z: {cons_feature.getMZ():.4f}")
print(f"Consensus RT: {cons_feature.getRT():.2f}")
# Get features from individual maps
for handle in cons_feature.getFeatureList():
map_idx = handle.getMapIndex()
intensity = handle.getIntensity()
print(f" Sample {map_idx}: intensity {intensity:.0f}")
# Access file descriptions (map metadata)
file_descriptions = consensus_map.getColumnHeaders()
for map_idx, description in file_descriptions.items():
print(f"Map {map_idx}:")
print(f" Filename: {description.filename}")
print(f" Label: {description.label}")
print(f" Size: {description.size}")
ConsensusMap exposes two DataFrame helpers that make quantitative tables easy:
# Feature intensities, features (rows) x samples (columns)
intensity_df = consensus_map.get_intensity_df()
# Per-consensus-feature metadata: rt, mz, charge, quality
metadata_df = consensus_map.get_metadata_df()
# Join into a single annotated quant matrix
quant = metadata_df.join(intensity_df)
Identify different ionization forms of the same molecule. The class is MetaboliteFeatureDeconvolution (the old MetaboliteAdductDecharger does not exist in 3.5.0). Adducts are specified with Elements:Charge:Probability syntax, not bracket notation like [M+H]+:
# Create adduct deconvolution
mfd = ms.MetaboliteFeatureDeconvolution()
# Configure parameters
p = mfd.getDefaults()
p.setValue("potential_adducts", [b"H:+:0.4", b"Na:+:0.25", b"NH4:+:0.25", b"K:+:0.1", b"H-2O-1:0:0.05"])
p.setValue("charge_min", 1)
p.setValue("charge_max", 1)
mfd.setParameters(p)
# Detect adducts: compute(in, out, cons_groups, cons_edges)
fm_out = ms.FeatureMap()
groups = ms.ConsensusMap()
edges = ms.ConsensusMap()
mfd.compute(feature_map, fm_out, groups, edges)
import pyopenms as ms
def feature_detection_workflow(input_files, output_consensus):
"""
Complete workflow: feature detection and linking across samples.
Args:
input_files: List of mzML file paths
output_consensus: Output consensusXML file path
"""
feature_maps = []
# Step 1: Detect features in each file (metabolomics pipeline)
for mzml_file in input_files:
print(f"Processing {mzml_file}...")
# Load experiment
exp = ms.MSExperiment()
ms.MzMLFile().load(mzml_file, exp)
exp.sortSpectra(True)
# Mass trace detection
mtd = ms.MassTraceDetection()
p = mtd.getDefaults()
p.setValue("mass_error_ppm", 10.0)
p.setValue("noise_threshold_int", 1000.0)
mtd.setParameters(p)
mass_traces = []
mtd.run(exp, mass_traces, 0)
# Elution peak detection
epd = ms.ElutionPeakDetection()
p = epd.getDefaults()
p.setValue("width_filtering", "fixed")
epd.setParameters(p)
mt_split = []
epd.detectPeaks(mass_traces, mt_split)
# Feature assembly
ffm = ms.FeatureFindingMetabo()
p = ffm.getDefaults()
p.setValue("isotope_filtering_model", "metabolites (5% RMS)")
p.setValue("remove_single_traces", "true")
p.setValue("charge_lower_bound", 1)
p.setValue("charge_upper_bound", 3)
ffm.setParameters(p)
features = ms.FeatureMap()
chrom_out = []
ffm.run(mt_split, features, chrom_out)
# Store filename in feature map
features.setPrimaryMSRunPath([mzml_file.encode()])
feature_maps.append(features)
print(f" Found {features.size()} features")
# Step 2: Align retention times against the largest map
print("Aligning retention times...")
aligner = ms.MapAlignmentAlgorithmPoseClustering()
ref_idx = max(range(len(feature_maps)), key=lambda i: feature_maps[i].size())
aligner.setReference(feature_maps[ref_idx])
transformer = ms.MapAlignmentTransformer()
for i, fm in enumerate(feature_maps):
if i == ref_idx:
continue
trafo = ms.TransformationDescription()
aligner.align(fm, trafo)
transformer.transformRetentionTimes(fm, trafo, True)
# Step 3: Link features
print("Linking features across samples...")
grouper = ms.FeatureGroupingAlgorithmQT()
params = grouper.getParameters()
params.setValue("distance_RT:max_difference", 30.0)
params.setValue("distance_MZ:max_difference", 10.0)
params.setValue("distance_MZ:unit", "ppm")
grouper.setParameters(params)
consensus_map = ms.ConsensusMap()
grouper.group(feature_maps, consensus_map)
consensus_map.setUniqueIds()
# Save results
ms.ConsensusXMLFile().store(output_consensus, consensus_map)
print(f"Created {consensus_map.size()} consensus features")
print(f"Results saved to {output_consensus}")
return consensus_map
# Run workflow
input_files = ["sample1.mzML", "sample2.mzML", "sample3.mzML"]
consensus = feature_detection_workflow(input_files, "consensus.consensusXML")
# Filter features by quality score
filtered_features = ms.FeatureMap()
for feature in feature_map:
if feature.getOverallQuality() > 0.5: # Quality threshold
filtered_features.push_back(feature)
print(f"Kept {filtered_features.size()} high-quality features")
# Keep only intense features
min_intensity = 10000
filtered_features = ms.FeatureMap()
for feature in feature_map:
if feature.getIntensity() >= min_intensity:
filtered_features.push_back(feature)
# Extract features in specific m/z range
mz_min = 200.0
mz_max = 800.0
filtered_features = ms.FeatureMap()
for feature in feature_map:
mz = feature.getMZ()
if mz_min <= mz <= mz_max:
filtered_features.push_back(feature)
# Annotate features with peptide identifications
# Load identifications
# pyOpenMS 3.5+: peptide IDs must be a PeptideIdentificationList, not a plain list
protein_ids = []
peptide_ids = ms.PeptideIdentificationList()
ms.IdXMLFile().load("identifications.idXML", protein_ids, peptide_ids)
# Create ID mapper
mapper = ms.IDMapper()
# Map IDs to features
mapper.annotate(feature_map, peptide_ids, protein_ids)
# Check annotations
for feature in feature_map:
peptide_ids_for_feature = feature.getPeptideIdentifications()
if peptide_ids_for_feature:
print(f"Feature at {feature.getMZ():.4f} m/z identified")
Optimize parameters for your data type:
# Test different mass-trace tolerance values (metabolomics pipeline)
mz_tolerances = [5.0, 10.0, 20.0] # ppm
for tol in mz_tolerances:
mtd = ms.MassTraceDetection()
p = mtd.getDefaults()
p.setValue("mass_error_ppm", tol)
p.setValue("noise_threshold_int", 1000.0)
mtd.setParameters(p)
mass_traces = []
mtd.run(exp, mass_traces, 0)
epd = ms.ElutionPeakDetection()
mt_split = []
epd.detectPeaks(mass_traces, mt_split)
ffm = ms.FeatureFindingMetabo()
features = ms.FeatureMap()
chrom_out = []
ffm.run(mt_split, features, chrom_out)
print(f"Tolerance {tol} ppm: {features.size()} features")
Export features for visualization:
# Convert to DataFrame for plotting
df = feature_map.get_df()
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.scatter(df['rt'], df['mz'], s=df['intensity']/1000, alpha=0.5)
plt.xlabel('Retention Time (s)')
plt.ylabel('m/z')
plt.title('Feature Map')
plt.colorbar(label='Intensity (scaled)')
plt.show()