% MassSpecGym benchmark paper, NeurIPS 2024 conference contribution.
% Braces inside title/booktitle protect names and acronyms from style recasing.
% Surnames with particles are kept braced so they stay a single "Last" part.
@inproceedings{29e4841176cd4536ba8265a022ab4cff,
  author    = {Bushuiev, Roman and Bushuiev, Anton and {de Jonge}, Niek F. and
               Young, Adamo and Kretschmer, Fleming and Samusevich, Raman and
               Heirman, Janne and Wang, Fei and Zhang, Luke and
               D{\"u}hrkop, Kai and Ludwig, Marcus and Haupt, Nils A. and
               Kalia, Apurva and Brungs, Corinna and Schmid, Robin and
               Greiner, Russell and Wang, Bo and Wishart, David S. and
               Liu, Li Ping and Rousu, Juho and Bittremieux, Wout and
               Rost, Hannes and Mak, Tytus D. and Hassoun, Soha and
               Huber, Florian and {van der Hooft}, Justin J. J. and
               Stravs, Michael A. and B{\"o}cker, Sebastian and
               Sivic, Josef and Pluskal, Tom{\'a}{\v s}},
  title     = {{MassSpecGym}: A benchmark for the discovery and identification of molecules},
  booktitle = {38th Conference on Neural Information Processing Systems ({NeurIPS} 2024)},
  series    = {Advances in Neural Information Processing Systems},
  editor    = {Globerson, A. and Mackey, L. and Belgrave, D. and Fan, A. and
               Paquet, U. and Tomczak, J. and Zhang, C.},
  publisher = {NeurIPS Foundation},
  address   = {United States},
  year      = {2024},
  language  = {English},
  note      = {38th Conference on Neural Information Processing Systems, NeurIPS 2024 ; Conference date: 09-12-2024 Through 15-12-2024},
  abstract  = {The discovery and identification of molecules in biological and environmental samples is crucial for advancing biomedical and chemical sciences. Tandem mass spectrometry (MS/MS) is the leading technique for high-throughput elucidation of molecular structures. However, decoding a molecular structure from its mass spectrum is exceptionally challenging, even when performed by human experts. As a result, the vast majority of acquired MS/MS spectra remain uninterpreted, thereby limiting our understanding of the underlying (bio)chemical processes. Despite decades of progress in machine learning applications for predicting molecular structures from MS/MS spectra, the development of new methods is severely hindered by the lack of standard datasets and evaluation protocols. To address this problem, we propose MassSpecGym - the first comprehensive benchmark for the discovery and identification of molecules from MS/MS data. Our benchmark comprises the largest publicly available collection of high-quality labeled MS/MS spectra and defines three MS/MS annotation challenges: de novo molecular structure generation, molecule retrieval, and spectrum simulation. It includes new evaluation metrics and a generalization-demanding data split, therefore standardizing the MS/MS annotation tasks and rendering the problem accessible to the broad machine learning community. MassSpecGym is publicly available at https://github.com/pluskal-lab/MassSpecGym.},
}