% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Journal article record (npj Digital Medicine, vol. 8, no. 1, 2025).
% The pages field holds the single value 631 (presumably an article number
% rather than a page range -- confirm against the publisher record).
% The fields reportid, cin, ddc, cid, pnm, pid, typ and pubmed are not
% standard BibTeX fields; they look like repository bookkeeping metadata and
% are ignored by the standard bibliography styles.
@ARTICLE{Jarchow:281829,
  author    = {Jarchow, Hans and Bobrowski, Christoph and Falk, Steffi and
               Hermann, Andreas and Kulaga, Anton and Põder,
               Johann-Christian and Unfried, Maximilian and Usanov, Nikolay
               and Zendeh, Bijan and Kennedy, Brian K. and Lobentanzer,
               Sebastian and Fuellen, Georg},
  title     = {Benchmarking large language models for personalized,
               biomarker-based health intervention recommendations},
  journal   = {npj Digital Medicine},
  volume    = {8},
  number    = {1},
  pages     = {631},
  year      = {2025},
  issn      = {2398-6352},
  address   = {[Basingstoke]},
  publisher = {Macmillan Publishers Limited},
  doi       = {10.1038/s41746-025-01996-2},
  url       = {https://pub.dzne.de/record/281829},
  pubmed    = {pmid:41145883},
  reportid  = {DZNE-2025-01210},
  abstract  = {The use of large language models (LLMs) in clinical
               diagnostics and intervention planning is expanding, yet
               their utility for personalized recommendations for longevity
               interventions remains opaque. We extended the BioChatter
               framework to benchmark LLMs' ability to generate
               personalized longevity intervention recommendations based on
               biomarker profiles while adhering to key medical validation
               requirements. Using 25 individual profiles across three
               different age groups, we generated 1000 diverse test cases
               covering interventions such as caloric restriction, fasting
               and supplements. Evaluating 56000 model responses via an
               LLM-as-a-Judge system with clinician validated ground
               truths, we found that proprietary models outperformed
               open-source models especially in comprehensiveness. However,
               even with Retrieval-Augmented Generation (RAG), all models
               exhibited limitations in addressing key medical validation
               requirements, prompt stability, and handling age-related
               biases. Our findings highlight limited suitability of LLMs
               for unsupervised longevity intervention recommendations. Our
               open-source framework offers a foundation for advancing AI
               benchmarking in various medical contexts.},
  cin       = {AG Hermann},
  ddc       = {610},
  cid       = {I:(DE-2719)1511100},
  pnm       = {353 - Clinical and Health Care Research (POF4-353)},
  pid       = {G:(DE-HGF)POF4-353},
  typ       = {PUB:(DE-HGF)16},
}