% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
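%
% If this bibliography is loaded via biblatex, a minimal setup might look like
% the sketch below (the file name "references.bib" and the document name "main"
% are illustrative assumptions, not part of this record):
%
%   \usepackage[backend=biber]{biblatex}
%   \addbibresource{references.bib}
%   ...
%   as benchmarked in \cite{Jarchow:281829}
%   \printbibliography
%
% Compile with: pdflatex main && biber main && pdflatex main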

@ARTICLE{Jarchow:281829,
      author       = {Jarchow, Hans and Bobrowski, Christoph and Falk, Steffi and
                      Hermann, Andreas and Kulaga, Anton and Põder,
                      Johann-Christian and Unfried, Maximilian and Usanov, Nikolay
                      and Zendeh, Bijan and Kennedy, Brian K. and Lobentanzer,
                      Sebastian and Fuellen, Georg},
      title        = {{B}enchmarking large language models for personalized,
                      biomarker-based health intervention recommendations},
      journal      = {npj Digital Medicine},
      volume       = {8},
      number       = {1},
      issn         = {2398-6352},
      address      = {[Basingstoke]},
      publisher    = {Macmillan Publishers Limited},
      reportid     = {DZNE-2025-01210},
      pages        = {631},
      year         = {2025},
      abstract     = {The use of large language models (LLMs) in clinical
                      diagnostics and intervention planning is expanding, yet
                      their utility for personalized recommendations for longevity
                      interventions remains opaque. We extended the BioChatter
                      framework to benchmark LLMs' ability to generate
                      personalized longevity intervention recommendations based on
                      biomarker profiles while adhering to key medical validation
                      requirements. Using 25 individual profiles across three
                      different age groups, we generated 1000 diverse test cases
                      covering interventions such as caloric restriction, fasting
                      and supplements. Evaluating 56000 model responses via an
                      LLM-as-a-Judge system with clinician-validated ground
                      truths, we found that proprietary models outperformed
                      open-source models, especially in comprehensiveness. However,
                      even with Retrieval-Augmented Generation (RAG), all models
                      exhibited limitations in addressing key medical validation
                      requirements, prompt stability, and handling age-related
                      biases. Our findings highlight the limited suitability of LLMs
                      for unsupervised longevity intervention recommendations. Our
                      open-source framework offers a foundation for advancing AI
                      benchmarking in various medical contexts.},
      cin          = {AG Hermann},
      ddc          = {610},
      cid          = {I:(DE-2719)1511100},
      pnm          = {353 - Clinical and Health Care Research (POF4-353)},
      pid          = {G:(DE-HGF)POF4-353},
      typ          = {PUB:(DE-HGF)16},
      pubmed       = {pmid:41145883},
      doi          = {10.1038/s41746-025-01996-2},
      url          = {https://pub.dzne.de/record/281829},
}