% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Journal article: replication study of PD-L1 status prediction from PET/CT
% radiomics (European Journal of Radiology, vol. 183, 2025).
% The fields reportid, cin, ddc, cid, pnm, pid, typ and pubmed are not standard
% BibTeX article fields; standard styles silently ignore unknown fields.
% NOTE(review): they look like repository bookkeeping (see the url field) --
% confirm with the record owner before pruning any of them.
% The citation key is intentionally left unchanged so existing citations of
% this entry keep resolving.
@article{Stber:273976,
  author    = {Stüber, Anna Theresa and Heimer, Maurice M. and Ta, Johanna
               and Fabritius, Matthias P. and Hoppe, Boj F. and Sheikh,
               Gabriel and Brendel, Matthias and Unterrainer, Lena and
               Jurmeister, Philip and Tufman, Amanda and Ricke, Jens and
               Cyran, Clemens C. and Ingrisch, Michael},
  title     = {Replication study of {PD-L1} status prediction in
               {NSCLC} using {PET/CT} radiomics},
  journal   = {European journal of radiology},
  volume    = {183},
  pages     = {111825},
  year      = {2025},
  issn      = {0720-048X},
  address   = {Amsterdam [u.a.]},
  publisher = {Elsevier Science},
  reportid  = {DZNE-2024-01425},
  abstract  = {This study investigates the predictive capability of
               radiomics in determining programmed cell death ligand 1
               (PD-L1) expression ($\geq 1\%$) status in non-small cell lung
               cancer (NSCLC) patients using a newly collected [$^{18}$F]FDG
               PET/CT dataset. We aimed to replicate and validate the
               radiomics-based machine learning (ML) model proposed by Zhao
               et al. [1] predicting PD-L1 status from PET/CT-imaging. An
               independent cohort of 254 NSCLC patients underwent [$^{18}$F]FDG
               PET/CT imaging, with primary tumor segmentation conducted
               using lung tissue window (LTW) and more conservative soft
               tissue window (STW) methods. Radiomics models ('Rad-score'
               and 'complex model') and a clinical-stage model from Zhao et
               al. were evaluated via 10-fold cross-validation and AUC
               analysis, alongside a benchmark-study comparing different
               ML-model pipelines. Clinicopathological data were collected
               from medical records. On our data, the Rad-score model
               yielded mean AUCs of 0.593 (STW) and 0.573 (LTW), below Zhao
               et al.'s 0.761. The complex model achieved mean AUCs of
               0.505 (STW) and 0.519 (LTW), lower than Zhao et al.'s 0.769.
               The clinical model showed a mean AUC of 0.555, below Zhao et
               al.'s 0.64. All models performed significantly lower than
               Zhao et al.'s findings. Our benchmark study on four ML
               pipelines revealed consistently low performance across all
               configurations. Our study failed to replicate original
               findings, suggesting poor model performance and questioning
               predictive value of radiomics features in classifying PD-L1
               expression from PET/CT imaging. These results highlight
               challenges in replicating radiomics-based ML models and
               stress the need for rigorous validation.},
  keywords  = {Machine learning benchmark (Other) / NSCLC (Other) / PD-L1
               (Other) / PET/CT imaging data (Other) / Radiomics (Other) /
               Replication study (Other)},
  cin       = {AG Haass},
  ddc       = {610},
  cid       = {I:(DE-2719)1110007},
  pnm       = {352 - Disease Mechanisms (POF4-352)},
  pid       = {G:(DE-HGF)POF4-352},
  typ       = {PUB:(DE-HGF)16},
  pubmed    = {pmid:39657546},
  doi       = {10.1016/j.ejrad.2024.111825},
  url       = {https://pub.dzne.de/record/273976},
}