% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@ARTICLE{Ji:285039,
author = {Ji, Jiang and Li, Chenguang and Fu, Yibin and Zhao, Zihao
and Wu, Yiyang and Liang, Changhua and Wu, Yue},
title = {{C}omparison of online radiologists and large language
model chatbots in responding to common radiology-related
questions in {C}hinese: a cross-sectional comparative
analysis},
journal = {Quantitative Imaging in Medicine and Surgery},
volume = {16},
number = {2},
issn = {2223-4292},
address = {Hong Kong},
publisher = {AME Publishing Company},
reportid = {DZNE-2026-00163},
pages = {129},
year = {2026},
abstract = {Background: Additional avenues for medical counseling are
needed to better serve patients. In handling medical
counseling, large language model chatbots (LLM-chatbots)
have demonstrated near-physician expertise in comprehending
enquiries and providing professional advice. However, their
performance in addressing patients’ common
radiology-related concerns has yet to be evaluated. This
study thus aimed to investigate the effectiveness and model
performance of LLM-chatbots (DeepSeek-R1 and ChatGPT-4o) in
radiology-related medical consultation in the Chinese
context through both subjective evaluations and objective
metrics. Methods: In this cross-sectional study, common
radiology-related questions were collected from the HaoDF
online platform, one of the largest Chinese public
healthcare service platforms. All questions were posed to
the LLM-chatbots from February 24 to February 30, 2025. To
facilitate comparison between LLM-chatbots and online
radiologists, three senior radiologists from different
medical centers were recruited as reviewers, and they
blindly scored all responses using a 5-point Likert
scale across three subjective dimensions: quality,
empathy, and potential harm. Objective metrics,
including textual features (six metrics across three
linguistic dimensions: lexical, syntactic, and semantic),
response time, and self-improvement capacity, were
calculated as additional measures of the performance of
the two LLM-chatbots. Results: A total of 954 reviews were generated
for 318 responses to 106 questions. LLM-chatbots achieved
superior scores for quality, empathy, and potential harm
compared with the online radiologists (all P values <0.001).
Among the LLM-chatbots, DeepSeek-R1 outperformed ChatGPT-4o
in terms of quality scores [DeepSeek-R1: mean 4.40, standard
deviation (SD) 0.57; ChatGPT-4o: mean 3.73, SD 0.64;
P<0.001]. The response times were significantly shorter for
DeepSeek-R1 [median 56.00 s; interquartile range (IQR),
47–67 s] and ChatGPT-4o (median 12.17 s; IQR,
10.91–15.85 s) as compared to online radiologists (median
6,487.90 s; IQR, 3,530.50–29,061.70 s), and the
LLM-chatbots generated greater textual complexity across
all six linguistic metrics (all P values <0.001). Between
the two chatbots, ChatGPT-4o generally produced
linguistically simpler responses (all P values <0.001) and
responded faster than DeepSeek-R1 across various topics
(P<0.001). Additionally,
both LLM-chatbots demonstrated a degree of self-improvement
ability. Conclusions: These findings highlight the potential
utility of LLM-chatbots in addressing the common
radiology-related inquiries initially posed by patients.
However, further optimization and validation are required to
establish this emerging technology as a productive and
effective pathway in medical counseling.},
cin = {AG Herms},
ddc = {610},
cid = {I:(DE-2719)1110001},
pnm = {352 - Disease Mechanisms (POF4-352)},
pid = {G:(DE-HGF)POF4-352},
typ = {PUB:(DE-HGF)16},
doi = {10.21037/qims-2025-1716},
url = {https://pub.dzne.de/record/285039},
}
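% A minimal usage sketch for citing this record (assumptions: the file is
% saved as "refs.bib"; the document class, citation style, and file name
% below are illustrative, not prescribed by this record). Per the encoding
% note above, process the bibliography with biber rather than BibTeX 0.99:
%
%   \documentclass{article}
%   \usepackage[backend=biber, style=numeric]{biblatex}
%   \addbibresource{refs.bib}
%   \begin{document}
%   LLM chatbots have been compared with online radiologists in
%   Chinese-language radiology consultations \cite{Ji:285039}.
%   \printbibliography
%   \end{document}
%
% Typical build: pdflatex, then biber, then pdflatex again.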