% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
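%
% For illustration, a minimal usage sketch in LaTeX (assumptions: this
% file is saved as “pub.bib”; the file name and surrounding document are
% illustrative, while the cite key is the one defined in the record below).
% Compile sequence: pdflatex → biber → pdflatex.
%
%   \documentclass{article}
%   \usepackage[utf8]{inputenc}  % no-op on recent pdfLaTeX, harmless to state
%   \usepackage[backend=biber]{biblatex}
%   \addbibresource{pub.bib}
%   \begin{document}
%   \cite{Ji:285039}
%   \printbibliography
%   \end{document}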

@ARTICLE{Ji:285039,
      author       = {Ji, Jiang and Li, Chenguang and Fu, Yibin and Zhao, Zihao
                      and Wu, Yiyang and Liang, Changhua and Wu, Yue},
      title        = {{C}omparison of online radiologists and large language
                      model chatbots in responding to common radiology-related
                      questions in {C}hinese: a cross-sectional comparative
                      analysis},
      journal      = {Quantitative Imaging in Medicine and Surgery},
      volume       = {16},
      number       = {2},
      issn         = {2223-4292},
      address      = {Hong Kong},
      publisher    = {AME Publ.},
      reportid     = {DZNE-2026-00163},
      pages        = {129},
      year         = {2026},
      abstract     = {Background: Additional avenues for medical counseling are
                      needed to better serve patients. In handling medical
                      counseling, large language model chatbots (LLM-chatbots)
                      have demonstrated near-physician expertise in comprehending
                      enquiries and providing professional advice. However, their
                      performance in addressing patients’ common
                      radiology-related concerns has yet to be evaluated. This
                      study thus aimed to investigate the effectiveness and model
                      performance of LLM-chatbots (DeepSeek-R1 and ChatGPT-4o) in
                      radiology-related medical consultation in the Chinese
                      context through both subjective evaluations and objective
                      metrics. Methods: In this cross-sectional study, common
                      radiology-related questions were collected from the HaoDF
                      online platform, one of the largest Chinese public
                      healthcare service platforms. All questions were posed to
                      the LLM-chatbots from February 24 to February 30, 2025. To
                      facilitate comparison between LLM-chatbots and online
                      radiologists, three senior radiologists from different
                      medical centers were recruited as reviewers, and they
                      blindly scored the responses using a 5-point
                      Likert scale across three subjective dimensions:
                      quality, empathy, and potential harm. Objective metrics
                      including textual features (six metrics across three
                      linguistic dimensions: lexical, syntactic, and semantic),
                      response time, and self-improvement capacity were
                      calculated as additional measures of the performance
                      of the two LLM-chatbots. Results: A total of 954
                      reviews were generated
                      for 318 responses to 106 questions. LLM-chatbots achieved
                      superior scores in quality, empathy, and potential harm
                      compared to the online radiologists (all P values <0.001).
                      Among the LLM-chatbots, DeepSeek-R1 outperformed ChatGPT-4o
                      in terms of quality scores [DeepSeek-R1: mean 4.40, standard
                      deviation (SD) 0.57; ChatGPT-4o: mean 3.73, SD 0.64;
                      P<0.001]. The response times were significantly shorter for
                      DeepSeek-R1 [median 56.00 s; interquartile range (IQR),
                      47–67 s] and ChatGPT-4o (median 12.17 s; IQR,
                      10.91–15.85 s) than for online radiologists (median
                      6,487.90 s; IQR, 3,530.50–29,061.70 s), and the
                      LLM-chatbots generated greater textual complexity (as
                      measured by six metrics across three linguistic dimensions:
                      lexical, syntactic, and semantic) (all P values <0.001).
                      Of the two chatbots, ChatGPT-4o generally produced
                      linguistically simpler responses (all P values <0.001)
                      and responded more quickly than DeepSeek-R1 across
                      various topics (P<0.001). Additionally,
                      both LLM-chatbots demonstrated a degree of self-improvement
                      ability. Conclusions: These findings highlight the potential
                      utility of LLM-chatbots in addressing the common
                      radiology-related inquiries initially posed by patients.
                      However, further optimization and validation are required to
                      establish this emerging technology as a productive and
                      effective pathway in medical counseling.},
      cin          = {AG Herms},
      ddc          = {610},
      cid          = {I:(DE-2719)1110001},
      pnm          = {352 - Disease Mechanisms (POF4-352)},
      pid          = {G:(DE-HGF)POF4-352},
      typ          = {PUB:(DE-HGF)16},
      doi          = {10.21037/qims-2025-1716},
      url          = {https://pub.dzne.de/record/285039},
}