<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="letter"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id><journal-id journal-id-type="publisher-id">mededu</journal-id><journal-id journal-id-type="index">20</journal-id><journal-title>JMIR Medical Education</journal-title><abbrev-journal-title>JMIR Med Educ</abbrev-journal-title><issn pub-type="epub">2369-3762</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v11i1e73698</article-id><article-id pub-id-type="doi">10.2196/73698</article-id><article-categories><subj-group subj-group-type="heading"><subject>Letter to the Editor</subject></subj-group></article-categories><title-group><article-title>Authors&#x2019; Reply: Citation Accuracy Challenges Posed by Large Language Models</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Temsah</surname><given-names>Mohamad-Hani</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Al-Eyadhy</surname><given-names>Ayman</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Jamal</surname><given-names>Amr</given-names></name><degrees>MBBS</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Alhasan</surname><given-names>Khalid</given-names></name><degrees>MBBS</degrees><xref ref-type="aff" 
rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Malki</surname><given-names>Khalid H</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib></contrib-group><aff id="aff1"><institution>Pediatric Department, College of Medicine, King Saud University</institution><addr-line>King Abdullah Road</addr-line><addr-line>Riyadh</addr-line><country>Saudi Arabia</country></aff><aff id="aff2"><institution>Department of Family and Community Medicine, King Saud University Medical City</institution><addr-line>Riyadh</addr-line><country>Saudi Arabia</country></aff><aff id="aff3"><institution>Research Chair of Voice, Swallowing, and Communication Disorders, Department of Otolaryngology-Head and Neck Surgery, College of Medicine, King Saud University</institution><addr-line>Riyadh</addr-line><country>Saudi Arabia</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Nedunchezhiyan</surname><given-names>Surya</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Mohamad-Hani Temsah, MD, Pediatric Department, College of Medicine, King Saud University, King Abdullah Road, Riyadh, 11424, Saudi Arabia, 966 114692002; <email>mtemsah@ksu.edu.sa</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>2</day><month>4</month><year>2025</year></pub-date><volume>11</volume><elocation-id>e73698</elocation-id><history><date date-type="received"><day>10</day><month>03</month><year>2025</year></date><date date-type="accepted"><day>12</day><month>03</month><year>2025</year></date></history><copyright-statement>&#x00A9; Mohamad-Hani Temsah, Ayman Al-Eyadhy, Amr Jamal, Khalid Alhasan, Khalid H Malki. Originally published in JMIR Medical Education (<ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org">https://mededu.jmir.org</ext-link>), 2.4.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org/">https://mededu.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mededu.jmir.org/2025/1/e73698"/><related-article related-article-type="commentary article" ext-link-type="doi" xlink:href="10.2196/72998" xlink:title="Comment on" xlink:type="simple">https://mededu.jmir.org/2025/1/e72998</related-article><related-article related-article-type="commentary article" ext-link-type="doi" xlink:href="10.2196/63400" xlink:title="Comment on" xlink:type="simple">https://mededu.jmir.org/2025/1/e63400</related-article><kwd-group><kwd>ChatGPT</kwd><kwd>Gemini</kwd><kwd>DeepSeek</kwd><kwd>medical education</kwd><kwd>AI</kwd><kwd>artificial intelligence</kwd><kwd>Saudi Arabia</kwd><kwd>perceptions</kwd><kwd>medical students</kwd><kwd>faculty</kwd><kwd>LLM</kwd><kwd>chatbot</kwd><kwd>qualitative study</kwd><kwd>thematic analysis</kwd><kwd>satisfaction</kwd><kwd>RAG retrieval-augmented generation</kwd></kwd-group></article-meta></front><body><p>We appreciate the thoughtful critique of our manuscript &#x201C;Perceptions and earliest experiences of medical students and faculty with ChatGPT in medical education: qualitative study&#x201D; [<xref ref-type="bibr" 
rid="ref1">1</xref>] by Zhao and Zhang [<xref ref-type="bibr" rid="ref2">2</xref>]. Concerns over the generation of hallucinated citations by large language models (LLMs), such as OpenAI&#x2019;s ChatGPT, Google&#x2019;s Gemini, and Hangzhou&#x2019;s DeepSeek, warrant exploring advanced and novel methodologies to ensure citation accuracy and overall output integrity [<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>The LLMs have demonstrated a propensity to generate well&#x2010;formatted yet fictitious references&#x2014;a limitation largely attributed to restricted access to subscription-based databases and their reliance on probabilistic text generation [<xref ref-type="bibr" rid="ref4">4</xref>]. As LLMs evolve, future iterations may integrate more reliable retrieval-based architectures, enhancing their capacity to cite legitimate sources while reducing fabricated references [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. However, until such improvements are systematically validated, scholars must remain cautious.</p><p>One suggested enhancement is using retrieval-augmented generation (RAG) [<xref ref-type="bibr" rid="ref6">6</xref>]. This approach integrates up-to-date external information, substantially improving real-world applicability. However, even RAG-based systems can misinterpret or distort source content under high-trust conditions. To address this, the authors developed Hallucination-Aware Tuning (HAT) [<xref ref-type="bibr" rid="ref6">6</xref>]. HAT trains dedicated detection models to generate labels and detailed descriptions of identified hallucinations. These descriptions are then used by GPT-4 to correct discrepancies. 
The combination of corrected and original outputs forms a preference dataset that, when used for Direct Preference Optimization training, yields LLMs with reduced hallucination rates and improved answer quality [<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>We also propose another solution aimed at fundamentally reducing citation errors: the development of &#x201C;Reference-Accurate&#x201D; academic LLMs by major global publishers. Leading journals could develop their own specialized LLM, trained exclusively on rigorously verified academic literature from robust databases. This targeted training would ensure that every generated reference is accurate and directly traceable to published work. Ideally, these publisher-backed LLMs would be made freely available to promote open science.</p><p>Therefore, we recommend a dual approach that combines advanced RAG methodologies with publisher-developed academic LLMs. Comparative studies should be conducted to evaluate the citation accuracy, factual consistency, and overall performance of RAG-HAT-tuned models against these publisher-specific models. Collaborative efforts among academic institutions, publishers, and AI developers are essential to establish standardized protocols and reliable training datasets. Such partnerships would not only enhance the reliability of LLM-generated outputs but also foster greater trust in AI-assisted scholarly communication.</p><p>Moreover, the broader academic community bears responsibility for critically appraising AI-generated content. While LLMs can streamline information retrieval and synthesis, human oversight remains indispensable for safeguarding academic integrity. Rather than dismissing AI-driven tools due to their current flaws, we advocate for further research to ensure greater alignment with evidence-based scholarship and authentic publications. 
Future LLM iterations may rapidly overcome these limitations, but until then, transparency, responsible usage, and ongoing improvements in AI training remain imperative.</p><p>In conclusion, while RAG augmented by HAT represents a potential advancement in reducing hallucinations, the development of specialized, reference-accurate academic LLMs by publishers may offer a promising pathway. By integrating both strategies and ensuring human oversight, the academic community can ensure that AI-driven tools reliably support the rigor and transparency essential to scholarly research.</p></body><back><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">HAT</term><def><p>Hallucination-Aware Tuning</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb3">RAG</term><def><p>retrieval-augmented generation</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abouammoh</surname><given-names>N</given-names> </name><name name-style="western"><surname>Alhasan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Aljamaan</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Perceptions and earliest experiences of medical students and faculty with ChatGPT in medical education: qualitative study</article-title><source>JMIR Med Educ</source><year>2025</year><month>02</month><day>20</day><volume>11</volume><fpage>e63400</fpage><pub-id pub-id-type="doi">10.2196/63400</pub-id><pub-id pub-id-type="medline">39977012</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Zhang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>T</given-names> </name></person-group><article-title>Citation accuracy challenges posed by large language models</article-title><source>JMIR Med Educ</source><year>2025</year><comment><ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org/2025/1/e72998">https://mededu.jmir.org/2025/1/e72998</ext-link></comment><pub-id pub-id-type="doi">10.2196/72998</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Temsah</surname><given-names>A</given-names> </name><name name-style="western"><surname>Alhasan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Altamimi</surname><given-names>I</given-names> </name><etal/></person-group><article-title>DeepSeek in healthcare: revealing opportunities and steering challenges of a new open-source artificial intelligence frontier</article-title><source>Cureus</source><year>2025</year><month>02</month><volume>17</volume><issue>2</issue><fpage>e79221</fpage><pub-id pub-id-type="doi">10.7759/cureus.79221</pub-id><pub-id pub-id-type="medline">39974299</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Aljamaan</surname><given-names>F</given-names> </name><name name-style="western"><surname>Temsah</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Altamimi</surname><given-names>I</given-names> </name><etal/></person-group><article-title>Reference hallucination score for medical artificial intelligence chatbots: development and usability study</article-title><source>JMIR Med Inform</source><year>2024</year><month>07</month><day>31</day><volume>12</volume><fpage>e54345</fpage><pub-id 
pub-id-type="doi">10.2196/54345</pub-id><pub-id pub-id-type="medline">39083799</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Howard</surname><given-names>A</given-names> </name><name name-style="western"><surname>Hope</surname><given-names>W</given-names> </name><name name-style="western"><surname>Gerada</surname><given-names>A</given-names> </name></person-group><article-title>ChatGPT and antimicrobial advice: the end of the consulting infection doctor?</article-title><source>Lancet Infect Dis</source><year>2023</year><month>04</month><volume>23</volume><issue>4</issue><fpage>405</fpage><lpage>406</lpage><pub-id pub-id-type="doi">10.1016/S1473-3099(23)00113-5</pub-id><pub-id pub-id-type="medline">36822213</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Song</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>J</given-names> </name><etal/></person-group><article-title>RAG-HAT: a hallucination-aware tuning pipeline for LLM in retrieval-augmented generation</article-title><year>2024</year><conf-name>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-loc>Miami, Florida, US</conf-loc><pub-id pub-id-type="doi">10.18653/v1/2024.emnlp-industry.113</pub-id></nlm-citation></ref></ref-list></back></article>