<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id><journal-id journal-id-type="publisher-id">mededu</journal-id><journal-id journal-id-type="index">20</journal-id><journal-title>JMIR Medical Education</journal-title><abbrev-journal-title>JMIR Med Educ</abbrev-journal-title><issn pub-type="epub">2369-3762</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v11i1e58375</article-id><article-id pub-id-type="doi">10.2196/58375</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Performance of Plug-In Augmented ChatGPT and Its Ability to Quantify Uncertainty: Simulation Study on the German Medical Board Examination</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Madrid</surname><given-names>Julian</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Diehl</surname><given-names>Philipp</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Selig</surname><given-names>Mischa</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Rolauffs</surname><given-names>Bernd</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hans</surname><given-names>Felix Patricius</given-names></name><degrees>MD, MSc</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Busch</surname><given-names>Hans-J&#x00F6;rg</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Scheef</surname><given-names>Tobias</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Benning</surname><given-names>Leo</given-names></name><degrees>MPH, MD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Cardiology, Pneumology, Angiology, Acute Geriatrics and Intensive Care, Ortenau Klinikum</institution><addr-line>Klosterstrasse 18</addr-line><addr-line>Lahr</addr-line><country>Germany</country></aff><aff id="aff2"><institution>Faculty of Medicine, University of Freiburg</institution><addr-line>Freiburg</addr-line><country>Germany</country></aff><aff id="aff3"><institution>G.E.R.N. 
Research Center for Tissue Replacement, Regeneration and Neogenesis, Department of Orthopedics and Trauma Surgery, University of Freiburg</institution><addr-line>Freiburg</addr-line><country>Germany</country></aff><aff id="aff4"><institution>University Emergency Center, Medical Center, University of Freiburg</institution><addr-line>Freiburg</addr-line><country>Germany</country></aff><aff id="aff5"><institution>Department of Diagnostic and Interventional Radiology, Medical Center, University of Freiburg</institution><addr-line>Freiburg</addr-line><country>Germany</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Lesselroth</surname><given-names>Blake</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Gill</surname><given-names>Gerard</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Nakao</surname><given-names>Takahiro</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Julian Madrid, MD, Department of Cardiology, Pneumology, Angiology, Acute Geriatrics and Intensive Care, Ortenau Klinikum, Klosterstrasse 18, Lahr, 77933, Germany, 49 7821932403; <email>julian.madrid@ortenau-klinikum.de</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>21</day><month>3</month><year>2025</year></pub-date><volume>11</volume><elocation-id>e58375</elocation-id><history><date date-type="received"><day>14</day><month>03</month><year>2024</year></date><date date-type="rev-recd"><day>29</day><month>07</month><year>2024</year></date><date date-type="accepted"><day>23</day><month>11</month><year>2024</year></date></history><copyright-statement>&#x00A9; Julian Madrid, Philipp Diehl, Mischa Selig, Bernd Rolauffs, Felix 
Patricius Hans, Hans-J&#x00F6;rg Busch, Tobias Scheef, Leo Benning. Originally published in JMIR Medical Education (<ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org">https://mededu.jmir.org</ext-link>), 21.3.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org/">https://mededu.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mededu.jmir.org/2025/1/e58375"/><abstract><sec><title>Background</title><p>The GPT-4 is a large language model (LLM) trained and fine-tuned on an extensive dataset. After the public release of its predecessor in November 2022, the use of LLMs has seen a significant spike in interest, and a multitude of potential use cases have been proposed. In parallel, however, important limitations have been outlined. Particularly, current LLMs encounter limitations, especially in symbolic representation and accessing contemporary data. 
The recent version of GPT-4, alongside newly released plugin features, has been introduced to mitigate some of these limitations.</p></sec><sec><title>Objective</title><p>Against this background, this work aims to investigate the performance of GPT-3.5, GPT-4, GPT-4 with plugins, and GPT-4 with plugins using pretranslated English text on the German medical board examination. Recognizing the critical importance of quantifying uncertainty for LLM applications in medicine, we furthermore assess this ability and develop a new metric termed &#x201C;confidence accuracy&#x201D; to evaluate it.</p></sec><sec sec-type="methods"><title>Methods</title><p>We used GPT-3.5, GPT-4, GPT-4 with plugins, and GPT-4 with plugins and translation to answer questions from the German medical board examination. Additionally, we conducted an analysis to assess how the models justify their answers, the accuracy of their responses, and the error structure of their answers. Bootstrapping and CIs were used to evaluate the statistical significance of our findings.</p></sec><sec sec-type="results"><title>Results</title><p>This study demonstrated that available GPT models, as LLM examples, exceeded the minimum competency threshold established by the German medical board for medical students to obtain board certification to practice medicine. Moreover, the models could assess the uncertainty in their responses, albeit exhibiting overconfidence. Additionally, this work unraveled certain justification and reasoning structures that emerge when GPT generates answers.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The high performance of GPTs in answering medical questions positions it well for applications in academia and, potentially, clinical practice. Its capability to quantify uncertainty in answers suggests it could be a valuable artificial intelligence agent within the clinical decision-making loop. 
Nevertheless, significant challenges must be addressed before artificial intelligence agents can be robustly and safely implemented in the medical domain.</p></sec></abstract><kwd-group><kwd>medical education</kwd><kwd>artificial intelligence</kwd><kwd>generative AI</kwd><kwd>large language model</kwd><kwd>LLM</kwd><kwd>ChatGPT</kwd><kwd>GPT-4</kwd><kwd>board licensing examination</kwd><kwd>professional education</kwd><kwd>examination</kwd><kwd>student</kwd><kwd>experimental</kwd><kwd>bootstrapping</kwd><kwd>confidence interval</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>The GPT&#x2014;recently updated to its fourth iteration (GPT-4)&#x2014;is a generative and autoregressive large language model (LLM). It is pretrained on a vast corpus of internet text and fine-tuned on a labeled dataset using a transformer architecture [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. GPT generates coherent and contextually appropriate text. It likely discovered a semantic grammar of language (ie, semantic regularities), enabling it to construct semantically and syntactically correct sentences [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. However, GPT does not perform meaningful computations on symbolic representations [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. The Wolfram language, a Turing-complete computational language, in contrast, allows such symbolic representation. GPT and the Wolfram language combined hence cover 2 different aspects of human cognition [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. Combining these features, particularly when computation and symbolic representations are needed, represents a significant step toward general artificial intelligence (AI). 
This combination has already been successfully used to examine contradictions in Einstein&#x2019;s Special Theory of Relativity equations [<xref ref-type="bibr" rid="ref11">11</xref>].</p><p>In the light of these technological advances, LLMs show increasing promise in supporting medical training and practice. However, the models must acquire an in-depth and accurate representation of medical knowledge to be used in these sensitive domains. A medical board examination exemplifies these domains well, as it determines the qualification of medical students to obtain their license to practice medicine.</p><p>Our primary outcome is the model&#x2019;s ability to achieve the minimum required score for passing the 2 written parts of the German medical licensing examination. This task poses a different challenge to an LLM than medical board examinations in the English language [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>], as the performance of such models in other languages and in combination with more recent GPT versions and available plugins has not been explored. In the medical field, where mistakes can have severe consequences, assessing the amount of uncertainty is of paramount importance [<xref ref-type="bibr" rid="ref14">14</xref>]. It is therefore crucial to gain insights into the depth and structure of the LLMs&#x2019; medical knowledge representation and where its limitations lie [<xref ref-type="bibr" rid="ref15">15</xref>]. Hence, our secondary outcomes were the total correct answer rates, the presence of logical justification of the answer, the presence of information internal to the question, the presence of information external to the question, the confidence GPT displays in its answers, the difficulty of the question, information errors, logical errors, reasoning errors, and the correctness of a second try answer when the first answer was wrong. 
Insights into these 2 dimensions of outcomes can contribute to facilitating a meaningful use of novel LLM technologies in the medical domain.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Medical Board Examination Dataset</title><p>The German medical board examination consists of 3 steps. The first board examination, taken after 2 years of study, primarily covers basic natural sciences. It comprises 320 questions, which students answer over 2 consecutive days. The second board examination takes place after 6 years of study. It likewise consists of 320 medical questions, which students answer over 3 consecutive days. The third board examination, also after 6 years of study, is an oral examination and was, hence, excluded from this study. The German medical board examination takes place biannually, once in spring and once in fall. As a representative sample, we used the medical board examination from spring 2023. We excluded questions the medical board examination committee deemed inconsistent with the medical literature in the regular post examination review of the content. Additionally, we did not include questions displaying images, as GPT models could not analyze them at the time of our analysis. Furthermore, LLMs are not able to analyze images; GPT-4 Vision, which became broadly accessible in the second half of 2023, combines computer vision algorithms&#x2014;which generate a text description of images&#x2014;and LLMs to analyze this text. All questions and answers were exported from AMBOSS SE, a German medical education content creator and service provider.</p></sec><sec id="s2-2"><title>GPT Models and Prompt Engineering</title><p>We evaluated several GPT models with varying characteristics using OpenAI&#x2019;s web interface. 
The models tested included GPT-3.5, GPT-4, GPT-4 integrated with the Wolfram, ScholarAI, and Web Request (WeGPT.ai) plugins, and GPT-4 integrated with the Wolfram, ScholarAI, Web Request plugins, and an additional feature for translating German inputs into English. We did not investigate earlier versions of GPT as they demonstrated lower performance in a similar study on the American medical board examination [<xref ref-type="bibr" rid="ref12">12</xref>].</p><p>Creating a precise and adequate context is crucial for generating expected results [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. Thus, we aimed to be as specific as possible, simulating the context of a medical student taking the medical board examination. The prompts hence included the request to answer each respective question with 5 possible answers, where only 1 answer was correct. We asked the models to justify their choices based on the provided patient case information, and to estimate their confidence in the answer&#x2019;s accuracy as a percentage of maximal confidence (ie, 100%). If the selected answer was incorrect, the GPT models were asked to explain their mistake in a second attempt. For the GPT-4 model with plugin integration, we asked the model to use the available plugins (Wolfram, ScholarAI, and Web Request). For the GPT-4 model with plugin integration and English translation, we first asked the model to translate the input into the English language, and then to use the translated text to perform the abovementioned tasks. All used prompts are available in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-3"><title>Model Testing and Outcome Parameters</title><p>For each GPT model, we used the appropriate prompt followed by the question and the possible answers. 
The investigators then analyzed the GPT&#x2019;s answer to assess the defined primary and secondary outcomes, which were either binary or in proportions. In cases of uncertainty, the investigators (JM, TS, and LB) convened to resolve the issue.</p><p>First, the correctness of the answer was recorded (binary variable), followed by the presence of logical justification, the presence of information internal to the question, and the presence of information external to the question (binary variables).</p><p>Next, we recorded the model&#x2019;s confidence in its answer (proportion), and the difficulty of the question, derived from the number of students who answered correctly on the AMBOSS platform (proportion).</p><p>To enhance our understanding of where GPT models falter, we sought to classify potential errors. As literature on error types is limited, we conducted a formal analysis to determine distinctive error types and established a formal definition. We propose a classification into 3 categories: information error, logical error, and reasoning error.</p><p>The GPT response can be formalized as &#x201C;answer A&#x201D; is given &#x201C;link&#x201D; because of &#x201C;information B.&#x201D; There are only three possibilities for errors: (1) &#x201C;answer A&#x201D; is incorrect because &#x201C;information B&#x201D; is incorrect&#x2014;termed an information error; (2) &#x201C;answer A&#x201D; is incorrect while &#x201C;information B&#x201D; is correct, but the link between them is incorrect&#x2014;termed a logical error; (3) &#x201C;answer A&#x201D; is incorrect, &#x201C;information B&#x201D; is correct, and the link between &#x201C;answer A&#x201D; and &#x201C;information B&#x201D; is correct&#x2014;termed a reasoning error (<xref ref-type="fig" rid="figure1">Figure 1</xref>). If the answer provided was incorrect, the investigator informed the GPT of its faulty answer, recorded whether it understood its mistake, and provided the correct answer in a second attempt. 
In the models with integrated plugin use, the active use of plugins was documented for Wolfram, ScholarAI, and Web Requests (binary variables).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Formal definition of error types; we propose a classification into 3 categories: information error, logical error, and reasoning error. The GPT response can be formalized as &#x201C;answer A&#x201D; is given &#x201C;link&#x201D; because of &#x201C;information B.&#x201D; There are only three possibilities for errors: (1) &#x201C;answer A&#x201D; is incorrect because &#x201C;information B&#x201D; is incorrect&#x2014;termed an information error; (2) &#x201C;answer A&#x201D; is incorrect while &#x201C;information B&#x201D; is correct, but the link between them is incorrect&#x2014;termed a logical error; and (3) &#x201C;answer A&#x201D; is incorrect, &#x201C;information B&#x201D; is correct, and the link between &#x201C;answer A&#x201D; and &#x201C;information B&#x201D; is correct&#x2014;termed a reasoning error.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v11i1e58375_fig01.png"/></fig></sec><sec id="s2-4"><title>Data Analysis</title><p>Summary statistics were calculated for the outcome variables (<xref ref-type="table" rid="table1">Table 1</xref> and <xref ref-type="supplementary-material" rid="app2">Multimedia Appendices 2</xref> and <xref ref-type="supplementary-material" rid="app3">3</xref>). Dichotomous variables were represented by frequency and proportions with 95% CIs, while continuous variables were expressed as mean values with 95% CIs. 
Uncertainty calculations displayed as 95% CIs were computed via bootstrapping [<xref ref-type="bibr" rid="ref18">18</xref>].</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Characteristics of GPT model answers (N=541).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top"/><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">GPT-4 + plugin</td><td align="left" valign="top">GPT-4 + plugin + translation</td></tr></thead><tbody><tr><td align="left" valign="top">Correct answer (proportion&#x00B1;95% CI)</td><td align="char" char="." valign="top">373 (0.69&#x00B1;0.65 to 0.73)</td><td align="char" char="." valign="top">493 (0.91&#x00B1;0.89 to 0.93)</td><td align="char" char="." valign="top">493 (0.91&#x00B1;0.89 to 0.94)</td><td align="char" char="." valign="top">486 (0.9&#x00B1;0.87 to 0.92)</td></tr><tr><td align="left" valign="top">Logical justification (proportion&#x00B1;95% CI)</td><td align="char" char="." valign="top">479 (0.89&#x00B1;0.86 to 0.91)</td><td align="char" char="." valign="top">526 (0.97&#x00B1;0.96 to 0.98)</td><td align="char" char="." valign="top">529 (0.98&#x00B1;0.96 to 0.99)</td><td align="char" char="." valign="top">527 (0.97&#x00B1;0.96 to 0.99)</td></tr><tr><td align="left" valign="top">Question&#x2019;s difficulty mean (&#x00B1;95% CI)</td><td align="char" char="." valign="top">0.288 (0.272 to 0.303)</td><td align="char" char="." valign="top">0.288 (0.272 to 0.303)</td><td align="char" char="." valign="top">0.288 (0.272 to 0.303)</td><td align="char" char="." valign="top">0.288 (0.272 to 0.303)</td></tr><tr><td align="left" valign="top">Error overall (proportion&#x00B1;95% CI)</td><td align="char" char="." valign="top">168 (0.31&#x00B1;0.27 to 0.35)</td><td align="char" char="." valign="top">48 (0.09&#x00B1;0.07 to 0.11)</td><td align="char" char="." 
valign="top">48 (0.09&#x00B1;0.06 to 0.11)</td><td align="char" char="." valign="top">55 (0.1&#x00B1;0.08 to 0.13)</td></tr><tr><td align="left" valign="top">Presence of internal information (proportion&#x00B1;95% CI)</td><td align="char" char="." valign="top">521 (0.96&#x00B1;0.95 to 0.98)</td><td align="char" char="." valign="top">537 (0.99&#x00B1;0.98 to 1)</td><td align="char" char="." valign="top">537 (0.99&#x00B1;0.98 to 1)</td><td align="char" char="." valign="top">537 (0.99&#x00B1;0.98 to 1)</td></tr><tr><td align="left" valign="top">Presence of external information (proportion&#x00B1;95% CI)</td><td align="char" char="." valign="top">538 (0.99&#x00B1;0.99 to 1)</td><td align="char" char="." valign="top">540 (1&#x00B1;0.99 to 1)</td><td align="char" char="." valign="top">541 (1&#x00B1;1 to 1)</td><td align="char" char="." valign="top">541 (1&#x00B1;1 to 1)</td></tr><tr><td align="left" valign="top">Information error (proportion&#x00B1;95% CI)</td><td align="char" char="." valign="top">37 (0.22&#x00B1;0.16 to 0.29)</td><td align="char" char="." valign="top">5 (0.1&#x00B1;0.02 to 0.19)</td><td align="char" char="." valign="top">5 (0.1&#x00B1;0.02 to 0.2)</td><td align="char" char="." valign="top">7 (0.13&#x00B1;0.05 to 0.22)</td></tr><tr><td align="left" valign="top">Logical error (proportion&#x00B1;95% CI)</td><td align="char" char="." valign="top">61 (0.36&#x00B1;0.29 to 0.43)</td><td align="char" char="." valign="top">18 (0.38&#x00B1;0.25 to 0.52)</td><td align="char" char="." valign="top">12 (0.25&#x00B1;0.125 to 0.375)</td><td align="char" char="." valign="top">19 (0.35&#x00B1;0.22 to 0.47)</td></tr><tr><td align="left" valign="top">Confidence mean (&#x00B1;95% CI)</td><td align="char" char="." valign="top">0.912 (0.904 to 0.918)</td><td align="char" char="." valign="top">0.938 (0.934 to 0.942)</td><td align="char" char="." valign="top">0.919 (0.915 to 0.924)</td><td align="char" char="." 
valign="top">0.919 (0.915 to 0.923)</td></tr><tr><td align="left" valign="top">Use of plugin Wolfram (proportion&#x00B1;95% CI)</td><td align="left" valign="top">N/A<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top">N/A</td><td align="char" char="." valign="top">50 (0.09&#x00B1;0.07 to 0.12)</td><td align="char" char="." valign="top">47 (0.09&#x00B1;0.06 to 0.11)</td></tr><tr><td align="left" valign="top">Reasoning error (proportion&#x00B1;95% CI)</td><td align="char" char="." valign="top">72 (0.42&#x00B1;0.36 to 0.51)</td><td align="char" char="." valign="top">26 (0.54&#x00B1;0.4 to 0.69)</td><td align="char" char="." valign="top">30 (0.63&#x00B1;0.48 to 0.75)</td><td align="char" char="." valign="top">29 (0.53&#x00B1;0.4 to 0.65)</td></tr><tr><td align="left" valign="top">Correct answer in second try (proportion&#x00B1;95% CI)</td><td align="char" char="." valign="top">90 (0.54&#x00B1;0.46 to 0.61)</td><td align="char" char="." valign="top">32 (0.67&#x00B1;0.52 to 0.79)</td><td align="char" char="." valign="top">36 (0.75&#x00B1;0.63 to 0.88)</td><td align="char" char="." valign="top">33 (0.6&#x00B1;0.47 to 0.73)</td></tr><tr><td align="left" valign="top">Use of plugin ScholarAI (proportion&#x00B1;95% CI)</td><td align="left" valign="top">N/A</td><td align="left" valign="top">N/A</td><td align="char" char="." valign="top">107 (0.2&#x00B1;0.16 to 0.23)</td><td align="char" char="." valign="top">47 (0.09&#x00B1;0.06 to 0.11)</td></tr><tr><td align="left" valign="top">Use of plugin web requests (proportion&#x00B1;95% CI)</td><td align="left" valign="top">N/A</td><td align="left" valign="top">N/A</td><td align="char" char="." valign="top">2 (0.003&#x00B1;0 to 0.01)</td><td align="char" char="." 
valign="top">25 (0.05&#x00B1;0.03 to 0.06)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>N/A: not applicable.</p></fn></table-wrap-foot></table-wrap><p>The primary outcome was determined by comparing the performance of the GPT-4 model, integrated with the plugins and the English translation, to the required passing score for the medical board examination, which is 60%. The difference of proportions was calculated with 95% CI using bootstrapping (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>).</p><p>Subsequently, secondary outcomes were calculated: the final examination rate for each GPT model was compared to both chance and the required passing score for the medical board examination. The difference of proportions was calculated with 95% CI using bootstrapping (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>).</p><p>The proportions of logical justification within the answer, information internal to the answer, and information external to the answer were compared between correct and incorrect responses. 
The difference of proportions was calculated with 95% CI using bootstrapping (<xref ref-type="table" rid="table2">Table 2</xref> and <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Analysis of plugin-integrated GPT-4 model answers.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top" colspan="3"/><td align="left" valign="top">All correct answers (n=493)</td><td align="left" valign="top">All incorrect answers (n=48)</td><td align="left" valign="top">Difference in proportions or Cohen <italic>d</italic> or Pearson <italic>r</italic> (&#x00B1;95% CI)</td><td align="left" valign="top">Confidence accuracy (&#x00B1;95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="7">Comparison of GPT models justifications between correct and incorrect answers</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="6">GPT-4 + plugin (N=541)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Logical justification (proportion &#x00B1;95% CI)</td><td align="char" char="." valign="top">493 (1&#x00B1;1 to 1)</td><td align="char" char="." valign="top">36 (0.75&#x00B1;0.63 to 0.88)</td><td align="char" char="." valign="top">0.25 (0.13 to 0.38)<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="char" char="." valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Internal information (proportion &#x00B1;95% CI)</td><td align="char" char="." valign="top">489 (0.99&#x00B1;0.983 to 0.998)</td><td align="char" char="." valign="top">48 (1&#x00B1;1 to 1)</td><td align="char" char="." 
valign="top">0 (-0.01 to 0)<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="7">Comparison of GPT models justifications between correct and incorrect answers</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">External information (proportion &#x00B1;95% CI)</td><td align="char" char="." valign="top">493 (1&#x00B1;1 to 1)</td><td align="char" char="." valign="top">48 (1&#x00B1;1 to 1)</td><td align="char" char="." valign="top">0 (0 to 0)<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="7">Confidence of GPT models compared between correct and incorrect answers</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="6">GPT-4 + plugin (N=541)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Confidence mean (&#x00B1;95% CI)</td><td align="char" char="." valign="top">0.923 (0.918 to 0.928)</td><td align="char" char="." valign="top">0.886 (0.87 to 0.901)</td><td align="char" char="." valign="top">-0.69 (-0.99 to -0.39)<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="char" char="." valign="top">0.037 (0.021 to 0.053)</td></tr><tr><td align="left" valign="top" colspan="7">Comparison of question&#x2019;s difficulty of GPT models between correct and incorrect answers</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="6">GPT-4 + plugin (N=541)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Question&#x2019;s difficulty mean (&#x00B1;95% CI)</td><td align="char" char="." valign="top">0.279 (0.263 to 0.295)</td><td align="char" char="." valign="top">0.379 (0.327 to 0.438)</td><td align="char" char="." 
valign="top">0.57 (0.27 to 0.86)<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="7">Correlation of confidence and question&#x2019;s difficulty for all answers</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="4">GPT-4 + plugin (N=541)</td><td align="char" char="." valign="top">-0.0874 (-0.176 to 0.004)<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Confidence mean (&#x00B1;95% CI)</td><td align="char" char="." valign="top">0.920 (0.916 to 0.924)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top"/><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Question&#x2019;s difficulty mean (&#x00B1;95% CI)</td><td align="left" valign="top">&#x2014;</td><td align="char" char="." valign="top">0.288 (0.273 to 0.304)</td><td align="left" valign="top"/><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="7">Comparison of correct answers between GPT models (N=541)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="6">GPT-4 + plugin vs GPT-3.5</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Correct answer rate (proportion &#x00B1;95% CI)</td><td align="char" char="." valign="top">373 (0.69&#x00B1;0.65 to 0.73)</td><td align="char" char="." valign="top">493 (0.91&#x00B1;0.89 to 0.94)</td><td align="char" char="." 
valign="top">0.22 (0.18 to 0.27)<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="6">GPT-4 + plugin vs GPT-4</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Correct answer rate (proportion &#x00B1;95% CI)</td><td align="char" char="." valign="top">493 (0.91&#x00B1;0.89 to 0.94)</td><td align="char" char="." valign="top">493 (0.91&#x00B1;0.89 to 0.94)</td><td align="char" char="." valign="top">0 (-0.03 to 0.03)<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="6">GPT-4 + plugin vs GPT-4 + plugin + translation</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Correct answer rate (proportion &#x00B1;95% CI)</td><td align="char" char="." valign="top">493 (0.91&#x00B1;0.89 to 0.94)</td><td align="char" char="." valign="top">486 (0.9&#x00B1;0.87 to 0.92)</td><td align="char" char="." valign="top">-0.01 (-0.05 to 0.02)<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">&#x2014;</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Difference in proportions.</p></fn><fn id="table2fn2"><p><sup>b</sup>Not available.</p></fn><fn id="table2fn3"><p><sup>c</sup>Cohen <italic>d</italic>.</p></fn><fn id="table2fn4"><p><sup>d</sup>Pearson <italic>r</italic>.</p></fn></table-wrap-foot></table-wrap><p>The model&#x2019;s confidence in its answers was compared between correct and incorrect responses. Additionally, the relationship between the model&#x2019;s confidence in its answers and the difficulty of the question was assessed. 
Cohen <italic>d</italic> values and 95% CI were computed using a linear regression model and bootstrapping (<xref ref-type="table" rid="table2">Table 2</xref> and <xref ref-type="supplementary-material" rid="app6">Multimedia Appendices 6</xref> and <xref ref-type="supplementary-material" rid="app7">7</xref>).</p><p>To evaluate the accuracy of the model&#x2019;s confidence in its answers, we developed a parameter termed confidence accuracy (CA). It is defined as follows:</p><p>CA = (confidence of correct answers in percentage &#x2013; confidence of incorrect answers in percentage)/100</p><p>Consequently, this parameter can take values from &#x2212;1 to 1, where 1 accurately reflects the model&#x2019;s uncertainty, 0 indicates no ability to quantify uncertainty, and &#x2212;1 suggests incorrect quantification.</p><p>The difficulty of the question was assessed using real correct response proportions from students available on the AMBOSS platform. The difficulty was assessed as follows:</p><p>Difficulty=1 &#x2013; correct answer proportion</p><p>Then, the difficulty of the question was compared between correct and incorrect answers, with Cohen <italic>d</italic> calculated using a linear regression model (<xref ref-type="table" rid="table2">Table 2</xref> and <xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref>).</p><p>Furthermore, we compared the proportion of correct answers between models (<xref ref-type="table" rid="table2">Table 2</xref> and <xref ref-type="supplementary-material" rid="app8">Multimedia Appendix 8</xref>).</p><p>We compared the proportion of correct answers in the GPT-4 models with the proportion of correct answers in the answers where a plugin has been used. We compared the proportion of plugin usage in GPT models with German and English input. We compared the confidence of the model when using plugins to the confidence of the model overall. 
We compared the proportion of correct answers when averaging the 4 different models to each model in particular (<xref ref-type="supplementary-material" rid="app9">Multimedia Appendix 9</xref>).</p><p>In instances where questions were accompanied by images, GPT models sometimes responded by describing the image, although the models could not access the respective images. This phenomenon is known as a type of hallucination [<xref ref-type="bibr" rid="ref19">19</xref>]. Therefore, we compared the proportion of hallucinations present in each model when answering questions, including image questions. We calculated the proportion of correct answers for each model when keeping the questions with pictures (<xref ref-type="supplementary-material" rid="app9">Multimedia Appendix 9</xref>).</p><p>We compared the different error proportions between different models. We compared the proportion of logical errors when using the Wolfram plugin to the proportion of errors when using the entire model. We compared correct second-try answers between different models (<xref ref-type="supplementary-material" rid="app9">Multimedia Appendix 9</xref>).</p><p>The 95% CIs were calculated using bootstrapping. Where necessary, parametric assumptions were tested using quantile-quantile plots for normality and Levene tests for the homogeneity of variances. The independence of question answers was assumed. All statistical analyses were performed in RStudio (version 2023.06.0+421). The significance level for all tests was set a priori at 95% CI.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>All tests were performed on the 541 questions of the German medical board examination from spring 2023. Subanalyses were performed on other subgroups; the respective sample sizes are indicated in the appropriate tables. 
All results for GPT-3.5, GPT-4, GPT-4 + plugin (GPT4P), and GPT-4 + plugin + translation (GPT4PT) are listed in full detail in the tables and the supplementary materials. To ensure legibility, only relevant results are addressed in the results section.</p><p>Descriptive statistics with CIs for the first board examination, second board examination, and the overall examination are displayed in <xref ref-type="table" rid="table1">Table 1</xref> and <xref ref-type="supplementary-material" rid="app2">Multimedia Appendices 2</xref> and <xref ref-type="supplementary-material" rid="app3">3</xref>.</p><p>All models performed significantly better than chance. Furthermore, all GPT models were significantly better than the required proportion to pass the final medical board examination.</p><p>All GPT models had a significantly higher proportion of providing a logical justification for correct answers compared to incorrect answers (<xref ref-type="table" rid="table2">Table 2</xref> and <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>). Yet, there was no statistical significance for the proportion of used internal information for correct and incorrect answers (<xref ref-type="table" rid="table2">Table 2</xref> and <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>). Similarly, there was no statistical significance for the proportion of used external information for correct and incorrect answers (<xref ref-type="table" rid="table2">Table 2</xref> and <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>).</p><p>Although generally high for both incorrect and correct answers, models had a confidence mean which was significantly higher for correct answers than incorrect answers (<xref ref-type="table" rid="table2">Table 2</xref> and <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>). 
This is reflected in CA values significantly different from zero: GPT-3.5 (0.028, 95% CI 0.011 to 0.048), GPT-4 (0.041, 95% CI 0.023 to 0.062), GPT4P (0.037, 95% CI 0.021 to 0.053), and GPT4PT (0.043, 95% CI 0.028 to 0.059).</p><p>From all models, only GPT4P made significantly more reasoning errors than logical errors (0.37, 95% CI 0.125 to 0.60). All models made significantly more reasoning errors than information errors: GPT-3.5 (0.21, 95% CI 0.11 to 0.30), GPT-4 (0.44, 95% CI 0.27 to 0.60), GPT4P (0.52, 95% CI 0.31 to 0.71), and GPT4PT (0.40, 95% CI 0.20 to 0.58). All models but GPT4P made significantly more logical errors than information errors: GPT-3.5 (0.14, 95% CI 0.029 to 0.26), GPT-4 (0.27, 95% CI 0.10 to 0.44), and GPT4PT (0.22, 95% CI 0.05 to 0.38). GPT-4 (0.12, 95% CI 0.05 to 0.22) and GPT4P (0.12, 95% CI 0.02 to 0.22) made significantly fewer information errors than GPT-3.5.</p><p>The GPT-4-based models all performed better than the GPT-3.5 model in providing correct answers as reflected in the difference of correct answer proportions (<xref ref-type="table" rid="table2">Table 2</xref> and <xref ref-type="supplementary-material" rid="app8">Multimedia Appendix 8</xref>). However, no GPT-4-based model was better than another GPT-4-based model, as reflected in the difference of correct answer proportions (<xref ref-type="table" rid="table2">Table 2</xref> and <xref ref-type="supplementary-material" rid="app8">Multimedia Appendix 8</xref>).</p></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Primary Outcome</title><p>All GPT models assessed performed above the minimum required score of 60%. The GPT-4 models performed particularly well, outperforming most students in the given examinations. Specifically, for the first board examination, all GPT-4 models performed better than 98.6% of students. 
For the second board examination, they surpassed 95.8% of students, as detailed in the records of the examining body [<xref ref-type="bibr" rid="ref20">20</xref>].</p><p>In general, there was a significant gap between GPT-3.5 and the GPT-4 models. The more recent models, with substantially more parameters and the capacity to remember longer prompts, appear to increase the accuracy of responses. However, we observed no additional benefit when GPT-4 models were paired with plugins.</p><p>The use of plugins did not yield a higher proportion of correct answers than the standard model. It is possible that GPT-4 already achieves a very high rate of accuracy, resulting in a ceiling effect. Hence, the addition of plugins may not offer a significant advantage for the questions prompted.</p><p>During our study, we noted that the Wolfram plugin was frequently used for more complex calculations. Yet, in the context of clinically applicable questions, complex mathematical procedures are typically not required and the use of symbolic language is usually not required. Thus, using the Wolfram Alpha plugin is likely more beneficial for questions that involve extensive computations or advanced mathematical problems requiring symbolic representations. The ScholarAI plugin was activated for complex informational queries, but the resulting papers were not consistently useful. Surprisingly, the Internet Access plugin (WeGPT.ai) was the least used. This may be because answering medical questions typically demands expert-level knowledge, and general internet searches do not provide sufficiently specific information. Moreover, since the model has been trained on a vast amount of internet data, it likely already encompasses the knowledge available from the world wide web within its parameters.</p><p>We speculated that posing questions in German might hinder the model&#x2019;s access to the broader body of knowledge available in English. 
However, this was not the case; the GPT model equipped with translation capabilities did not outperform the GPT-4 models without translation features. The GPT model likely abstracts high-level concepts and is not impeded by the language of the queries. This aligns with the LLMs&#x2019; transformer architecture, which accesses higher-level concepts prior to translating text into another language [<xref ref-type="bibr" rid="ref21">21</xref>].</p><p>Interestingly, the GPT-4 model with translation invoked plugins less frequently than the model without translation. We hypothesize that plugin calls occur at a lower level in the neural network, making them less necessary in English due to the larger available language corpus. In German, the model might need to delve deeper into the latent representation of concepts not tied to a specific language. However, this remains speculative and warrants further research.</p></sec><sec id="s4-2"><title>Secondary Outcomes</title><p>While all models provided a very high proportion of logical justification for correct answers, it was significantly less extensive for incorrect answers. However, upon further analysis, we did not detect a significant difference in the proportion of internal information from the question in the answer or in the use of external information not contained in the question between correct and incorrect answers. One study already assessed the presence of logical justification in answers to United States Medical Licensing Examination questions, where all answers exhibited logical justification regardless of their accuracy [<xref ref-type="bibr" rid="ref12">12</xref>]. Hence, this metric could not be used as a discriminator for correctness.</p><p>We were unable to demonstrate a significant correlation between the model&#x2019;s confidence in an answer and the difficulty level of the question for humans. This suggests that the model&#x2019;s interpretation of question difficulty differs from that of humans. 
However, as with humans, the model showed improved performance on easier questions compared to more challenging ones. Thus, it appears that the representation of question difficulty is distinct between LLMs and humans.</p></sec><sec id="s4-3"><title>Conceptual Implications</title><sec id="s4-3-1"><title>Use for Medical Education</title><p>This performance suggests that LLMs such as GPT could assume a greater role in medical education, as their integration could significantly change the conventional approach to medical education, which has traditionally emphasized the acquisition and maintenance of medical knowledge. The emergence of AI agents with superior information retention abilities, however, prompts a reevaluation of our educational focus. In this light, teaching methodologies could shift toward navigating and structuring available information with respective AI agents. The approach could hence shift from retaining information to learning how to efficiently access information and deeply understand these systems, along with their benefits and drawbacks.</p></sec><sec id="s4-3-2"><title>Use in Clinical Practice</title><p>The utility of LLMs is not limited to educational settings but also extends to clinical practice. Although LLMs may not be as effective in highly specialized tasks where dedicated machine learning algorithms excel&#x2014;for instance, XGBoost in identifying pulmonary embolisms [<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref24">24</xref>] &#x2014; LLMs are highly proficient in text processing and information integration from diverse algorithms [<xref ref-type="bibr" rid="ref25">25</xref>]. This positions them as intelligent medical assistants, capable of transforming complex data into narratives that are comprehensible in a human context. Currently, clinicians have a limited understanding of AI agents and their functions. 
Clinicians must, therefore, gain a thorough understanding of how various AI agents function, including their strengths and weaknesses.</p><p>With insufficient knowledge on the principles of LLM-based assistants, clinicians are at risk of blindly following such assistant&#x2019;s guidance without fully understanding its operations [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. Due to the inherent complexity of LLMs, which often function as a black box, we can only partially monitor their operations at varying levels of complexity and behavior [<xref ref-type="bibr" rid="ref26">26</xref>]. Given the marginal uncertainty intrinsic to such complex models, the AI agent should not supplant clinicians in decision-making, but rather provide additional informed perspectives.</p><p>To serve as a useful assistant, however, the assessment of uncertainty for any output provided by such is crucial. The key attribute enabling this evaluation is the ability to quantify uncertainty, a trait humans are presumed to possess [<xref ref-type="bibr" rid="ref14">14</xref>]. For LLM-based assistants to provide a comparable estimate, a standardized measure is needed to gauge the confidence in an AI agent&#x2019;s output. For binary outcomes such as healthy or diseased, metrics such as specificity, sensitivity, and area under the curve are effective. For more complex queries with multiple potential answers&#x2014;as managed by LLMs&#x2014;traditional measures such as sensitivity and specificity are inadequate. We therefore developed a new metric called &#x201C;confidence accuracy&#x201D; (CA) which correlates the confidence assigned to an answer with its empirical accuracy. This allows for the quantification of uncertainty, crucial for clinical decision-making. 
Although our work showed that all GPT models have the ability to quantify uncertainty, the expression in percentage does not seem to reflect the confidence for any specific decision (ie, the models were overall largely overconfident). Although statistically different from zero, CA values were consistently close to zero. New LLM methodologies aim to enhance this by incorporating uncertainty estimation [<xref ref-type="bibr" rid="ref28">28</xref>]. Future AI agents should be fine-tuned using the CA metric in order to improve uncertainty quantification, a critical objective for implementing AI as a supportive tool for physicians in clinical environments.</p></sec><sec id="s4-3-3"><title>Identified Errors</title><p>We observed that GPT models commit different types of errors, particularly reasoning errors. Reasoning errors typically occur in situations where multiple options are correct, but one is more critical than the other. GPT models disproportionately make reasoning errors, likely because this skill is acquired through human experience and is challenging to learn from text-based web sources. The second most common error type in GPT models was logical errors. Since LLMs use a statistical approach to reconstruct human-written text, we anticipated difficulties with logic and mathematics, which require formal symbolic representation [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. We hypothesized that the Wolfram plugin, using the Wolfram language, would mitigate these challenges. Yet, using the Wolfram plugin did not reduce the number of logical errors. Finally, fewer information errors were observed compared to other error types across all GPT models. This likely reflects the strength of these LLMs, which have assimilated a vast corpus of knowledge. In addition to the 3 error types derived from the informational and logical structure of GPT&#x2019;s answers, there are 2 sources of bias that arise prior to answer generation. 
First, due to the stochastic nature of token generation, there is likely a stochastic bias inherent in all GPT responses. Second, due to in-context generation conditioned by the prompting strategy, a systematic bias probably occurs as well. We attempted to mitigate the stochastic bias by averaging the results from all models and selecting the most common outcome. However, the performance of such averaged models did not surpass that of the GPT-4 models.</p><p>To assess whether the GPT models could recognize and correct their own mistakes, we prompted them to attempt another answer after providing incorrect responses. In most instances, the model would acknowledge the mistake and provide the correct answer along with a new explanation. This phenomenon could likely be attributed to the differing mechanics of forward and backward reasoning in LLMs. With forward reasoning, the LLM calculates the probability of the next token without a specific reasoning goal [<xref ref-type="bibr" rid="ref29">29</xref>]. In contrast, backward reasoning enables the LLM to better contextualize the information. It is crucial to note, however, that we did not request the model to immediately reassess the answer; instead, we informed it of the answer&#x2019;s incorrectness before asking for a reevaluation [<xref ref-type="bibr" rid="ref29">29</xref>]. Future studies could further investigate the model&#x2019;s ability to self-correct without prior notification of its errors.</p><p>In instances where questions were accompanied by images (ie, the model did not have access to the images), GPT models, particularly GPT-3.5, often responded by describing the image that the model had not actually seen. This unexpected information error, known as a hallucination [<xref ref-type="bibr" rid="ref19">19</xref>], persisted in the GPT-4 models, albeit at a significantly reduced frequency compared to GPT-3.5. 
Nevertheless, the propensity for overconfidence in entirely fabricated information remains a challenge for the latest generation of LLMs and is a phenomenon not fully understood [<xref ref-type="bibr" rid="ref30">30</xref>].</p></sec></sec><sec id="s4-4"><title>Limitations</title><sec id="s4-4-1"><title>Technological Limitations of LLMs</title><p>Although the results were impressive with GPT outperforming most students in the German medical board examination, it is crucial to remember that these models still possess significant limitations. At the time of our data collection, GPT-4 was incapable of interpreting medical images, such as chest x-rays or histological samples. This is a considerable drawback, given that medical information is inherently multimodal, and the ability to integrate multimodal data will be essential for the adoption of such models in academic and clinical settings. It is anticipated that future GPT iterations and other LLMs will be fully multimodal, which necessitates additional research to evaluate their performance across a more diverse array of questions.</p><p>A second concern relates to the stochastic nature of token generation, meaning that answers may vary slightly when questions are posed multiple times [<xref ref-type="bibr" rid="ref31">31</xref>].</p><p>A third concern pertains to the prompt sensitivity of LLMs. This trait can be advantageous as it allows the incorporation of context into the generation of meaningful output and may contribute to the models&#x2019; Bayesian characteristics [<xref ref-type="bibr" rid="ref32">32</xref>]. However, prompt sensitivity also increases the risk of systematic errors with repetitive use of the same prompt. Prompt engineering is a discipline that emerged in trying to minimize systematic errors [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref34">34</xref>].</p><p>Within the extensive volume of data available online, there are significant risks of bias. 
Given that LLMs are trained on vast datasets, there is an inherent risk of adopting biases from the underlying data structures. However, fine-tuning through supervised learning on labeled data can help mitigate these risks [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>].</p></sec><sec id="s4-4-2"><title>Limitations of the Use of LLMs in a Medical Context</title><p>Despite the seemingly immediate promise of using LLMs in both educational and clinical contexts, the current ethical and regulatory environment needs to be considered to advance the use of these novel technologies safely.</p><p>As the representation of medical information of an LLM must not be confused with medical knowledge from a medical professional, it remains crucial to enable students and medical professionals alike to identify LLM-generated outputs as such in order to interpret them very carefully. Different to, for example, a senior medical colleague providing guidance for a clinical decision, an LLM-generated output is neither based on clinical knowledge, nor experience. The risk of such confusion has been described as anthropomorphic projection and efforts for advancing these novel technologies in the medical field need to simultaneously foster the awareness of such phenomena. This differentiation resonates with the provisions of the European Union (EU) on a risk-based assessment approach [<xref ref-type="bibr" rid="ref37">37</xref>] and, more recently, with the Bletchley Declaration [<xref ref-type="bibr" rid="ref38">38</xref>]. The latter emphasizes the risks at the &#x201C;frontier&#x201D; of AI, at which we operate with the presented project.</p><p>While the concerns discussed in the context of medical education&#x2014;and, more widely, training&#x2014;are mainly within the realm of AI ethics, more specific limitations apply to the clinical use of these technologies. 
At the time of our analysis, no commercially available LLM in the EU&#x2014;including the GPT versions assessed in this work&#x2014;has an assigned intended medical use, a basic regulatory prerequisite for their use in a clinical context. Without such intended medical use, the Medical Device Regulation (MDR), the regulatory framework for medical devices in the EU, is not applicable. Hence, such a device would not be a medical device in the regulatory sense and could, therefore, not be used in a clinical context without irresponsible safety and liability risks. While it is not the user (eg, researchers or clinicians), but the manufacturer (eg, OpenAI for the ChatGPT models) who assigns an intended medical use&#x2014;which itself comes with further regulatory requirements&#x2014;the clinical use of the currently available and mostly all-purpose LLMs remains challenging.</p><p>Yet, even developing an LLM with an intended medical use and fulfilling all adjacent regulatory requirements would&#x2014;as of now&#x2014;not necessarily resolve the challenge centering around the clinical use of such a program, as a key requisite for software as a medical device outlined in the MDR (&#x201C;devices that incorporate electronic programmable systems, including software, or software that are devices in themselves, shall be designed to ensure repeatability, reliability and performance in line with their intended use.&#x201D; MDR Annex I, Rule 17.1 [<xref ref-type="bibr" rid="ref39">39</xref>]) is currently considered to be violated, although this question remains subject to debate.</p><p>However, the rapid development of technological advances and the concurrent establishment of respective regulations should not be perceived as a &#x201C;race to get to grips with AI&#x201D; [<xref ref-type="bibr" rid="ref40">40</xref>], but should be viewed as a co-evolution to eventually yield the best population-wide benefit from these technological advances. 
In this light, the emphasis of a &#x201C;pro-innovation and proportionate governance,&#x201D; as proposed in the Bletchley Declaration, is as crucial as the implementation of regulatory frameworks.</p></sec><sec id="s4-4-3"><title>Limitations of This Study</title><p>Our study has several limitations. We used a specific German medical board examination as a sample to represent the general distribution of medical questions. While it is acknowledged that questions evolve over time and may introduce bias, the objective of the medical board examination is to maintain a consistent level of difficulty, reflecting the minimum required knowledge to attain board approval for medical practice. The distribution of student grades has remained relatively stable over time, leading us to believe that this potential bias is minimal. In the model with translation, we used GPT to translate the questions before applying them to the model. Although we did not observe any, it is possible that translation errors occurred, potentially acting as a confounder in this study. In the context of the medical board examination, multiple-choice questions are posed to elicit clear answers that can be quantitatively assessed. By contrast, in a clinical setting, questions tend to be open-ended, which introduces a different dynamic. Nevertheless, we asked the model to justify its answers to glean insight into its computational process, thus rendering the questions more comparable to open-ended inquiries.</p></sec></sec><sec id="s4-5"><title>Conclusion</title><p>The performance of GPT models in the German medical board examination has surpassed both the passing threshold and the performance of most students. While GPT appears to possess a latent representation of uncertainty, it currently exhibits a significant degree of overconfidence. The introduced metric of CA could facilitate the appropriate measurement and fine-tuning of models to improve this aspect. 
However, there are numerous limitations that clinicians should be aware of. Challenges such as hallucinations, the stochastic nature of token generation, and prompt sensitivity are highlighted, indicating areas for further research and development. Further, we see the remaining open questions regarding the ethical and regulatory use of LLMs in the educational and clinical context, which need to be addressed on a policy level.</p></sec></sec></body><back><fn-group><fn fn-type="con"><p>JM participated in the conceptualization, data acquisition, data curation, formal analysis, investigation, methodology, project administration, software, validation, visualization, writing of the original draft, and review and editing of the writing, and should be considered the first author. PD, MS, BR, FPH, and HJB participated in the methodology and review editing and should be considered as second authors. LB and TS participated in the conceptualization, data acquisition, formal analysis, investigation, methodology, validation, review and editing of the writing, and should be considered last authors. 
Correspondence should be addressed to JM and LB.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">CA</term><def><p>confidence accuracy</p></def></def-item><def-item><term id="abb3">EU</term><def><p>European Union</p></def></def-item><def-item><term id="abb4">GPT4P</term><def><p>GPT-4 + plugin</p></def></def-item><def-item><term id="abb5">GPT4PT</term><def><p>GPT-4 + plugin + translation</p></def></def-item><def-item><term id="abb6">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb7">MDR</term><def><p>Medical Device Regulation</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Vaswani</surname><given-names>A</given-names> </name><name name-style="western"><surname>Shazeer</surname><given-names>N</given-names> </name><name name-style="western"><surname>Parmar</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Attention is all you need</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 12, 2017</comment><pub-id pub-id-type="doi">10.48550/arXiv.1706.03762</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Korngiebel</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Mooney</surname><given-names>SD</given-names> </name></person-group><article-title>Considering the possibilities and pitfalls of Generative Pre-trained Transformer 3 (GPT-3) in healthcare delivery</article-title><source>NPJ Digit 
Med</source><year>2021</year><month>06</month><day>3</day><volume>4</volume><issue>1</issue><fpage>93</fpage><pub-id pub-id-type="doi">10.1038/s41746-021-00464-x</pub-id><pub-id pub-id-type="medline">34083689</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Bubeck</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Sparks of artificial general intelligence: early experiments with GPT-4</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 22, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.12712</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Wolfram</surname><given-names>S</given-names> </name></person-group><source>What Is ChatGPT Doing...and Why Does It Work?</source><year>2023</year><publisher-name>Wolfram Media, Inc</publisher-name><pub-id pub-id-type="other">978-1-57955-081-3</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Traylor</surname><given-names>A</given-names> </name><name name-style="western"><surname>Feiman</surname><given-names>R</given-names> </name><name name-style="western"><surname>Pavlick</surname><given-names>E</given-names> </name></person-group><article-title>AND does not mean OR: using formal languages to study language models&#x2019; representations</article-title><conf-name>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2</conf-name><conf-date>Aug 1-6, 2021</conf-date><pub-id pub-id-type="doi">10.18653/v1/2021.acl-short.21</pub-id></nlm-citation></ref><ref 
id="ref6"><label>6</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Misra</surname><given-names>K</given-names> </name><name name-style="western"><surname>Rayz</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ettinger</surname><given-names>A</given-names> </name></person-group><article-title>COMPS: conceptual minimal pair sentences for testing robust property knowledge and its inheritance in pre-trained language models</article-title><access-date>2025-02-28</access-date><conf-name>Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics</conf-name><conf-date>May 2-6, 2023</conf-date><conf-loc>Dubrovnik, Croatia</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2023.eacl-main">https://aclanthology.org/2023.eacl-main</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2023.eacl-main.213</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>N</given-names> </name><name name-style="western"><surname>Linzen</surname><given-names>T</given-names> </name></person-group><article-title>COGS: a compositional generalization challenge based on semantic interpretation</article-title><access-date>2025-02-28</access-date><conf-name>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)</conf-name><conf-date>Nov 16-20, 2020</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.aclweb.org/anthology/2020.emnlp-main">https://www.aclweb.org/anthology/2020.emnlp-main</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2020.emnlp-main.731</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Ettinger</surname><given-names>A</given-names> </name></person-group><article-title>What BERT is not: lessons from a new suite of psycholinguistic diagnostics for language models</article-title><source>Trans Assoc Comput Linguist</source><year>2020</year><month>12</month><volume>8</volume><fpage>34</fpage><lpage>48</lpage><pub-id pub-id-type="doi">10.1162/tacl_a_00298</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Goertzel</surname><given-names>B</given-names> </name></person-group><article-title>Generative AI vs AGI: the cognitive strengths and weaknesses of modern LLMs</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 19, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2309.10371</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Vzorin</surname><given-names>G</given-names> </name><name name-style="western"><surname>Bukinich</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Sedykh</surname><given-names>A</given-names> </name><name name-style="western"><surname>Vetrova</surname><given-names>I</given-names> </name><name name-style="western"><surname>Sergienko</surname><given-names>E</given-names> </name></person-group><article-title>Emotional intelligence of GPT-4 large language model</article-title><source>PsyArXiv Preprints</source><comment>Preprint posted online on  Oct 20, 2023</comment><pub-id pub-id-type="doi">10.31234/osf.io/b6vys</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Bryant</surname><given-names>S</given-names> </name></person-group><article-title>Assessing GPT-4&#x2019;s 
role as a co-collaborator in scientific research: a case study analyzing Einstein&#x2019;s special theory of relativity</article-title><source>Research Square</source><comment>Preprint posted online on  Apr 12, 2023</comment><pub-id pub-id-type="doi">10.21203/rs.3.rs-2808494/v1</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gilson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Safranek</surname><given-names>CW</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>T</given-names> </name><etal/></person-group><article-title>How does ChatGPT perform on the United States Medical Licensing Examination (USMLE)? the implications of large language models for medical education and knowledge assessment</article-title><source>JMIR Med Educ</source><year>2023</year><month>02</month><day>8</day><volume>9</volume><fpage>e45312</fpage><pub-id pub-id-type="doi">10.2196/45312</pub-id><pub-id pub-id-type="medline">36753318</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Nori</surname><given-names>H</given-names> </name><name name-style="western"><surname>King</surname><given-names>N</given-names> </name><name name-style="western"><surname>McKinney</surname><given-names>SM</given-names> </name></person-group><article-title>Capabilities of GPT-4 on medical challenge problems</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 20, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.13375</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cosmides</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>Tooby</surname><given-names>J</given-names> </name></person-group><article-title>Are humans good intuitive statisticians after all? Rethinking some conclusions from the literature on judgment under uncertainty</article-title><source>Cognition</source><year>1996</year><month>01</month><volume>58</volume><issue>1</issue><fpage>1</fpage><lpage>73</lpage><pub-id pub-id-type="doi">10.1016/0010-0277(95)00664-8</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>P</given-names> </name><name name-style="western"><surname>Bubeck</surname><given-names>S</given-names> </name><name name-style="western"><surname>Petro</surname><given-names>J</given-names> </name></person-group><article-title>Benefits, limits, and risks of GPT-4 as an AI chatbot for medicine</article-title><source>N Engl J Med</source><year>2023</year><month>03</month><day>30</day><volume>388</volume><issue>13</issue><fpage>1233</fpage><lpage>1239</lpage><pub-id pub-id-type="doi">10.1056/NEJMsr2214184</pub-id><pub-id pub-id-type="medline">36988602</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Prompt engineering for healthcare: methodologies and applications</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 28, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2304.14670</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>D</given-names> </name><name 
name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Dolan</surname><given-names>B</given-names> </name><name name-style="western"><surname>Carin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>W</given-names> </name></person-group><article-title>What makes good in-context examples for GPT-3?</article-title><access-date>2025-02-28</access-date><conf-name>Proceedings of Deep Learning Inside Out (DeeLIO 2022)</conf-name><conf-date>May 27, 2022</conf-date><conf-loc>Dublin, Ireland</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2022.deelio-1">https://aclanthology.org/2022.deelio-1</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2022.deelio-1.10</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Haukoos</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Lewis</surname><given-names>RJ</given-names> </name></person-group><article-title>Advanced statistics: bootstrapping confidence intervals for statistics with &#x201C;difficult&#x201D; distributions</article-title><source>Acad Emerg Med</source><year>2005</year><month>04</month><volume>12</volume><issue>4</issue><fpage>360</fpage><lpage>365</lpage><pub-id pub-id-type="doi">10.1197/j.aem.2004.11.018</pub-id><pub-id pub-id-type="medline">15805329</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Evaluation and analysis of hallucination in large vision-language models</article-title><source>arXiv</source><comment>Preprint posted online on  Aug 29, 2023</comment><pub-id 
pub-id-type="doi">10.48550/arXiv.2308.15126</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="web"><article-title>Archiv Medizin</article-title><source>IMPP</source><access-date>2025-02-28</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.impp.de/pruefungen/medizin/archiv-medizin.html">https://www.impp.de/pruefungen/medizin/archiv-medizin.html</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Belinkov</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Glass</surname><given-names>J</given-names> </name></person-group><article-title>Analysis methods in neural language processing: a survey</article-title><source>Trans Assoc Comput Linguist</source><year>2019</year><month>04</month><day>1</day><volume>7</volume><fpage>49</fpage><lpage>72</lpage><pub-id pub-id-type="doi">10.1162/tacl_a_00254</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ryan</surname><given-names>L</given-names> </name><name name-style="western"><surname>Maharjan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Mataraso</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Predicting pulmonary embolism among hospitalized patients with machine learning algorithms</article-title><source>Pulm Circ</source><year>2022</year><month>01</month><volume>12</volume><issue>1</issue><fpage>e12013</fpage><pub-id pub-id-type="doi">10.1002/pul2.12013</pub-id><pub-id pub-id-type="medline">35506114</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name 
name-style="western"><surname>Dua</surname><given-names>R</given-names> </name><name name-style="western"><surname>Wallace</surname><given-names>GR</given-names> </name><name name-style="western"><surname>Chotso</surname><given-names>T</given-names> </name><name name-style="western"><surname>Raj</surname><given-names>VFD</given-names> </name></person-group><article-title>Classifying pulmonary embolism cases in chest CT scans using VGG16 and xgboost</article-title><source>Lecture Notes on Data Engineering and Communications Technologies</source><year>2023</year><volume>131</volume><publisher-name>Springer</publisher-name><fpage>273</fpage><lpage>292</lpage><pub-id pub-id-type="doi">10.1007/978-981-19-1844-5_22</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ding</surname><given-names>R</given-names> </name><name name-style="western"><surname>Ding</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Machine learning-based screening of risk factors and prediction of deep vein thrombosis and pulmonary embolism after hip arthroplasty</article-title><source>Clin Appl Thromb Hemost</source><year>2023</year><volume>29</volume><fpage>10760296231186145</fpage><pub-id pub-id-type="doi">10.1177/10760296231186145</pub-id><pub-id pub-id-type="medline">37394825</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>Q</given-names> </name><etal/></person-group><article-title>AutoGen: enabling next-gen LLM applications via multi-agent conversation</article-title><source>arXiv</source><comment>Preprint posted online on  Aug 16, 2023</comment><pub-id 
pub-id-type="doi">10.48550/arXiv.2308.08155</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Verdicchio</surname><given-names>M</given-names> </name><name name-style="western"><surname>Perin</surname><given-names>A</given-names> </name></person-group><article-title>When doctors and AI interact: on human responsibility for artificial risks</article-title><source>Philos Technol</source><year>2022</year><volume>35</volume><issue>1</issue><fpage>11</fpage><pub-id pub-id-type="doi">10.1007/s13347-022-00506-6</pub-id><pub-id pub-id-type="medline">35223383</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>J</given-names> </name></person-group><article-title>Overtrust of robots in high-risk scenarios</article-title><access-date>2025-02-28</access-date><conf-name>AIES &#x2019;18</conf-name><conf-date>Feb 2-3, 2018</conf-date><conf-loc>New Orleans, LA, United States</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/proceedings/10.1145/3278721">https://dl.acm.org/doi/proceedings/10.1145/3278721</ext-link></comment><pub-id pub-id-type="doi">10.1145/3278721.3278786</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Sankararaman</surname><given-names>KA</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Fang</surname><given-names>H</given-names> </name></person-group><article-title>BayesFormer: transformer with uncertainty estimation</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 2, 2022</comment><pub-id 
pub-id-type="doi">10.48550/arXiv.2206.00826</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Jiang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>L</given-names> </name></person-group><article-title>Forward-backward reasoning in large language models for mathematical verification</article-title><access-date>2025-02-28</access-date><conf-name>Findings of the Association for Computational Linguistics ACL 2024</conf-name><conf-date>Aug 11-16, 2024</conf-date><conf-loc>Bangkok, Thailand</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2024.findings-acl">https://aclanthology.org/2024.findings-acl</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2024.findings-acl.397</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Yao</surname><given-names>JY</given-names> </name><name name-style="western"><surname>Ning</surname><given-names>KP</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>ZH</given-names> </name><name name-style="western"><surname>Ning</surname><given-names>MN</given-names> </name><name name-style="western"><surname>Yuan</surname><given-names>L</given-names> </name></person-group><article-title>LLM lies: hallucinations are not bugs, but features as adversarial examples</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 2, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2310.01469</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name 
name-style="western"><surname>Bender</surname><given-names>EM</given-names> </name><name name-style="western"><surname>Gebru</surname><given-names>T</given-names> </name><name name-style="western"><surname>McMillan-Major</surname><given-names>A</given-names> </name><name name-style="western"><surname>Shmitchell</surname><given-names>S</given-names> </name></person-group><article-title>On the dangers of stochastic parrots: can language models be too big?</article-title><conf-name>FAccT &#x2019;21: Proceedings of the 2021 ACM Conference on Fairness, Accountability, and Transparency</conf-name><conf-date>Mar 3-10, 2021</conf-date><pub-id pub-id-type="doi">10.1145/3442188.3445922</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Xie</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Raghunathan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>P</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>T</given-names> </name></person-group><article-title>An explanation of in-context learning as implicit Bayesian inference</article-title><source>arXiv</source><comment>Preprint posted online on  Nov 3, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2111.02080</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Zhao</surname><given-names>TZ</given-names> </name><name name-style="western"><surname>Wallace</surname><given-names>E</given-names> </name><name name-style="western"><surname>Feng</surname><given-names>S</given-names> </name><name name-style="western"><surname>Klein</surname><given-names>D</given-names> 
</name><name name-style="western"><surname>Singh</surname><given-names>S</given-names> </name></person-group><article-title>Calibrate before use: improving few-shot performance of language models</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 1, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2102.09690</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Zheng</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>H</given-names> </name><name name-style="western"><surname>Meng</surname><given-names>F</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>J</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>M</given-names> </name></person-group><article-title>Large language models are not robust multiple choice selectors</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 7, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2309.03882</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Jin</surname><given-names>X</given-names> </name><name name-style="western"><surname>Barbieri</surname><given-names>F</given-names> </name><name name-style="western"><surname>Kennedy</surname><given-names>B</given-names> </name><name name-style="western"><surname>Mostafazadeh Davani</surname><given-names>A</given-names> </name><name name-style="western"><surname>Neves</surname><given-names>L</given-names> </name><name name-style="western"><surname>Ren</surname><given-names>X</given-names> </name></person-group><article-title>On transferability of bias mitigation effects in language model fine-tuning</article-title><conf-name>Proceedings of the 2021 
Conference of the North American Chapter of the Association for Computational Linguistics</conf-name><conf-date>Jun 6-11, 2021</conf-date><pub-id pub-id-type="doi">10.18653/v1/2021.naacl-main.296</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Chu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Song</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>C</given-names> </name></person-group><article-title>Fine-tune language models to approximate unbiased in-context learning</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 5, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2310.03331</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="web"><article-title>Regulation of the European Parliament</article-title><source>European Commission</source><access-date>2025-01-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://eur-lex.europa.eu/resource.html?uri=cellar:e0649735-a372-11eb-9585-01aa75ed71a1.0001.02/DOC_1&#x0026;format=PDF">https://eur-lex.europa.eu/resource.html?uri=cellar:e0649735-a372-11eb-9585-01aa75ed71a1.0001.02/DOC_1&#x0026;format=PDF</ext-link></comment></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="web"><article-title>The Bletchley Declaration</article-title><source>GOV.UK</source><access-date>2025-02-28</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.gov.uk/government/publications/ai-safety-summit-2023-the-bletchley-declaration/the-bletchley-declaration-by-countries-attending-the-ai-safety-summit-1-2-november-2023">https://www.gov.uk/government/publications/ai-safety-summit-2023-the-bletchley-declaration/the-bletchley-declaration-by-countries-attending-the-ai-safety-summit-1-2-november-2023</ext-link></comment></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="web"><article-title>ANNEX I medical device regulation</article-title><source>Medical Device Regulation</source><access-date>2025-02-28</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.medical-device-regulation.eu/2019/07/23/annex-i-general-safety-and-performance-requirements/">https://www.medical-device-regulation.eu/2019/07/23/annex-i-general-safety-and-performance-requirements/</ext-link></comment></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="web"><article-title>Regulating the machine</article-title><source>POLITICO</source><access-date>2025-02-28</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.politico.eu/article/regulate-europe-race-artificial-intelligence-ai-drugs-medicines/">https://www.politico.eu/article/regulate-europe-race-artificial-intelligence-ai-drugs-medicines/</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Prompting strategies for different GPT models.</p><media xlink:href="mededu_v11i1e58375_app1.docx" xlink:title="DOCX File, 14 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Question&#x2019;s difficulty and error structure of GPT model answers.</p><media xlink:href="mededu_v11i1e58375_app2.xlsx" xlink:title="XLSX File, 10 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Correct answers of GPT 
models compared with required and random scores.</p><media xlink:href="mededu_v11i1e58375_app3.xlsx" xlink:title="XLSX File, 9 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Comparison of correct answers between GPT models.</p><media xlink:href="mededu_v11i1e58375_app4.xlsx" xlink:title="XLSX File, 9 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Supplementary analysis of GPT models answers (statistically significant results are highlighted in blue and statistically nonsignificant results are highlighted in brown).</p><media xlink:href="mededu_v11i1e58375_app5.xlsx" xlink:title="XLSX File, 12 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Confidence of GPT models compared between correct and incorrect answers.</p><media xlink:href="mededu_v11i1e58375_app6.xlsx" xlink:title="XLSX File, 9 KB"/></supplementary-material><supplementary-material id="app7"><label>Multimedia Appendix 7</label><p>Relationship between question&#x2019;s difficulty, performance, and confidence of GPT model answers.</p><media xlink:href="mededu_v11i1e58375_app7.xlsx" xlink:title="XLSX File, 10 KB"/></supplementary-material><supplementary-material id="app8"><label>Multimedia Appendix 8</label><p>Comparison of GPT models justifications between correct and incorrect answers.</p><media xlink:href="mededu_v11i1e58375_app8.xlsx" xlink:title="XLSX File, 10 KB"/></supplementary-material><supplementary-material id="app9"><label>Multimedia Appendix 9</label><p>Performance, information content, confidence, and plugin usage of GPT model answers.</p><media xlink:href="mededu_v11i1e58375_app9.xlsx" xlink:title="XLSX File, 10 KB"/></supplementary-material></app-group></back></article>