<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id><journal-id journal-id-type="publisher-id">mededu</journal-id><journal-id journal-id-type="index">20</journal-id><journal-title>JMIR Medical Education</journal-title><abbrev-journal-title>JMIR Med Educ</abbrev-journal-title><issn pub-type="epub">2369-3762</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e63430</article-id><article-id pub-id-type="doi">10.2196/63430</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>ChatGPT-4 Omni Performance in USMLE Disciplines and Clinical Skills: Comparative Analysis</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Bicknell</surname><given-names>Brenton T</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Butler</surname><given-names>Danner</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Whalen</surname><given-names>Sydney</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ricks</surname><given-names>James</given-names></name><degrees>BA</degrees><xref ref-type="aff" 
rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Dixon</surname><given-names>Cory J</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Clark</surname><given-names>Abigail B</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Spaedy</surname><given-names>Olivia</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Skelton</surname><given-names>Adam</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Edupuganti</surname><given-names>Neel</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff8">8</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Dzubinski</surname><given-names>Lance</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff9">9</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tate</surname><given-names>Hudson</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Dyess</surname><given-names>Garrett</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lindeman</surname><given-names>Brenessa</given-names></name><degrees>MD, MEHP</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lehmann</surname><given-names>Lisa Soleymani</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff4">4</xref><xref 
ref-type="aff" rid="aff10">10</xref></contrib></contrib-group><aff id="aff1"><institution>UAB Heersink School of Medicine</institution>, <addr-line>1670 University Blvd</addr-line><addr-line>Birmingham</addr-line><addr-line>AL</addr-line>, <country>United States</country></aff><aff id="aff2"><institution>University of South Alabama Whiddon College of Medicine</institution>, <addr-line>Mobile</addr-line><addr-line>AL</addr-line>, <country>United States</country></aff><aff id="aff3"><institution>University of Illinois College of Medicine</institution>, <addr-line>Chicago</addr-line><addr-line>IL</addr-line>, <country>United States</country></aff><aff id="aff4"><institution>Harvard Medical School</institution>, <addr-line>Boston</addr-line><addr-line>MA</addr-line>, <country>United States</country></aff><aff id="aff5"><institution>Alabama College of Osteopathic Medicine</institution>, <addr-line>Dothan</addr-line><addr-line>AL</addr-line>, <country>United States</country></aff><aff id="aff6"><institution>UT Southwestern Medical Center</institution>, <addr-line>Dallas</addr-line><addr-line>TX</addr-line>, <country>United States</country></aff><aff id="aff7"><institution>Saint Louis University School of Medicine</institution>, <addr-line>St. 
Louis</addr-line><addr-line>MO</addr-line>, <country>United States</country></aff><aff id="aff8"><institution>Medical College of Georgia, Augusta University</institution>, <addr-line>Augusta</addr-line><addr-line>GA</addr-line>, <country>United States</country></aff><aff id="aff9"><institution>University of Colorado Anschutz Medical Campus School of Medicine</institution>, <addr-line>Aurora</addr-line><addr-line>CO</addr-line>, <country>United States</country></aff><aff id="aff10"><institution>Mass General Brigham</institution>, <addr-line>Boston</addr-line><addr-line>MA</addr-line>, <country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Chartash</surname><given-names>David</given-names></name></contrib><contrib contrib-type="editor"><name name-style="western"><surname>Eysenbach</surname><given-names>Gunther</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Yang</surname><given-names>Dawei</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Ochs</surname><given-names>Vincent</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Brenton T Bicknell, BS, UAB Heersink School of Medicine, 1670 University Blvd, Birmingham, AL, 35233, United States, 1 2566539498; <email>brentontbicknell@gmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2024</year></pub-date><pub-date pub-type="epub"><day>6</day><month>11</month><year>2024</year></pub-date><volume>10</volume><elocation-id>e63430</elocation-id><history><date date-type="received"><day>19</day><month>06</month><year>2024</year></date><date date-type="rev-recd"><day>02</day><month>09</month><year>2024</year></date><date date-type="accepted"><day>14</day><month>09</month><year>2024</year></date></history><copyright-statement>&#x00A9; Brenton T Bicknell, Danner Butler, Sydney 
Whalen, James Ricks, Cory J Dixon, Abigail B Clark, Olivia Spaedy, Adam Skelton, Neel Edupuganti, Lance Dzubinski, Hudson Tate, Garrett Dyess, Brenessa Lindeman, Lisa Soleymani Lehmann. Originally published in JMIR Medical Education (<ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org">https://mededu.jmir.org</ext-link>), 6.11.2024. </copyright-statement><copyright-year>2024</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org/">https://mededu.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mededu.jmir.org/2024/1/e63430"/><abstract><sec><title>Background</title><p>Recent studies, including those by the National Board of Medical Examiners, have highlighted the remarkable capabilities of recent large language models (LLMs) such as ChatGPT in passing the United States Medical Licensing Examination (USMLE). 
However, there is a gap in detailed analysis of LLM performance in specific medical content areas, thus limiting an assessment of their potential utility in medical education.</p></sec><sec><title>Objective</title><p>This study aimed to assess and compare the accuracy of successive ChatGPT versions (GPT-3.5, GPT-4, and GPT-4 Omni) in USMLE disciplines, clinical clerkships, and the clinical skills of diagnostics and management.</p></sec><sec sec-type="methods"><title>Methods</title><p>This study used 750 clinical vignette-based multiple-choice questions to characterize the performance of successive ChatGPT versions (ChatGPT 3.5 [GPT-3.5], ChatGPT 4 [GPT-4], and ChatGPT 4 Omni [GPT-4o]) across USMLE disciplines, clinical clerkships, and in clinical skills (diagnostics and management). Accuracy was assessed using a standardized protocol, with statistical analyses conducted to compare the models&#x2019; performances.</p></sec><sec sec-type="results"><title>Results</title><p>GPT-4o achieved the highest accuracy across 750 multiple-choice questions at 90.4%, outperforming GPT-4 and GPT-3.5, which scored 81.1% and 60.0%, respectively. GPT-4o&#x2019;s highest performances were in social sciences (95.5%), behavioral and neuroscience (94.2%), and pharmacology (93.2%). In clinical skills, GPT-4o&#x2019;s diagnostic accuracy was 92.7% and management accuracy was 88.8%, significantly higher than its predecessors. Notably, both GPT-4o and GPT-4 significantly outperformed the medical student average accuracy of 59.3% (95% CI 58.3&#x2010;60.3).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>GPT-4o&#x2019;s performance in USMLE disciplines, clinical clerkships, and clinical skills indicates substantial improvements over its predecessors, suggesting significant potential for the use of this technology as an educational aid for medical students. 
These findings underscore the need for careful consideration when integrating LLMs into medical education, emphasizing the importance of structured curricula to guide their appropriate use and the need for ongoing critical analyses to ensure their reliability and effectiveness.</p></sec></abstract><kwd-group><kwd>large language model</kwd><kwd>ChatGPT</kwd><kwd>medical education</kwd><kwd>USMLE</kwd><kwd>AI in medical education</kwd><kwd>medical student resources</kwd><kwd>educational technology</kwd><kwd>artificial intelligence in medicine</kwd><kwd>clinical skills</kwd><kwd>LLM</kwd><kwd>medical licensing examination</kwd><kwd>medical students</kwd><kwd>United States Medical Licensing Examination</kwd><kwd>ChatGPT 4 Omni</kwd><kwd>ChatGPT 4</kwd><kwd>ChatGPT 3.5</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Overview</title><p>Recent studies have demonstrated the promise of large language models (LLMs) such as ChatGPT, Google Gemini, and Claude in various medical applications, with studies showing passing United States Medical Licensing Examination (USMLE) exam scores and evaluating LLMs&#x2019; ability to assist with clinical documentation and provide medical advice [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref4">4</xref>]. The potential of these models to revolutionize medicine and medical education underscores the need for a thorough evaluation of their performance [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. 
Before LLMs can be widely adopted in health care and medical education, it is crucial to assess their proficiency in both preclinical disciplines (eg, anatomy, physiology, and microbiology) and clinical disciplines (eg, diagnostics and treatment recommendations).</p></sec><sec id="s1-2"><title>The Role of LLMs in Medical Education</title><p>In the context of undergraduate medical education, LLMs have demonstrated preliminary potential in text-based applications in generating practice questions, fostering case-based learning, creating study guides, and providing rapid answers to relevant questions [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref9">9</xref>]. Although models such as GPT-3.5 offer the potential for a more personalized learning experience, they also have limitations, such as training cut-off dates, limited image capabilities, potential inaccuracies, and a lack of user training [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. Medical students often use third-party resources to supplement their studies, with evidence suggesting that such utilization is associated with higher USMLE scores [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. The diverse applications and benefits of LLMs contribute to a comprehensive approach to fostering self-directed learning for lifelong learners in medicine [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. 
While accuracy remains a limitation of LLMs as clinical tools for students and clinicians, recent studies indicate a trend toward increased reliability and accuracy, a crucial consideration for their use in medical education and health care [<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref18">18</xref>].</p></sec><sec id="s1-3"><title>Previous Assessments of LLM Accuracy in Medical Contexts</title><p>Comparing multiple studies on the accuracy of LLMs in the context of medicine, such as ChatGPT, is challenging due to variations in question sets, exclusion criteria, and the specific models assessed, though some parallels can be drawn. Most studies have evaluated LLMs based on their ability to correctly answer multiple-choice questions (MCQs) from retired National Board of Medical Examiners&#x2019; (NBME) content or third-party question banks such as Amboss [<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref22">22</xref>]. Some studies suggest LLMs perform better on USMLE sample items compared to third-party question banks [<xref ref-type="bibr" rid="ref20">20</xref>], and newer versions of LLMs such as ChatGPT 4 (GPT-4) outperform their earlier counterparts [<xref ref-type="bibr" rid="ref22">22</xref>]. Evaluations of ChatGPT 3.0 found it was able to accurately answer USMLE sample items 36.7% of the time [<xref ref-type="bibr" rid="ref23">23</xref>], improving to more than 50% correct in a matter of months [<xref ref-type="bibr" rid="ref21">21</xref>]. Performance also appears to depend on the specific skills tested and the language used in training [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. Further illustrating this in a study by the NBME, ChatGPT scored a passing score in USMLE Step exams across multiple attempts, with one exception in a USMLE Step 3 exam attempt [<xref ref-type="bibr" rid="ref26">26</xref>]. 
ChatGPT 3.5 (GPT-3.5) was found to answer 63.06% of Step 1 and 70.0% of Step 2 CK questions correctly [<xref ref-type="bibr" rid="ref26">26</xref>]. Most recent studies showcase GPT-4 achieving as high as 86% accuracy on USMLE Step 1 questions, suggesting it is nearly ready to be investigated as an aid for improving learning for medical students in preclinical education.</p></sec><sec id="s1-4"><title>Aim of the Study</title><p>While previous research has primarily explored the ability of these models to pass medical licensing exams, this study takes a medical disciplinary approach to assess and compare the accuracy of ChatGPT 3.5 (GPT-3.5), ChatGPT 4 (GPT-4), and ChatGPT 4 Omni (GPT-4o) specifically in the context of the USMLE preclinical medical disciplines and clinical clerkships. These historically recognized USMLE (and NBME [<xref ref-type="bibr" rid="ref27">27</xref>]) preclinical medical disciplines, including anatomy, pathology, and biochemistry, provide a valuable empirical framework to understand the strengths and weaknesses of language models in medical disciplines and clinical skills.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>LLMs: The ChatGPT Series</title><p>In our study, we used the ChatGPT series, which comprises sophisticated algorithms designed to simulate human-like responses to textual inputs. These models generate responses by analyzing input text and predicting output based on learned statistical patterns. ChatGPT 3.5 (GPT-3.5) is the earliest model used in this study and is currently accessible to the public through a free subscription [<xref ref-type="bibr" rid="ref28">28</xref>]. ChatGPT 4 (GPT-4), introduced on March 22, 2023, and available through a monthly paid subscription, was included for comparative analysis [<xref ref-type="bibr" rid="ref29">29</xref>]. 
Notably, we included the latest ChatGPT model, ChatGPT 4 Omni (GPT-4o), which was released on May 13, 2024 [<xref ref-type="bibr" rid="ref30">30</xref>].</p></sec><sec id="s2-2"><title>Clinical Vignette-Based Assessment in USMLE Disciplines and Clinical Clerkship</title><p>In total, 750 clinical vignette-style MCQs were sourced from various question banks provided by medical schools to medical students (Amboss, UWorld, TrueLearn). To prevent model &#x201C;learning&#x201D; effects and avoid potential bias from prior usage of publicly available question sets, we selected these MCQs from these sources, which are not publicly accessible.</p><p>The 750 MCQs were divided evenly, with 375 covering USMLE Step 1 (&#x201C;Preclinical&#x201D;) content and 375 covering USMLE Step 2 (&#x201C;Clinical&#x201D;) content. We applied specific criteria to ensure the relevance and rigor of the questions. Questions involving imaging findings (such as X-rays, MRIs, or ultrasounds), histologic findings, or gross exam findings were excluded from the study, and an additional clinical vignette was generated in place of each excluded question. To ensure diversity and reduce bias, questions were sourced by generating random question sessions, with careful attention to avoid duplication of any questions in the final set.</p><p>For each MCQ, we noted whether the vignette pertained to preclinical or clinical subject matter, identified the specific USMLE preclinical discipline or clinical clerkship content assessed, and the percentage of medical students who answered correctly, as detailed by the question bank resources. 
Using the percentage of medical students who correctly answered each question, we assigned a difficulty tier to each question on a scale from 1 (most difficult) to 5 (easiest) (1: 0%&#x2010;19.9%; 2: 20.0%&#x2010;39.9%; 3: 40.0%&#x2010;59.9%; 4: 60.0%&#x2010;79.9%; 5: 80.0%&#x2010;100%).</p></sec><sec id="s2-3"><title>Protocol for Assessing Accuracy of ChatGPT</title><p>The assessment of the language models was conducted from May 20 to May 26, 2024. The assessment of response accuracy entailed entering the MCQs into a ChatGPT chat session using a standardized protocol based on methodologies similar to those employed in multiple-choice-based language model assessments [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref31">31</xref>-<xref ref-type="bibr" rid="ref35">35</xref>]. This protocol for eliciting a response from ChatGPT was as follows: &#x201C;Answer the following question and provide an explanation for your answer choice.&#x201D; Data procured from ChatGPT included its selected response, the rationale for its choice, and whether the response was correct (&#x201C;accurate&#x201D; or &#x201C;inaccurate&#x201D;). Responses were deemed correct if ChatGPT chose the correct multiple-choice answer. To prevent memory retention bias, each vignette was processed in a new chat session.</p></sec><sec id="s2-4"><title>Assessment in Clinical Domains of Diagnostics and Management</title><p>Further subcategorization of the 750 MCQs was made based on their question stem. 
Question stems assessing the most likely diagnosis (n=164, &#x201C;Diagnostics&#x201D;) or the next best step in treatment (n=178, &#x201C;Management&#x201D;) were noted and used for further comparison to assess accuracy in the clinical skills of diagnostics and management.</p></sec><sec id="s2-5"><title>Statistical Analysis</title><p>IBM SPSS Statistics 29.0 (IBM Corporation) was used for statistical analyses, with a significance threshold of <italic>P</italic>&#x003C;.05. Statistical tests included chi-squared for categorical comparisons, and binary logistic regression when assessing the influence of question difficulty on language model correct response rate.</p></sec><sec id="s2-6"><title>Ethical Considerations</title><p>The study did not involve patient data or human subjects and, as such, was not subject to institutional review board approval.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>Overall, GPT-4o achieved a correct response rate of 90.4%, while GPT-4 had 81.1%, both significantly outperforming GPT-3.5&#x2019;s correct response rate of 60.0% (<xref ref-type="table" rid="table1">Table 1</xref> and <xref ref-type="fig" rid="figure1">Figure 1</xref>). The average accuracy of medical students was 59.3% (95% CI 58.3&#x2010;60.3).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Response accuracy of the ChatGPT series across USMLE<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> preclinical disciplines and clinical clerkships. 
Some questions (n=139) could not be categorized due to not having or having multiple categories from sources.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Question category or subcategory</td><td align="left" valign="bottom">Questions, N</td><td align="left" valign="bottom" colspan="3">Language model performance, n (%) correct</td><td align="left" valign="bottom">Medical student average, percent correct (95% CI)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">GPT<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup>-3.5</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">GPT-4o</td><td align="left" valign="top"/></tr></thead><tbody><tr><td align="left" valign="top">Overall</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;All questions</td><td align="left" valign="top">750</td><td align="left" valign="top">450 (60.0)</td><td align="left" valign="top">608 (81.1)</td><td align="left" valign="top">678 (90.4)</td><td align="left" valign="top">59.3 (58.3&#x2010;60.3)</td></tr><tr><td align="left" valign="top">Preclinical assessment questions</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;All questions</td><td align="left" valign="top">375</td><td align="left" valign="top">229 (61.1)</td><td align="left" valign="top">301 (80.3)</td><td align="left" valign="top">337 (89.9)</td><td align="left" valign="top">57.7 (56.3&#x2010;59.1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>USMLE disciplines</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;&#x2003;&#x2003;Anatomy, histology, and embryology</td><td align="left" valign="top">36</td><td align="left" valign="top">21 (58.3)</td><td align="left" valign="top">31 (86.1)</td><td align="left" valign="top">31 (86.1)</td><td align="left" valign="top">50.7 (45.9&#x2010;55.5)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;&#x2003;Behavioral and neuroscience</td><td align="left" valign="top">52</td><td align="left" valign="top">40 (76.9)</td><td align="left" valign="top">45 (86.5)</td><td align="left" valign="top">49 (94.2)</td><td align="left" valign="top">53.3 (47.8&#x2010;58.8)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;&#x2003;Biochemistry</td><td align="left" valign="top">35</td><td align="left" valign="top">20 (57.1)</td><td align="left" valign="top">28 (80.0)</td><td align="left" valign="top">31 (88.6)</td><td align="left" valign="top">65.1 (57.8&#x2010;72.3)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;&#x2003;Biostatistics</td><td align="left" valign="top">21</td><td align="left" valign="top">12 (57.1)</td><td align="left" valign="top">18 (85.7)</td><td align="left" valign="top">17 (81.0)</td><td align="left" valign="top">57.1 (52.7&#x2010;61.6)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;&#x2003;Immunology</td><td align="left" valign="top">28</td><td align="left" valign="top">19 (67.9)</td><td align="left" valign="top">23 (82.1)</td><td align="left" valign="top">26 (92.9)</td><td align="left" valign="top">53.5 (48.1&#x2010;58.9)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;&#x2003;Microbiology</td><td align="left" valign="top">39</td><td align="left" valign="top">20 (51.3)</td><td align="left" 
valign="top">30 (76.9)</td><td align="left" valign="top">36 (92.3)</td><td align="left" valign="top">57.7 (52.0&#x2010;63.2)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;&#x2003;Pathology</td><td align="left" valign="top">29</td><td align="left" valign="top">17 (58.6)</td><td align="left" valign="top">20 (69.0)</td><td align="left" valign="top">24 (82.8)</td><td align="left" valign="top">64.4 (60.9&#x2010;67.8)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;&#x2003;Pharmacology</td><td align="left" valign="top">44</td><td align="left" valign="top">27 (61.3)</td><td align="left" valign="top">37 (84.1)</td><td align="left" valign="top">41 (93.2)</td><td align="left" valign="top">57.9 (53.8&#x2010;62.0)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;&#x2003;Physiology</td><td align="left" valign="top">24</td><td align="left" valign="top">13 (54.2)</td><td align="left" valign="top">12 (50.0)</td><td align="left" valign="top">20 (83.3)</td><td align="left" valign="top">51.9 (46.1&#x2010;57.8)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;&#x2003;Social sciences</td><td align="left" valign="top">22</td><td align="left" valign="top">13 (59.1)</td><td align="left" valign="top">18 (81.8)</td><td align="left" valign="top">21 (95.5)</td><td align="left" valign="top">66.7 (61.5&#x2010;72.1)</td></tr><tr><td align="left" valign="top">Clinical assessment questions</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;All questions</td><td align="left" valign="top">375</td><td align="left" valign="top">221 (58.9)</td><td align="left" valign="top">307 (81.9)</td><td align="left" valign="top">341 (90.9)</td><td align="left" valign="top">61.0 (59.5&#x2010;62.5)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Clinical clerkships</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;&#x2003;&#x2003;Family medicine</td><td align="left" valign="top">34</td><td align="left" valign="top">20 (58.8)</td><td align="left" valign="top">26 (76.5)</td><td align="left" valign="top">34 (100.0)</td><td align="left" valign="top">54.0 (48.4&#x2010;59.5)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;&#x2003;Internal medicine</td><td align="left" valign="top">22</td><td align="left" valign="top">15 (68.2)</td><td align="left" valign="top">21 (95.5)</td><td align="left" valign="top">22 (100.0)</td><td align="left" valign="top">69.2 (65.1&#x2010;73.2)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;&#x2003;Neurology</td><td align="left" valign="top">59</td><td align="left" valign="top">41 (69.5)</td><td align="left" valign="top">50 (84.7)</td><td align="left" valign="top">55 (93.2)</td><td align="left" valign="top">61.2 (57.2&#x2010;65.3)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;&#x2003;Obstetrics and gynecology</td><td align="left" valign="top">45</td><td align="left" valign="top">24 (53.3)</td><td align="left" valign="top">40 (88.9)</td><td align="left" valign="top">41 (91.1)</td><td align="left" valign="top">61.2 (54.9&#x2010;67.6)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;&#x2003;Pediatrics</td><td align="left" valign="top">42</td><td align="left" valign="top">28 (66.7)</td><td align="left" valign="top">32 (76.2)</td><td align="left" valign="top">37 (88.1)</td><td align="left" valign="top">58.3 (54.2&#x2010;62.5)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;&#x2003;Psychiatry</td><td align="left" 
valign="top">43</td><td align="left" valign="top">25 (58.1)</td><td align="left" valign="top">35 (81.4)</td><td align="left" valign="top">40 (93.0)</td><td align="left" valign="top">54.2 (48.5&#x2010;59.8)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;&#x2003;Surgery</td><td align="left" valign="top">36</td><td align="left" valign="top">20 (55.6)</td><td align="left" valign="top">30 (83.3)</td><td align="left" valign="top">31 (86.1)</td><td align="left" valign="top">62.3 (57.4&#x2010;67.1)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>USMLE: United States Medical Licensing Examination.</p></fn><fn id="table1fn2"><p><sup>b</sup>GPT: Generative Pre-trained Transformer.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Analysis of ChatGPT models&#x2019; and medical students&#x2019; performance on USMLE questions. This figure displays the comparative accuracies of ChatGPT 3.5 (GPT-3.5), ChatGPT 4 (GPT-4), ChatGPT 4 Omni (GPT-4o), and medical students in answering a set of 750 USMLE-style questions. The overall accuracy, preclinical accuracy, and clinical accuracy are shown. Asterisks (*) denote statistically significant differences (<italic>P</italic>&#x003C;.05), highlighting the advancements in newer models of the GPT series. The number of questions is indicated for each category: n=750 for overall accuracy, n=375 for preclinical accuracy, and n=375 for clinical accuracy. GPT: Generative Pre-trained Transformer; USMLE: United States Medical Licensing Examination.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v10i1e63430_fig01.png"/></fig><sec id="s3-1"><title>USMLE Discipline Response Accuracies</title><p>In total, 375 MCQs designed to assess preclinical content as categorized by USMLE disciplines were administered to GPT-3.5, GPT-4, and GPT-4o. 
GPT-3.5&#x2019;s highest correct response percentages were in behavioral and neuroscience (76.9%), immunology (67.9%), and pharmacology (61.3%). Conversely, the lowest correct response percentages were observed in physiology (54.2%) and microbiology (51.3%). For GPT-4, the highest correct response percentages were observed in behavioral and neuroscience (86.5%), anatomy, histology, and embryology (86.1%), and pharmacology (84.1%). The lowest correct response percentages for GPT-4 were in physiology (50.0%) and pathology (69.0%). GPT-4o demonstrated the highest correct response percentages in social sciences (95.5%), behavioral and neuroscience (94.2%), and pharmacology (93.2%). The lowest correct response percentages for GPT-4o were in pathology (82.8%) and biostatistics and epidemiology (81.0%).</p></sec><sec id="s3-2"><title>Response Accuracies in Clinical Clerkships</title><p>In total, 375 MCQs assessing clinical clerkship content were administered to GPT-3.5, GPT-4, and GPT-4o. GPT-3.5 exhibited its highest response percentages in neurology (69.5%) and internal medicine (68.2%), while the lowest percentage response accuracies were observed in obstetrics and gynecology (53.3%) and surgery (55.6%). In comparison, GPT-4 achieved higher accuracy across all clerkships, with notable performances in internal medicine (95.5%) and obstetrics and gynecology (88.9%). Similarly, GPT-4o demonstrated improved performance, achieving correct response rates of 93.2% in neurology and 93.0% in psychiatry, as well as 100.0% in family medicine and 100.0% in internal medicine. The lowest accuracies for GPT-4o were still significantly high, with obstetrics and gynecology at 91.1% and surgery at 86.1%. 
Overall, GPT-4 and GPT-4o showed substantial improvements over GPT-3.5 in all clinical clerkship categories.</p></sec><sec id="s3-3"><title>Vignette Difficulty and Comparisons Based on Respondent Performance</title><p>GPT-3.5 (Exp(B)=1.033, SE=0.005, <italic>P</italic>&#x003C;.001), GPT-4 (Exp(B)=1.039, SE=0.006, <italic>P</italic>&#x003C;.001), and GPT-4o (Exp(B)=1.043, SE=0.008, <italic>P</italic>&#x003C;.001) demonstrated a higher likelihood of responding incorrectly to vignettes that were more challenging for medical student respondents (<xref ref-type="fig" rid="figure2">Figure 2</xref>).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Influence of question difficulty on response accuracy compared to medical student performance. This figure illustrates the effect of clinical vignette difficulty on the response accuracy of ChatGPT 3.5 (GPT-3.5), ChatGPT 4 (GPT-4), and ChatGPT 4 Omni (GPT-4o) in comparison to medical students. The bar graph represents the percentage of correct responses across different tiers of difficulty, ranging from tier 1 (most difficult) to tier 5 (easiest). The number of questions for each difficulty tier is n=10 for tier 1, n=89 for tier 2, n=263 for tier 3, n=302 for tier 4, and n=81 for tier 5.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v10i1e63430_fig02.png"/></fig></sec><sec id="s3-4"><title>Performance of ChatGPT in Diagnostics and Management</title><p>A total of 342 MCQs were secondarily categorized from the 750 MCQs based on question stems: 164 assessing &#x201C;diagnostics&#x201D; and 178 assessing &#x201C;management.&#x201D; Overall, the respective percent correct response accuracies of GPT-3.5, GPT-4, and GPT-4o in these questions were 70.5% (241/342), 81.9% (280/342), and 88.8% (304/342) (<xref ref-type="fig" rid="figure3">Figure 3</xref>). 
In the diagnostics category, GPT-4 and GPT-4o demonstrated higher correct response percentages compared to GPT-3.5 (83.5% and 92.7% vs 65.2%). Similarly, in the management category, GPT-4 and GPT-4o outperformed GPT-3.5 (77.0% and 88.8% vs 57.9%). Notably, GPT-4o significantly outperformed GPT-4 in both diagnostics and management.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Performance of ChatGPT models in diagnostics and management compared to medical students. This figure compares the performance of ChatGPT 3.5 (GPT-3.5), ChatGPT 4 (GPT-4), and ChatGPT 4 Omni (GPT-4o) in the clinical domains of diagnostics and management. The bar graph shows the percentage of correct responses for each model and medical students in the diagnosis (n=164) and management (n=178) categories. GPT-4o exhibits the highest accuracy in both categories, followed by GPT-4, with GPT-3.5 showing the lowest performance. Asterisks (*) denote statistically significant differences (<italic>P</italic>&#x003C;.05), emphasizing the advancements in newer models of the GPT series. GPT: Generative Pre-trained Transformer.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v10i1e63430_fig03.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Overview</title><p>This study evaluated ChatGPT versions for their accuracy in USMLE preclinical disciplines, clinical clerkships, and clinical skills categories of diagnostics and management. The aim was to assess the reliability of using LLMs in medical education by examining their accuracy across various preclinical and clinical disciplines. Dependable accuracy in these areas underlies the potential of LLMs to support medical education effectively. 
Our findings highlighted varied performances across disciplines, with a significant increase in response accuracy observed for GPT-4o over GPT-4 and GPT-3.5.</p></sec><sec id="s4-2"><title>Overall Performance and Disciplinary Accuracies</title><p>Overall, GPT-4o achieved an accuracy rate of 90.4%, significantly outperforming both GPT-3.5 (60.0%) and GPT-4 (81.1%). This improvement is consistent across both preclinical and clinical domains, emphasizing the advancements in model development. GPT-4o&#x2019;s highest preclinical accuracy rates were observed in social sciences (95.5%), behavioral and neuroscience (94.2%), and pharmacology (93.2%). In clinical clerkships, GPT-4o maintained high accuracy, particularly in family medicine and internal medicine, where it achieved a 100% correct response rate, and demonstrated strong performance in neurology and psychiatry. These findings underline GPT-4o&#x2019;s potential utility in medical education and emphasize the necessity of its strategic integration into educational curricula.</p></sec><sec id="s4-3"><title>Question Difficulty and Comparison With Medical Student Performance</title><p>Notably, there was a significant positive correlation between the percentage of correct responses by medical students and the likelihood of correct responses by the LLMs, which indicates that as vignette difficulty increased, the performance of the artificial intelligence (AI) models reflected the difficulty gradient experienced by the students. However, it is worth noting that GPT-4o achieved an overall accuracy of 90.4% in a question set where the medical students&#x2019; average was less than that of a passing USMLE exam score (59.3%).</p></sec><sec id="s4-4"><title>Improvements in Diagnostics and Management</title><p>The clinical vignette-based assessments further illustrated the improvements in GPT-4o in diagnostics and management. In diagnostics, GPT-4o achieved a 92.7% accuracy rate, surpassing GPT-4 (83.5%) and GPT-3.5 (65.2%). 
Similarly, in management tasks, GPT-4o&#x2019;s accuracy was 88.8%, significantly higher than both GPT-4 (77.0%) and GPT-3.5 (57.9%).</p></sec><sec id="s4-5"><title>Factors Contributing to Improved Performance</title><p>The improvements seen in GPT-4o could be attributed to several advancements in its architecture and the model&#x2019;s training, such as more comprehensive datasets and refined algorithms. This trend of improvement aligns with previous research noting the progressive enhancements in LLMs&#x2019; accuracy and reliability [<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref18">18</xref>]. However, an important consideration is the potential interaction between LLM performance and the Flynn effect, which describes the observed rise in intelligence test scores over time. As LLMs are trained on increasingly up-to-date data, they may reflect or even amplify these trends, potentially impacting the psychometric validity of assessments like the USMLE. For instance, environmental influences and the availability of more recent data can significantly impact cognitive performance, a factor that may similarly affect AI models [<xref ref-type="bibr" rid="ref36">36</xref>]. The implications of this interaction warrant further exploration, as understanding these dynamics could provide valuable insights into both the short-term and long-term reliability of LLM-assisted test performance in medical education. Additionally, the recency of the datasets used to train GPT-4 and GPT-4o could be another factor contributing to their improved accuracy compared to GPT-3.5. 
As these improvements continue, it is essential to assess how they contribute not only to immediate gains in performance but also to the broader implications for long-term educational outcomes and assessment integrity.</p></sec><sec id="s4-6"><title>Considerations for Integration in Medical Education</title><p>Several considerations must be addressed before integrating these models into medical education. The ability to correctly answer USMLE questions is not necessarily the same as synthesizing and reasoning about a patient&#x2019;s history, clinical symptoms, physical exam findings, and laboratory data. This raises the concern of whether LLMs will be able to provide safe and accurate guidance to clinicians at the bedside who are struggling to make sense of a patient&#x2019;s illness. It will therefore be important to assess the value of LLMs in real clinical situations and to assess if and how they can be safely deployed in clinical settings. To address this, medical schools and residency program directors should establish mechanisms to continuously monitor the performance and impact of LLMs used in clinical settings. It would be valuable to create a national registry of feedback from students and faculty to identify errors and unintended consequences associated with the use of LLMs in medical education and clinical care.</p><p>In the context of American medical education, standardized testing environments such as the USMLE play a critical role in shaping the applicability of LLMs like GPT-4o. These models must adapt to a testing culture that heavily emphasizes MCQ formats, which are integral to medical training and licensure in the United States. While LLMs offer potential advantages, there is a risk that over-reliance on AI could hinder the development of essential diagnostic skills in medical students and clinicians [<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref38">38</xref>]. 
This dependency on AI tools may lead to a decline in critical thinking and problem-solving abilities, particularly in situations where AI support is unavailable [<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref40">40</xref>]. These concerns underscore the importance of thoughtfully integrating AI into medical education, with careful consideration of its long-term impact on clinical competencies and ethical implications, such as fairness and equity in training future health care professionals [<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref38">38</xref>].</p></sec><sec id="s4-7"><title>Ethical Implications of AI Integration With Medical Education</title><p>The ethical implications of integrating AI, including LLMs, in medical education and patient care require thorough consideration. Issues such as data privacy, the potential for systemic bias in AI algorithms, and the lack of accountability in AI-driven decisions pose serious challenges. The inherent biases in training data can lead to skewed AI responses, impacting clinical decision-making processes [<xref ref-type="bibr" rid="ref41">41</xref>]. Moreover, the reliance on AI-driven tools raises concerns about the equitable distribution of these technologies, as access often requires paid subscriptions, which could exacerbate disparities in medical education. To mitigate these risks, educational institutions should implement clear guidelines for AI use, including regular audits of AI performance and mandatory training for students and faculty on the limitations and ethical considerations of AI tools. Additionally, establishing dedicated oversight committees to monitor AI integration and address any emerging issues in real time will be crucial to ensuring these technologies are used responsibly and effectively.</p></sec><sec id="s4-8"><title>Study Limitations</title><p>This study contains several limitations. 
The 750 MCQs are robust, although they are &#x201C;USMLE-style&#x201D; questions and not actual USMLE exam questions. The exclusion of clinical vignettes involving imaging findings limits the findings to text-based accuracy, which potentially skews the assessment of disciplinary accuracies, particularly in disciplines such as anatomy, microbiology, and histopathology. Additionally, the study does not fully explore the quality of the explanations generated by the AI or its ability to handle complex, higher-order information, which are crucial components of medical education and clinical practice&#x2014;factors that are essential in evaluating the full utility of LLMs in medical education. Previous research has highlighted concerns about the reliability of AI-generated explanations and the risks associated with their use in complex clinical scenarios [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. These limitations are important to consider as they directly impact how well these tools can support clinical reasoning and decision-making processes in real-world scenarios. Moreover, the potential influence of knowledge lagging effects due to the different datasets used by GPT-3.5, GPT-4, and GPT-4o was not explicitly analyzed. Future studies might compare MCQ performance across various years to better understand how the recency of training data affects model accuracy and reliability.</p></sec><sec id="s4-9"><title>Future Research Directions</title><p>Future research should aim to expand the analysis of medical education to incorporate more diverse clinical vignettes, especially those involving imaging and other multimedia content. This would provide a more comprehensive assessment of LLM capabilities. Longitudinal studies are also needed to evaluate the long-term effects of AI integration on learning outcomes and clinical decision-making skills. 
Moreover, investigating methods to mitigate inherent biases in LLMs and exploring the integration of AI with traditional educational methodologies could provide a more balanced view of the potential and limitations of these technologies in medical training.</p></sec><sec id="s4-10"><title>Conclusions</title><p>In conclusion, this study provides an assessment of the response accuracies of the ChatGPT series across a wide array of USMLE preclinical disciplines and clinical clerkships. The significant improvements observed in ChatGPT 4 Omni suggest substantial potential for its use as a tool for medical education. As the utilization of AI by medical students and clinicians increases, our findings emphasize the need for formal curricula and guidelines that ensure proper usage, as well as the necessity of robust validation and oversight processes for LLMs as they are integrated into medical education.</p></sec></sec></body><back><ack><p>The authors would like to acknowledge the invaluable contributions of Jack Citrin, Ben Kronz, Ben Hambright, Maria Evola, and Olivia Smith, whose assistance as members of our research team was instrumental. 
The authors also extend their gratitude to AMBOSS, UWorld, and TrueLearn for providing the multiple-choice questions used in this study, without which this research would not have been feasible.</p></ack><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">GPT-3.5</term><def><p>ChatGPT 3.5</p></def></def-item><def-item><term id="abb3">GPT-4</term><def><p>ChatGPT 4</p></def></def-item><def-item><term id="abb4">GPT-4o</term><def><p>ChatGPT 4 Omni</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb6">MCQ</term><def><p>multiple-choice question</p></def></def-item><def-item><term id="abb7">NBME</term><def><p>National Board of Medical Examiners</p></def></def-item><def-item><term id="abb8">USMLE</term><def><p>United States Medical Licensing Examination</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ayers</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Poliak</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dredze</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Comparing physician and artificial intelligence chatbot responses to patient questions posted to a public social media forum</article-title><source>JAMA Intern Med</source><year>2023</year><month>06</month><day>1</day><volume>183</volume><issue>6</issue><fpage>589</fpage><lpage>596</lpage><pub-id pub-id-type="doi">10.1001/jamainternmed.2023.1838</pub-id><pub-id pub-id-type="medline">37115527</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Baker</surname><given-names>HP</given-names> </name><name name-style="western"><surname>Dwyer</surname><given-names>E</given-names> </name><name name-style="western"><surname>Kalidoss</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hynes</surname><given-names>K</given-names> </name><name name-style="western"><surname>Wolf</surname><given-names>J</given-names> </name><name name-style="western"><surname>Strelzow</surname><given-names>JA</given-names> </name></person-group><article-title>ChatGPT&#x2019;s ability to assist with clinical documentation: a randomized controlled trial</article-title><source>J Am Acad Orthop Surg</source><year>2024</year><month>02</month><day>1</day><volume>32</volume><issue>3</issue><fpage>123</fpage><lpage>129</lpage><pub-id pub-id-type="doi">10.5435/JAAOS-D-23-00474</pub-id><pub-id pub-id-type="medline">37976385</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Haupt</surname><given-names>CE</given-names> </name><name name-style="western"><surname>Marks</surname><given-names>M</given-names> </name></person-group><article-title>AI-generated medical advice-GPT and beyond</article-title><source>J Am Med Assoc</source><year>2023</year><month>04</month><day>25</day><volume>329</volume><issue>16</issue><fpage>1349</fpage><lpage>1350</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.5321</pub-id><pub-id pub-id-type="medline">36972070</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kann</surname><given-names>BH</given-names> </name><name 
name-style="western"><surname>Foote</surname><given-names>MB</given-names> </name><etal/></person-group><article-title>Use of artificial intelligence chatbots for cancer treatment information</article-title><source>JAMA Oncol</source><year>2023</year><month>10</month><day>1</day><volume>9</volume><issue>10</issue><fpage>1459</fpage><lpage>1462</lpage><pub-id pub-id-type="doi">10.1001/jamaoncol.2023.2954</pub-id><pub-id pub-id-type="medline">37615976</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>R</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>A</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>JH</given-names> </name></person-group><article-title>How chatbots and large language model artificial intelligence systems will reshape modern medicine: fountain of creativity or Pandora&#x2019;s box?</article-title><source>JAMA Intern Med</source><year>2023</year><month>06</month><day>1</day><volume>183</volume><issue>6</issue><fpage>596</fpage><lpage>597</lpage><pub-id pub-id-type="doi">10.1001/jamainternmed.2023.1835</pub-id><pub-id pub-id-type="medline">37115531</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Feng</surname><given-names>S</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>Y</given-names> </name></person-group><article-title>ChatGPT and the future of medical education</article-title><source>Acad Med</source><year>2023</year><month>08</month><day>1</day><volume>98</volume><issue>8</issue><fpage>867</fpage><lpage>868</lpage><pub-id pub-id-type="doi">10.1097/ACM.0000000000005242</pub-id><pub-id pub-id-type="medline">37162219</pub-id></nlm-citation></ref><ref 
id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>M&#x00FC;ller</surname><given-names>MEB</given-names> </name><name name-style="western"><surname>Laupichler</surname><given-names>MC</given-names> </name></person-group><article-title>Medical students learning about AI - with AI?</article-title><source>Med Educ</source><year>2023</year><month>11</month><volume>57</volume><issue>11</issue><fpage>1156</fpage><pub-id pub-id-type="doi">10.1111/medu.15211</pub-id><pub-id pub-id-type="medline">37712554</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kirpalani</surname><given-names>A</given-names> </name><name name-style="western"><surname>Grimmer</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>PZT</given-names> </name></person-group><article-title>Med versus machine: Using ChatGPT in team-based learning</article-title><source>Med Educ</source><year>2023</year><month>11</month><volume>57</volume><issue>11</issue><fpage>1159</fpage><lpage>1160</lpage><pub-id pub-id-type="doi">10.1111/medu.15226</pub-id><pub-id pub-id-type="medline">37709349</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abouzeid</surname><given-names>E</given-names> </name><name name-style="western"><surname>Harris</surname><given-names>P</given-names> </name></person-group><article-title>Using AI to produce problem-based learning cases</article-title><source>Med Educ</source><year>2023</year><month>11</month><volume>57</volume><issue>11</issue><fpage>1154</fpage><lpage>1155</lpage><pub-id pub-id-type="doi">10.1111/medu.15213</pub-id><pub-id pub-id-type="medline">37705173</pub-id></nlm-citation></ref><ref 
id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Heacock</surname><given-names>L</given-names> </name><name name-style="western"><surname>Elias</surname><given-names>J</given-names> </name><etal/></person-group><article-title>ChatGPT and other large language models are double-edged swords</article-title><source>Radiology</source><year>2023</year><month>04</month><volume>307</volume><issue>2</issue><fpage>e230163</fpage><pub-id pub-id-type="doi">10.1148/radiol.230163</pub-id><pub-id pub-id-type="medline">36700838</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sallam</surname><given-names>M</given-names> </name></person-group><article-title>ChatGPT utility in healthcare education, research, and practice: systematic review on the promising perspectives and valid concerns</article-title><source>Healthcare (Basel)</source><year>2023</year><month>03</month><day>19</day><volume>11</volume><issue>6</issue><fpage>887</fpage><pub-id pub-id-type="doi">10.3390/healthcare11060887</pub-id><pub-id pub-id-type="medline">36981544</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>P</given-names> </name><name name-style="western"><surname>Bubeck</surname><given-names>S</given-names> </name><name name-style="western"><surname>Petro</surname><given-names>J</given-names> </name></person-group><article-title>Benefits, limits, and risks of GPT-4 as an AI chatbot for medicine</article-title><source>N Engl J 
Med</source><year>2023</year><month>03</month><day>30</day><volume>388</volume><issue>13</issue><fpage>1233</fpage><lpage>1239</lpage><pub-id pub-id-type="doi">10.1056/NEJMsr2214184</pub-id><pub-id pub-id-type="medline">36988602</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lawrence</surname><given-names>ECN</given-names> </name><name name-style="western"><surname>Dine</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Kogan</surname><given-names>JR</given-names> </name></person-group><article-title>Preclerkship medical students&#x2019; use of third-party learning resources</article-title><source>JAMA Netw Open</source><year>2023</year><month>12</month><day>1</day><volume>6</volume><issue>12</issue><fpage>e2345971</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.45971</pub-id><pub-id pub-id-type="medline">38048132</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Burk-Rafel</surname><given-names>J</given-names> </name><name name-style="western"><surname>Santen</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Purkiss</surname><given-names>J</given-names> </name></person-group><article-title>Study behaviors and USMLE step 1 performance: implications of a student self-directed parallel curriculum</article-title><source>Acad Med</source><year>2017</year><volume>92</volume><issue>11S</issue><fpage>S67</fpage><lpage>S74</lpage><pub-id pub-id-type="doi">10.1097/ACM.0000000000001916</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>JH</given-names> </name><name 
name-style="western"><surname>Gruppuso</surname><given-names>PA</given-names> </name><name name-style="western"><surname>Adashi</surname><given-names>EY</given-names> </name></person-group><article-title>The self-directed medical student curriculum</article-title><source>J Am Med Assoc</source><year>2021</year><month>11</month><day>23</day><volume>326</volume><issue>20</issue><fpage>2005</fpage><lpage>2006</lpage><pub-id pub-id-type="doi">10.1001/jama.2021.16312</pub-id><pub-id pub-id-type="medline">34724030</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mihalache</surname><given-names>A</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>RS</given-names> </name><name name-style="western"><surname>Popovic</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Muni</surname><given-names>RH</given-names> </name></person-group><article-title>ChatGPT-4: an assessment of an upgraded artificial intelligence chatbot in the United States Medical Licensing Examination</article-title><source>Med Teach</source><year>2024</year><month>03</month><volume>46</volume><issue>3</issue><fpage>366</fpage><lpage>372</lpage><pub-id pub-id-type="doi">10.1080/0142159X.2023.2249588</pub-id><pub-id pub-id-type="medline">37839017</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ali</surname><given-names>R</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>OY</given-names> </name><name name-style="western"><surname>Connolly</surname><given-names>ID</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT, GPT-4, and Google Bard on a neurosurgery oral boards preparation question 
bank</article-title><source>Neurosurgery</source><year>2023</year><month>11</month><day>1</day><volume>93</volume><issue>5</issue><fpage>1090</fpage><lpage>1098</lpage><pub-id pub-id-type="doi">10.1227/neu.0000000000002551</pub-id><pub-id pub-id-type="medline">37306460</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rizzo</surname><given-names>MG</given-names> </name><name name-style="western"><surname>Cai</surname><given-names>N</given-names> </name><name name-style="western"><surname>Constantinescu</surname><given-names>D</given-names> </name></person-group><article-title>The performance of ChatGPT on orthopaedic in-service training exams: a comparative study of the GPT-3.5 turbo and GPT-4 models in orthopaedic education</article-title><source>J Orthop</source><year>2024</year><month>04</month><volume>50</volume><fpage>70</fpage><lpage>75</lpage><pub-id pub-id-type="doi">10.1016/j.jor.2023.11.056</pub-id><pub-id pub-id-type="medline">38173829</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Garabet</surname><given-names>R</given-names> </name><name name-style="western"><surname>Mackey</surname><given-names>BP</given-names> </name><name name-style="western"><surname>Cross</surname><given-names>J</given-names> </name><name name-style="western"><surname>Weingarten</surname><given-names>M</given-names> </name></person-group><article-title>ChatGPT-4 performance on USMLE step 1 style questions and its implications for medical education: a comparative study across systems and disciplines</article-title><source>Med Sci Educ</source><year>2024</year><month>02</month><volume>34</volume><issue>1</issue><fpage>145</fpage><lpage>152</lpage><pub-id pub-id-type="doi">10.1007/s40670-023-01956-z</pub-id><pub-id 
pub-id-type="medline">38510401</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gilson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Safranek</surname><given-names>CW</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>T</given-names> </name><etal/></person-group><article-title>How does ChatGPT perform on the United States Medical Licensing Examination (USMLE)? The implications of large language models for medical education and knowledge assessment</article-title><source>JMIR Med Educ</source><year>2023</year><month>02</month><day>8</day><volume>9</volume><fpage>e45312</fpage><pub-id pub-id-type="doi">10.2196/45312</pub-id><pub-id pub-id-type="medline">36753318</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kung</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Cheatham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Medenilla</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title><source>PLOS Digit Health</source><year>2023</year><month>02</month><volume>2</volume><issue>2</issue><fpage>e0000198</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id><pub-id pub-id-type="medline">36812645</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Nori</surname><given-names>H</given-names> </name><name name-style="western"><surname>King</surname><given-names>N</given-names> </name><name 
name-style="western"><surname>McKinney</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Carignan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Horvitz</surname><given-names>E</given-names> </name></person-group><article-title>Capabilities of GPT-4 on medical challenge problems</article-title><source>arXiv</source><comment>Preprint posted online on Mar 20, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.13375</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>AMBOSS Support</collab></person-group><source>Program overview</source><access-date>2024-05-06</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://support.amboss.com/hc/en-us/articles/15744010801169-Program-Overview">https://support.amboss.com/hc/en-us/articles/15744010801169-Program-Overview</ext-link></comment></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Jin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>E</given-names> </name><name name-style="western"><surname>Oufattole</surname><given-names>N</given-names> </name><name name-style="western"><surname>Weng</surname><given-names>WH</given-names> </name><name name-style="western"><surname>Fang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Szolovits</surname><given-names>P</given-names> </name></person-group><article-title>What disease does this patient have? 
A large-scale open domain question answering dataset from medical exams</article-title><source>arXiv</source><comment>Preprint posted online on Sep 28, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2009.13081</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Brin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sorin</surname><given-names>V</given-names> </name><name name-style="western"><surname>Vaid</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Comparing ChatGPT and GPT-4 performance in USMLE soft skill assessments</article-title><source>Sci Rep</source><year>2023</year><month>10</month><day>1</day><volume>13</volume><issue>1</issue><fpage>16492</fpage><pub-id pub-id-type="doi">10.1038/s41598-023-43436-9</pub-id><pub-id pub-id-type="medline">37779171</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yaneva</surname><given-names>V</given-names> </name><name name-style="western"><surname>Baldwin</surname><given-names>P</given-names> </name><name name-style="western"><surname>Jurich</surname><given-names>DP</given-names> </name><name name-style="western"><surname>Swygert</surname><given-names>K</given-names> </name><name name-style="western"><surname>Clauser</surname><given-names>BE</given-names> </name></person-group><article-title>Examining ChatGPT Performance on USMLE Sample Items and Implications for Assessment</article-title><source>Acad Med</source><year>2024</year><month>02</month><day>1</day><volume>99</volume><issue>2</issue><fpage>192</fpage><lpage>197</lpage><pub-id pub-id-type="doi">10.1097/ACM.0000000000005549</pub-id><pub-id pub-id-type="medline">37934828</pub-id></nlm-citation></ref><ref 
id="ref27"><label>27</label><nlm-citation citation-type="book"><person-group person-group-type="author"><collab>National Board of Medical Examiners</collab></person-group><article-title>Subject examination content: basic science</article-title><source>NBME Subject Examinations: Program Guide</source><year>2023</year><comment><ext-link ext-link-type="uri" xlink:href="https://www.nbme.org/sites/default/files/2022-10/NBME_Subject_Exam_Program_Guide.pdf">https://www.nbme.org/sites/default/files/2022-10/NBME_Subject_Exam_Program_Guide.pdf</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="web"><article-title>Introducing ChatGPT</article-title><source>OpenAI</source><year>2022</year><access-date>2024-06-06</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/index/chatgpt">https://openai.com/index/chatgpt</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="web"><article-title>GPT-4: OpenAI&#x2019;s most advanced system</article-title><source>OpenAI</source><access-date>2024-06-06</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/index/gpt-4">https://openai.com/index/gpt-4</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="web"><article-title>Hello GPT-4o: introducing our new flagship model GPT-4o</article-title><source>OpenAI</source><year>2024</year><access-date>2024-06-06</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/index/hello-gpt-4o">https://openai.com/index/hello-gpt-4o</ext-link></comment></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mihalache</surname><given-names>A</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>RS</given-names> </name><name 
name-style="western"><surname>Popovic</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Muni</surname><given-names>RH</given-names> </name></person-group><article-title>Performance of an upgraded artificial intelligence chatbot for ophthalmic knowledge assessment</article-title><source>JAMA Ophthalmol</source><year>2023</year><month>08</month><day>1</day><volume>141</volume><issue>8</issue><fpage>798</fpage><lpage>800</lpage><pub-id pub-id-type="doi">10.1001/jamaophthalmol.2023.2754</pub-id><pub-id pub-id-type="medline">37440220</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miao</surname><given-names>J</given-names> </name><name name-style="western"><surname>Thongprayoon</surname><given-names>C</given-names> </name><name name-style="western"><surname>Cheungpasitporn</surname><given-names>W</given-names> </name></person-group><article-title>Assessing the accuracy of ChatGPT on core questions in glomerular disease</article-title><source>Kidney Int Rep</source><year>2023</year><month>08</month><volume>8</volume><issue>8</issue><fpage>1657</fpage><lpage>1659</lpage><pub-id pub-id-type="doi">10.1016/j.ekir.2023.05.014</pub-id><pub-id pub-id-type="medline">37547515</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Meo</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Al-Masri</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Alotaibi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Meo</surname><given-names>MZS</given-names> </name><name name-style="western"><surname>Meo</surname><given-names>MOS</given-names> </name></person-group><article-title>ChatGPT knowledge evaluation in basic and 
clinical medical sciences: multiple choice question examination-based performance</article-title><source>Healthcare (Basel)</source><year>2023</year><month>07</month><day>17</day><volume>11</volume><issue>14</issue><fpage>2046</fpage><pub-id pub-id-type="doi">10.3390/healthcare11142046</pub-id><pub-id pub-id-type="medline">37510487</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hoch</surname><given-names>CC</given-names> </name><name name-style="western"><surname>Wollenberg</surname><given-names>B</given-names> </name><name name-style="western"><surname>L&#x00FC;ers</surname><given-names>JC</given-names> </name><etal/></person-group><article-title>ChatGPT&#x2019;s quiz skills in different otolaryngology subspecialties: an analysis of 2576 single-choice and multiple-choice board certification preparation questions</article-title><source>Eur Arch Otorhinolaryngol</source><year>2023</year><month>09</month><volume>280</volume><issue>9</issue><fpage>4271</fpage><lpage>4278</lpage><pub-id pub-id-type="doi">10.1007/s00405-023-08051-4</pub-id><pub-id pub-id-type="medline">37285018</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>TC</given-names> </name><name name-style="western"><surname>Multala</surname><given-names>E</given-names> </name><name name-style="western"><surname>Kearns</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Assessment of ChatGPT&#x2019;s performance on neurology written board examination questions</article-title><source>BMJ Neurol Open</source><year>2023</year><volume>5</volume><issue>2</issue><fpage>e000530</fpage><pub-id pub-id-type="doi">10.1136/bmjno-2023-000530</pub-id><pub-id 
pub-id-type="medline">37936648</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kanaya</surname><given-names>T</given-names> </name><name name-style="western"><surname>Magine</surname><given-names>A</given-names> </name></person-group><article-title>How can the current state of AI guide future conversations of general intelligence?</article-title><source>J Intell</source><year>2024</year><month>03</month><day>20</day><volume>12</volume><issue>3</issue><fpage>36</fpage><pub-id pub-id-type="doi">10.3390/jintelligence12030036</pub-id><pub-id pub-id-type="medline">38535170</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abd-Alrazaq</surname><given-names>A</given-names> </name><name name-style="western"><surname>AlSaad</surname><given-names>R</given-names> </name><name name-style="western"><surname>Alhuwail</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Large language models in medical education: opportunities, challenges, and future directions</article-title><source>JMIR Med Educ</source><year>2023</year><month>06</month><day>1</day><volume>9</volume><fpage>e48291</fpage><pub-id pub-id-type="doi">10.2196/48291</pub-id><pub-id pub-id-type="medline">37261894</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Balas</surname><given-names>M</given-names> </name><name name-style="western"><surname>Wadden</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>H&#x00E9;bert</surname><given-names>PC</given-names> </name><etal/></person-group><article-title>Exploring the potential utility of AI large language models for medical ethics: an 
expert panel evaluation of GPT-4</article-title><source>J Med Ethics</source><year>2024</year><month>01</month><day>23</day><volume>50</volume><issue>2</issue><fpage>90</fpage><lpage>96</lpage><pub-id pub-id-type="doi">10.1136/jme-2023-109549</pub-id><pub-id pub-id-type="medline">37945336</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Reese</surname><given-names>JT</given-names> </name><name name-style="western"><surname>Danis</surname><given-names>D</given-names> </name><name name-style="western"><surname>Caufield</surname><given-names>JH</given-names> </name><etal/></person-group><article-title>On the limitations of large language models in clinical diagnosis</article-title><source>medRxiv</source><comment>Preprint posted online on Feb 26, 2024</comment><pub-id pub-id-type="doi">10.1101/2023.07.13.23292613</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Savage</surname><given-names>T</given-names> </name><name name-style="western"><surname>Nayak</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gallo</surname><given-names>R</given-names> </name><name name-style="western"><surname>Rangan</surname><given-names>E</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>JH</given-names> </name></person-group><article-title>Diagnostic reasoning prompts reveal the potential for large language model interpretability in medicine</article-title><source>NPJ Digit Med</source><year>2024</year><month>01</month><day>24</day><volume>7</volume><issue>1</issue><fpage>20</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01010-1</pub-id><pub-id pub-id-type="medline">38267608</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chin</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Afsar-Manesh</surname><given-names>N</given-names> </name><name name-style="western"><surname>Bierman</surname><given-names>AS</given-names> </name><etal/></person-group><article-title>Guiding principles to address the impact of algorithm bias on racial and ethnic disparities in health and health care</article-title><source>JAMA Netw Open</source><year>2023</year><month>12</month><day>1</day><volume>6</volume><issue>12</issue><fpage>e2345050</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.45050</pub-id><pub-id pub-id-type="medline">38100101</pub-id></nlm-citation></ref></ref-list></back></article>