<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id><journal-id journal-id-type="publisher-id">mededu</journal-id><journal-id journal-id-type="index">20</journal-id><journal-title>JMIR Medical Education</journal-title><abbrev-journal-title>JMIR Med Educ</abbrev-journal-title><issn pub-type="epub">2369-3762</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e52784</article-id><article-id pub-id-type="doi">10.2196/52784</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Influence of Model Evolution and System Roles on ChatGPT&#x2019;s Performance in Chinese Medical Licensing Exams: Comparative Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Ming</surname><given-names>Shuai</given-names></name><degrees>DM</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Guo</surname><given-names>Qingge</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Cheng</surname><given-names>Wenjun</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib 
contrib-type="author" corresp="yes"><name name-style="western"><surname>Lei</surname><given-names>Bo</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Ophthalmology, Henan Eye Hospital, Henan Provincial People&#x2019;s Hospital</institution>, <addr-line>Zhengzhou</addr-line>, <country>China</country></aff><aff id="aff2"><institution>Eye Institute, Henan Academy of Innovations in Medical Science</institution>, <addr-line>Zhengzhou</addr-line>, <country>China</country></aff><aff id="aff3"><institution>Henan Clinical Research Center for Ocular Diseases, People&#x2019;s Hospital of Zhengzhou University</institution>, <addr-line>Zhengzhou</addr-line>, <country>China</country></aff><aff id="aff4"><institution>Department of Ophthalmology, People&#x2019;s Hospital of Zhengzhou University</institution>, <addr-line>Zhengzhou</addr-line>, <country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Lesselroth</surname><given-names>Blake</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Mihalache</surname><given-names>Andrew</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Yang</surname><given-names>Rui</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Bo Lei, MD, PhD, Department of Ophthalmology, Henan Eye Hospital, Henan Provincial People&#x2019;s Hospital, No.7 Weiwu Road, Zhengzhou, 450003, China, 86 18838105740; <email>bolei99@126.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2024</year></pub-date><pub-date pub-type="epub"><day>13</day><month>8</month><year>2024</year></pub-date><volume>10</volume><elocation-id>e52784</elocation-id><history><date 
date-type="received"><day>15</day><month>09</month><year>2023</year></date><date date-type="rev-recd"><day>20</day><month>05</month><year>2024</year></date><date date-type="accepted"><day>20</day><month>06</month><year>2024</year></date></history><copyright-statement>&#x00A9; Shuai Ming, Qingge Guo, Wenjun Cheng, Bo Lei. Originally published in JMIR Medical Education (<ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org">https://mededu.jmir.org</ext-link>), 13.8.2024. </copyright-statement><copyright-year>2024</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org/">https://mededu.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mededu.jmir.org/2024/1/e52784"/><abstract><sec><title>Background</title><p>With the increasing application of large language models like ChatGPT in various industries, its potential in the medical domain, especially in standardized examinations, has become a focal point of research.</p></sec><sec><title>Objective</title><p>The aim of this study is to assess the clinical performance of ChatGPT, focusing on its accuracy and reliability in the Chinese National Medical Licensing Examination (CNMLE).</p></sec><sec sec-type="methods"><title>Methods</title><p>The CNMLE 2022 question set, consisting of 600 single-answer multiple-choice questions, was reclassified into 15 medical subspecialties. Each question was tested 8 to 12 times in Chinese on the OpenAI platform from April 24 to May 15, 2023. Three key factors were considered: the version of GPT-3.5 and 4.0, the prompt&#x2019;s designation of system roles tailored to medical subspecialties, and repetition for coherence. A passing accuracy threshold was established as 60%. The <italic>&#x03C7;</italic><sup>2</sup> tests and &#x03BA; values were employed to evaluate the model&#x2019;s accuracy and consistency.</p></sec><sec sec-type="results"><title>Results</title><p>GPT-4.0 achieved a passing accuracy of 72.7%, which was significantly higher than that of GPT-3.5 (54%; <italic>P</italic>&#x003C;.001). The variability rate of repeated responses from GPT-4.0 was lower than that of GPT-3.5 (9% vs 19.5%; <italic>P</italic>&#x003C;.001). However, both models showed relatively good response coherence, with &#x03BA; values of 0.778 and 0.610, respectively. 
System roles numerically increased accuracy for both GPT-4.0 (0.3%&#x2010;3.7%) and GPT-3.5 (1.3%&#x2010;4.5%), and reduced variability by 1.7% and 1.8%, respectively (<italic>P</italic>&#x003E;.05). In subgroup analysis, ChatGPT achieved comparable accuracy among different question types (<italic>P</italic>&#x003E;.05). GPT-4.0 surpassed the accuracy threshold in 14 of 15 subspecialties, while GPT-3.5 did so in 7 of 15 on the first response.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>GPT-4.0 passed the CNMLE and outperformed GPT-3.5 in key areas such as accuracy, consistency, and medical subspecialty expertise. Adding a system role insignificantly enhanced the model&#x2019;s reliability and answer coherence. GPT-4.0 showed promising potential in medical education and clinical practice, meriting further study.</p></sec></abstract><kwd-group><kwd>ChatGPT</kwd><kwd>Chinese National Medical Licensing Examination</kwd><kwd>large language models</kwd><kwd>medical education</kwd><kwd>system role</kwd><kwd>LLM</kwd><kwd>LLMs</kwd><kwd>language model</kwd><kwd>language models</kwd><kwd>artificial intelligence</kwd><kwd>chatbot</kwd><kwd>chatbots</kwd><kwd>conversational agent</kwd><kwd>conversational agents</kwd><kwd>exam</kwd><kwd>exams</kwd><kwd>examination</kwd><kwd>examinations</kwd><kwd>OpenAI</kwd><kwd>answer</kwd><kwd>answers</kwd><kwd>response</kwd><kwd>responses</kwd><kwd>accuracy</kwd><kwd>performance</kwd><kwd>China</kwd><kwd>Chinese</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>ChatGPT, a general large language model (LLM) developed by OpenAI, has gained substantial attention since its launch on November 30, 2022. Known for its advanced natural language processing capabilities, ChatGPT has the potential to make significant impacts on many industries, including medical education. 
Its performance in medicine was first tested at or near the passing threshold of the United States Medical Licensing Examination (USMLE) [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. While ChatGPT&#x2019;s accuracy varies across languages [<xref ref-type="bibr" rid="ref3">3</xref>], it has been tested on a series of medical exams like the Japanese National Medical Licensing Examination in languages including English [<xref ref-type="bibr" rid="ref4">4</xref>], Chinese [<xref ref-type="bibr" rid="ref5">5</xref>], Dutch [<xref ref-type="bibr" rid="ref6">6</xref>], Japanese [<xref ref-type="bibr" rid="ref7">7</xref>], and Korean [<xref ref-type="bibr" rid="ref8">8</xref>]. The research scope related to ChatGPT has expanded to medical education in fields like nuclear medicine [<xref ref-type="bibr" rid="ref9">9</xref>], neurosurgery [<xref ref-type="bibr" rid="ref10">10</xref>], ophthalmology [<xref ref-type="bibr" rid="ref11">11</xref>], general chemistry, nursing [<xref ref-type="bibr" rid="ref12">12</xref>], life support [<xref ref-type="bibr" rid="ref4">4</xref>], dentology [<xref ref-type="bibr" rid="ref13">13</xref>], and radiation oncology physics [<xref ref-type="bibr" rid="ref14">14</xref>]. Overall, while ChatGPT demonstrates heterogeneous capabilities, it shows promising potential in these medical specialties.</p><p>Several factors might influence ChatGPT&#x2019;s performance. First, the updated version of ChatGPT, GPT-4, understands and generates natural language in more complex and nuanced scenarios, leading to more accurate responses [<xref ref-type="bibr" rid="ref15">15</xref>], which is important in analyzing complex clinical case questions [<xref ref-type="bibr" rid="ref16">16</xref>]. Thus, GPT-4 conclusively demonstrated significantly better performance than GPT-3.5, as evidenced by various official medical exams [<xref ref-type="bibr" rid="ref8">8</xref>]. 
Besides the model version, ChatGPT allows users to guide its behavior by adding prompts that describe its system role. These system roles influence the direction of ChatGPT&#x2019;s answers and may affect its reliability. However, the impact of these system roles on ChatGPT&#x2019;s performance in the medical field has not yet been investigated. As a professional chatbot tool, ChatGPT uses sampling to predict the next token with varying distribution probabilities, ensuring responses are varied and natural in real-world applications. Zhu et al [<xref ref-type="bibr" rid="ref17">17</xref>] have found that composite answers derived from repeated questioning can enhance the accuracy of ChatGPT. Typically, 2 or 3 repeated responses are necessary to ensure response stability [<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref20">20</xref>].</p><p>Currently, peer-reviewed research on the performance of ChatGPT in the Chinese National Medical Licensing Examination (CNMLE) remains scarce. This study aimed to evaluate the performance of ChatGPT in answering CNMLE questions in the clinical setting of China, with consideration of the version of ChatGPT and system role.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>The CNMLE 2022 Question Data</title><p>As an industry admission examination, passing the CNMLE means that a medical practitioner meets the minimum medical competencies. The written part of the examination, which emphasizes medical knowledge and clinical decision-making skills, is created and supervised by the Chinese National Medical Examination Center (NMEC). In 2021, the CNMLE transitioned from the traditional paper-based format to a computer-based examination. Each candidate is presented with 600 questions, arranged in a slightly varied order, from the exam year&#x2019;s question data set. 
According to OpenAI&#x2019;s introduction, ChatGPT&#x2019;s responses are based on information available up to September 2021. Thus, we selected the CNMLE 2022 questions, which were purchased from a web-based bookstore [<xref ref-type="bibr" rid="ref21">21</xref>], for our evaluation. This choice ensured that the questions had not been previously encountered and trained by the model. The publisher has confirmed that these released questions are the original ones from the examination.</p><p>The CNMLE 2022 covered 600 single-answer multiple-choice questions, which were evenly divided into 4 units [<xref ref-type="bibr" rid="ref22">22</xref>]. Each unit had 4 specific question types: A1, the single-sentence optimal choice questions; A2, case summary optimal choice questions; A3/A4, case group optimal choice questions; and B1, standard combination questions. Detailed explanations of each question type were conveyed to ChatGPT via a structured prompt prior to inquiry (see <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The CNMLE 2022 questions did not involve table or image-based questions. Therefore, ChatGPT, despite lacking multimodal capabilities, was still suited to effectively complete the test.</p><p>According to the introduction of the Chinese NMEC [<xref ref-type="bibr" rid="ref22">22</xref>], each examination unit always addresses specific medical subspecialties. Unit 1 covers medical knowledge, policies, regulations, and preventive medicine; unit 2 mainly pertains to the cardiovascular, urinary, musculoskeletal, and endocrine systems; unit 3 involves the digestive, respiratory, and associated systems; unit 4 focuses on obstetrics and gynecology, pediatrics, and neurological or psychiatric domains. However, such distribution is not absolute. Therefore, 2 clinicians independently reclassified the 600 questions into 15 medical subspecialties, resolving discrepancies through discussion. 
The &#x03BA; value for the result of their classifications was 0.935. The Sankey diagram of the 3 question classifications, medical subspecialties, units, and types is shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>The Sankey diagram of the 3 question classifications: the medical subspecialties, units, and types. STD: sexually transmitted disease.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v10i1e52784_fig01.png"/></fig></sec><sec id="s2-2"><title>Instructions Before Testing Part</title><p>Before manually inputting questions, ChatGPT was informed about an upcoming series of queries. ChatGPT needed to identify the most plausible response from the available options and explain the reasoning behind its selection. The question types determined the relevant lead-in prompts provided. For the A1 and A2 question types, each input question was deemed independent, rendering any interquestion relationships irrelevant. In contrast, A3/A4 question types implied that multiple questions within a single clinical case shared a connection. However, individual clinical cases were treated as discrete entities, eliminating the need to consider relationships between them. For the B1 question type, 5 shared options were given. ChatGPT needed to identify the correct answers for subsequent questions. Chaining was used in A3/A4 and B1 question types to ensure that multiple questions within a single clinical case in A3/A4 shared the same context, and multiple questions in B1 shared the same options. The number of questions inputted at one time depended on the text&#x2019;s length, such as 5&#x2010;8 questions for A1/A2 types. 
If necessary, ChatGPT was forced to disregard prior conversational content and commence a fresh chat.</p></sec><sec id="s2-3"><title>Temperature</title><p>The temperature parameter in ChatGPT influences the randomness of the model&#x2019;s responses. A higher temperature yields more varied and creative answers. In our study, we did not manually adjust the temperature; instead, we used the default setting on the OpenAI platform, commonly at 0.7, to simulate real-world user interactions on the front end. This balance between typical user habits and diverse thought processes was intentional. The default relatively high temperature was expected to enable ChatGPT to generate more creative reasoning processes while still arriving at accurate answers.</p></sec><sec id="s2-4"><title>Testing Strategy</title><p>All the CNMLE 2022 questions were tested in Chinese according to the following 2 factors:</p><list list-type="order"><list-item><p>ChatGPT model selection. Both GPT-3.5 (version from March 23) and GPT-4.0 (version from May 3) were rigorously evaluated on the OpenAI platform from April 24 to May 15, 2023, to ascertain any evolution in the model&#x2019;s capability in the medical domain.</p></list-item><list-item><p>System role. This refers to the specific identity or role, such as &#x201C;gastroenterology specialist,&#x201D; assigned to ChatGPT to determine if relevant knowledge is applied more accurately. Questions were evaluated both with and without assigning a system role related to the 15 specific clinical subspecialties. This system role was designated by providing a tailored system prompt before the testing instructions, aiming to guide ChatGPT&#x2019;s approach and align it with specialist viewpoints in the relevant medical field.</p></list-item></list></sec><sec id="s2-5"><title>Testing Process</title><p>Considering the evaluation of the ChatGPT model, system role, and response coherence, each question was tested 8&#x2010;12 times. 
The prompts included those specific to question types, the assignment of system role, and the use of chaining. Slight modifications in these prompts were adopted to avoid potential systematic errors introduced by rigid wording. For example, the prompt &#x201C;Assume you are a gastroenterology specialist&#x201D; might vary as &#x201C;Assume you are highly proficient in gastroenterology.&#x201D; For coherence evaluation, each question was presented again to ChatGPT. If the regenerated response matched the initial answer, the process was halted. However, if the 2 responses differed, the question was posed once more to ChatGPT.</p></sec><sec id="s2-6"><title>Response Determination</title><p>The first and second responses from ChatGPT were directly assessed against the given standard answers for accuracy. For the final response (referred to as joint response), if 2 of the 3 answers were consistent, this was taken as the conclusive answer and evaluated against the standard. However, if the 3 responses were all distinct, it was automatically marked as incorrect without any further comparison to the standard answer.</p><p>The first response was more applicable to assessing whether ChatGPT could pass the CNMLE in the same situation as a student examinee. In contrast, the joint response represented an overall accuracy (the proportion of questions answered correctly at least twice) [<xref ref-type="bibr" rid="ref17">17</xref>], which was more suitable for demonstrating the potential of ChatGPT in medical education.</p><p>According to the announcement from the CNMLE Committee of the National Health Commission of China, the passing score for licensed physicians is 360 points, which means an accuracy rate of 60% or above is considered a pass.</p></sec><sec id="s2-7"><title>Statistical Analysis</title><p>Data were collected and managed using Excel software. The statistical analyses were conducted with SPSS (version 26.0.0; IBM Corp). 
A <italic>&#x03C7;</italic><sup>2</sup> test was used to compare the accuracy of CNMLE question responses between different testing strategies and subgroups of question types. Variability was calculated by the number of consistently correct or wrong answers in 2 repeated responses divided by the total number of questions (600). Additionally, the &#x03BA; statistic was used to evaluate answer consistency. A difference was considered statistically significant when <italic>P</italic>&#x003C;.05.</p></sec><sec id="s2-8"><title>Ethical Considerations</title><p>This study collected information that was already published in the bookstore and did not involve human subjects; therefore, approval by the Institutional Review Board of Henan Provincial People&#x2019;s Hospital was not required.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Accuracy and System Role Assignment</title><p>In model comparison, GPT-3.5 achieved an initial accuracy of 54% (324/600) and did not meet the exam criteria. Conversely, GPT-4.0 achieved a passing accuracy of 72.7% (436/600), which was significantly higher than GPT-3.5 (<italic>P</italic>&#x003C;.001). Similarly, with a designated system role, GPT-4.0 still exhibited higher accuracy than GPT-3.5 (73% vs 55.3%; <italic>P</italic>&#x003C;.001).</p><p>Upon system role assignment, both GPT-3.5 and GPT-4.0 showed a slight increase in accuracy compared to when no role was assigned; specifically, 55.3% (332/600) from 54% (324/600) for GPT-3.5 (<italic>P</italic>&#x003E;.05) and 73% (438/600) from 72.7% (436/600) for GPT-4.0 (<italic>P</italic>&#x003E;.05).</p><p>The above comparisons for the second and joint responses paralleled the initial results, as shown in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Accuracy of GPT-4.0 and 3.5 with or without SR designation under repeat tests. 
n represents the number of correct answers.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">GPT-3.5, n (%)</td><td align="left" valign="bottom">GPT-4.0, n (%)</td><td align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom">GPT-3.5 + SR<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>, n (%)</td><td align="left" valign="bottom">GPT-4.0 + SR, n (%)</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">IR<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">324 (54.0)</td><td align="left" valign="top">436 (72.7)</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">332 (55.3)</td><td align="left" valign="top">438 (73.0)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">2R<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="top">303 (50.5)</td><td align="left" valign="top">426 (71.0)</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">310 (51.7)</td><td align="left" valign="top">448 (74.7)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">JR<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup></td><td align="left" valign="top">302 (50.3)</td><td align="left" valign="top">435 (72.5)</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">329 (54.8)</td><td align="left" valign="top">437 (72.8)</td><td align="left" valign="top">&#x003C;.001</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>SR: system role.</p></fn><fn id="table1fn2"><p><sup>b</sup>IR: initial response.</p></fn><fn id="table1fn3"><p><sup>c</sup>2R: second response.</p></fn><fn id="table1fn4"><p><sup>d</sup>JR: joint 
response.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Variability of Responses</title><p>The GPT-3.5 model exhibited a variability rate of 19.5% (117/600), which decreased to 17.7% (106/600) upon the designation of a system role. The variability rate for GPT-4.0 was observed at 9% (54/600), and further reduced to 7.3% after a system role was assigned. These results indicated a smaller response variability for GPT-4.0 compared to GPT-3.5, and specifying system roles also decreased the variability rates. Both models showed relatively high coherence between the initial and second response, with &#x03BA; values of 0.778 and 0.610. Detailed information for repeated response can be seen in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p></sec><sec id="s3-3"><title>Accuracy for Subgroups</title><p>For GPT-4.0, when accounting for system role and repeated responses, there was a statistically significant difference in accuracy across the different units for the CNMLE test, with accuracy ranging from 62% (93/150) to 84% (126/150; <italic>P</italic> range from&#x003C;.001 to .01). However, when grouped by question type, the accuracy ranged from 69.4% (145/209) to 83.1% (59/71) without statistical difference (<italic>P</italic>&#x003E;.28).</p><p>In contrast, for GPT-3.5, only the initial response with system role designation showed a statistical difference in accuracy (<italic>P</italic>=.04) for question type subgroups. In other groupings by unit or question type, as well as in subsequent responses, the accuracy remained without significant variations (<italic>P</italic>&#x003E;.14; see <xref ref-type="table" rid="table2">Table 2</xref>).</p><p>Accuracy for initial and joint responses of GPT-3.5/4.0 classified by 15 medical subspecialties is shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>. 
In multiple testing strategies, GPT-4.0 outperformed GPT-3.5 in accuracy for 14 distinct clinical subspecialty questions, consistently surpassing the 60% passing threshold.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Subgroup analysis of accuracy for the 4 sections and 4 question types under different strategies. Data were showed as n (%). Units 1&#x2010;4 were the 4 parts to which the questions belonged, and A1-A2, B1 represented the types of questions. Units 1&#x2010;4 corresponded to distinct clinical subspecialties, with specific details provided in the Methods section.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model strategy</td><td align="left" valign="bottom">Unit 1<break/>(n=150), n (%)</td><td align="left" valign="bottom">Unit 2<break/>(n=150), n (%)</td><td align="left" valign="bottom">Unit 3<break/>(n=150), n (%)</td><td align="left" valign="bottom">Unit 4<break/>(n=150), n (%)</td><td align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom">A1<break/>(n=220), n (%)</td><td align="left" valign="bottom">A2<break/>(n=209), n (%)</td><td align="left" valign="bottom">A3/A4<break/>(n=100), n (%)</td><td align="left" valign="bottom">B1<break/>(n=71), n (%)</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">GPT3.5: IR<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">82 (54.7)</td><td align="left" valign="top">83 (55.3)</td><td align="left" valign="top">88 (58.7)</td><td align="left" valign="top">71 (47.3)</td><td align="left" valign="top">.25</td><td align="left" valign="top">122 (55.5)</td><td align="left" valign="top">109 (52.2)</td><td align="left" valign="top">59 (59.0)</td><td align="left" valign="top">34 (47.9)</td><td align="left" valign="top">.47</td></tr><tr><td align="left" valign="top">GPT3.5: 2R<sup><xref 
ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">71 (47.3)</td><td align="left" valign="top">77 (51.3)</td><td align="left" valign="top">85 (56.7)</td><td align="left" valign="top">70 (46.7)</td><td align="left" valign="top">.28</td><td align="left" valign="top">115 (52.3)</td><td align="left" valign="top">103 (49.3)</td><td align="left" valign="top">57 (57.0)</td><td align="left" valign="top">28 (39.4)</td><td align="left" valign="top">.14</td></tr><tr><td align="left" valign="top">GPT3.5: JR<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">72 (48.0)</td><td align="left" valign="top">75 (50.0)</td><td align="left" valign="top">86 (57.3)</td><td align="left" valign="top">69 (46.0)</td><td align="left" valign="top">.22</td><td align="left" valign="top">114 (51.8)</td><td align="left" valign="top">101 (48.3)</td><td align="left" valign="top">57 (57.0)</td><td align="left" valign="top">30 (42.3)</td><td align="left" valign="top">.24</td></tr><tr><td align="left" valign="top">GPT3.5: IR+ SR<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="top">85 (56.7)</td><td align="left" valign="top">84 (56.0)</td><td align="left" valign="top">91 (60.7)</td><td align="left" valign="top">72 (48.0)</td><td align="left" valign="top">.16</td><td align="left" valign="top">129 (58.6)</td><td align="left" valign="top">113 (54.1)</td><td align="left" valign="top">61 (61.0)</td><td align="left" valign="top">29 (40.8)</td><td align="left" valign="top">.04</td></tr><tr><td align="left" valign="top">GPT3.5: 2R + SR</td><td align="left" valign="top">83 (55.3)</td><td align="left" valign="top">74 (49.3)</td><td align="left" valign="top">82 (54.7)</td><td align="left" valign="top">71 (47.3)</td><td align="left" valign="top">.42</td><td align="left" valign="top">121 (55.0)</td><td align="left" valign="top">102 (48.8)</td><td align="left" valign="top">57 (57.0)</td><td 
align="left" valign="top">30 (42.3)</td><td align="left" valign="top">.15</td></tr><tr><td align="left" valign="top">GPT3.5: JR+ SR</td><td align="left" valign="top">84 (56.0)</td><td align="left" valign="top">80 (53.3)</td><td align="left" valign="top">91 (60.7)</td><td align="left" valign="top">74 (49.3)</td><td align="left" valign="top">.25</td><td align="left" valign="top">126 (57.3)</td><td align="left" valign="top">110 (52.6)</td><td align="left" valign="top">61 (61.0)</td><td align="left" valign="top">32 (45.1)</td><td align="left" valign="top">.16</td></tr><tr><td align="left" valign="top">GPT4.0: IR</td><td align="left" valign="top">102 (68.0)</td><td align="left" valign="top">118 (78.7)</td><td align="left" valign="top">119 (79.3)</td><td align="left" valign="top">97 (64.7)</td><td align="left" valign="top">.006</td><td align="left" valign="top">154 (70.0)</td><td align="left" valign="top">152 (72.7)</td><td align="left" valign="top">79 (79.0)</td><td align="left" valign="top">51 (71.8)</td><td align="left" valign="top">.42</td></tr><tr><td align="left" valign="top">GPT4.0: 2R</td><td align="left" valign="top">100 (66.7)</td><td align="left" valign="top">112 (74.7)</td><td align="left" valign="top">119 (79.3)</td><td align="left" valign="top">95 (63.3)</td><td align="left" valign="top">.009</td><td align="left" valign="top">155 (70.5)</td><td align="left" valign="top">145 (69.4)</td><td align="left" valign="top">76 (76.0)</td><td align="left" valign="top">50 (70.4)</td><td align="left" valign="top">.68</td></tr><tr><td align="left" valign="top">GPT4.0: JR</td><td align="left" valign="top">104 (69.3)</td><td align="left" valign="top">114 (76.0)</td><td align="left" valign="top">121 (80.7)</td><td align="left" valign="top">96 (64.0)</td><td align="left" valign="top">.007</td><td align="left" valign="top">157 (71.4)</td><td align="left" valign="top">146 (69.9)</td><td align="left" valign="top">79 (79.0)</td><td align="left" valign="top">53 (74.6)</td><td 
align="left" valign="top">.37</td></tr><tr><td align="left" valign="top">GPT4.0: IR+ SR</td><td align="left" valign="top">103 (68.7)</td><td align="left" valign="top">116 (77.3)</td><td align="left" valign="top">126 (84.0)</td><td align="left" valign="top">93 (62.0)</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">157 (71.4)</td><td align="left" valign="top">151 (72.2)</td><td align="left" valign="top">72 (72.0)</td><td align="left" valign="top">58 (81.7)</td><td align="left" valign="top">.37</td></tr><tr><td align="left" valign="top">GPT4.0: 2R + SR</td><td align="left" valign="top">104 (69.3)</td><td align="left" valign="top">117 (78.0)</td><td align="left" valign="top">124 (82.7)</td><td align="left" valign="top">103 (68.7)</td><td align="left" valign="top">.01</td><td align="left" valign="top">159 (72.3)</td><td align="left" valign="top">153 (73.2)</td><td align="left" valign="top">77 (77.0)</td><td align="left" valign="top">59 (83.1)</td><td align="left" valign="top">.28</td></tr><tr><td align="left" valign="top">GPT4.0: JR+ SR</td><td align="left" valign="top">101 (67.3)</td><td align="left" valign="top">115 (76.7)</td><td align="left" valign="top">124 (82.7)</td><td align="left" valign="top">97 (64.7)</td><td align="left" valign="top">.001</td><td align="left" valign="top">156 (70.9)</td><td align="left" valign="top">151 (72.2)</td><td align="left" valign="top">73 (73.0)</td><td align="left" valign="top">57 (80.3)</td><td align="left" valign="top">.47</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>IR: initial response.</p></fn><fn id="table2fn2"><p><sup>b</sup>2R: second response.</p></fn><fn id="table2fn3"><p><sup>c</sup>JR: joint response.</p></fn><fn id="table2fn4"><p><sup>d</sup>SR: system role.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Accuracy for GPT-3.5/4.0 classified by 15 medical subspecialties. 
(A) the initial response, (B) the initial response with SR assignment, (C) the joint response, (D) the joint response with SR assignment. SR: system role; STD: sexually transmitted disease.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v10i1e52784_fig02.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Overview</title><p>The CNMLE syllabus outlines the essential knowledge and competencies that physicians need for diagnostic and therapeutic procedures. Acquiring these competencies typically demands that a medical student invest several years in both theoretical education and practical skill development. The application of ChatGPT in medical examinations, particularly within the CNMLE framework, offers a pioneering approach to gauge the potential of LLMs in clinical diagnosis and treatment planning. This study comprehensively assessed ChatGPT&#x2019;s performance in addressing CNMLE questions, focusing on model evolution and system role designation, which has not yet been fully investigated.</p></sec><sec id="s4-2"><title>Model Evolution and Performance</title><p>In our study, GPT-4.0 consistently outperformed GPT-3.5 in accuracy and reliably met the passing criteria set by the CNMLE Committee. Despite GPT-3.5 achieving an accuracy rate of over 50%, it failed to pass the examination. A noncomparison study using GPT-3.5 to test CNMLE 2020&#x2010;2022 achieved an accuracy of 36.5%&#x2010;47% [<xref ref-type="bibr" rid="ref23">23</xref>]. The lower accuracy might be attributed to the fact that the testing was conducted before February, shortly after the release of GPT-3.5. The better performance of GPT-4.0 compared with GPT-3.5 was also reported by Wang et al [<xref ref-type="bibr" rid="ref24">24</xref>]. However, it is noteworthy that their assessment was based on a limited sample of 100 questions, rather than a full set of 600 questions. 
The small sample might have contributed to the overall favorable results (GPT-4.0: 84%; GPT-3.5: 56%). Therefore, our findings might provide a more representative comparison of the real-world performance of GPT-4.0 and 3.5 on the CNMLE.</p><p>Other research on evaluating ChatGPT&#x2019;s accuracy on national medical licensing examinations included assessments of the USMLE [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>] and the Japanese National Medical Licensing Examination [<xref ref-type="bibr" rid="ref7">7</xref>]. The conclusions were similar to ours: while GPT-3.5 was often at or near the passing threshold, GPT-4.0 passed relevant exams and had higher testing accuracy compared to GPT-3.5. This trend was not limited to national medical licensing examinations but also applied to other medical-related examinations. However, the specific accuracy varied across models, possibly due to differences in study countries, testing time, exam content, and other variables. 
A comprehensive review of existing published and non&#x2013;peer-reviewed research findings is available in <xref ref-type="table" rid="table3">Table 3</xref>.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>A review of the existing published and non&#x2013;peer-reviewed research related to ChatGPT performance on medical examinations.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Study</td><td align="left" valign="bottom">Country</td><td align="left" valign="bottom">Test model</td><td align="left" valign="bottom">Examination</td><td align="left" valign="bottom">Data sample, n</td><td align="left" valign="bottom">Passing threshold</td><td align="left" valign="bottom">Accuracy (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Gilson et al [<xref ref-type="bibr" rid="ref1">1</xref>]</td><td align="left" valign="top">United States</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">The United States Medical Licensing Examination Step 1 and Step 2 exams</td><td align="left" valign="top">87&#x2010;102</td><td align="left" valign="top">60%</td><td align="left" valign="top">GPT-3.5: 44.0&#x2010;64.4</td></tr><tr><td align="left" valign="top">Kung et al [<xref ref-type="bibr" rid="ref2">2</xref>]</td><td align="left" valign="top">United States</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">The United States Medical Licensing Exam</td><td align="left" valign="top">376</td><td align="left" valign="top">60%</td><td align="left" valign="top">At or near 60%</td></tr><tr><td align="left" valign="top">Guerra et al [<xref ref-type="bibr" rid="ref25">25</xref>]</td><td align="left" valign="top">United States</td><td align="left" valign="top">GPT-4.0 and 3.5</td><td align="left" valign="top">Congress of Neurological Surgeons Self-Assessment Neurosurgery Exam</td><td align="left" valign="top">591</td><td align="left" 
valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="top">GPT-4.0: 76.6; GPT-3.5: 60.2</td></tr><tr><td align="left" valign="top">Takagi et al [<xref ref-type="bibr" rid="ref7">7</xref>]</td><td align="left" valign="top">Japan</td><td align="left" valign="top">GPT-4.0 and 3.5</td><td align="left" valign="top">Japanese National Medical Licensing Examination (2023)</td><td align="left" valign="top">254</td><td align="left" valign="top">GPT-4.0: Pass; GPT-3.5: Failed</td><td align="left" valign="top">GPT-4.0: 79.9; GPT-3.5: 50.8</td></tr><tr><td align="left" valign="top">Wang et al [<xref ref-type="bibr" rid="ref24">24</xref>]</td><td align="left" valign="top">China</td><td align="left" valign="top">GPT-4.0 and 3.5</td><td align="left" valign="top">The Chinese National Medical Licensing<break/>Examination</td><td align="left" valign="top">100</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">GPT-4.0: 84; GPT-3.5: 56</td></tr><tr><td align="left" valign="top">Cai et al [<xref ref-type="bibr" rid="ref26">26</xref>]</td><td align="left" valign="top">United States</td><td align="left" valign="top">GPT-4.0 and 3.5</td><td align="left" valign="top">Ophthalmology Board-Style Questions</td><td align="left" valign="top">250</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">GPT-4.0: 71.6; GPT-3.5: 58.8</td></tr><tr><td align="left" valign="top">Oh et al [<xref ref-type="bibr" rid="ref8">8</xref>]</td><td align="left" valign="top">Korea</td><td align="left" valign="top">GPT-4.0 and 3.5</td><td align="left" valign="top">Korean General Surgery Board Exams</td><td align="left" valign="top">280</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">GPT-4.0: 76.4; GPT-3.5: 46.8</td></tr><tr><td align="left" valign="top">Skalidis et al [<xref ref-type="bibr" rid="ref27">27</xref>]</td><td align="left" valign="top">Switzerland</td><td align="left" 
valign="top">GPT-3.5</td><td align="left" valign="top">The European Exam in Core Cardiology</td><td align="left" valign="top">488</td><td align="left" valign="top">Pass</td><td align="left" valign="top">GPT-3.5: 58.8</td></tr><tr><td align="left" valign="top">Saad et al [<xref ref-type="bibr" rid="ref28">28</xref>]</td><td align="left" valign="top">United Kingdom</td><td align="left" valign="top">GPT-4.0</td><td align="left" valign="top">The Orthopaedic FRCS Orth Part A exam</td><td align="left" valign="top">240</td><td align="left" valign="top">Failed</td><td align="left" valign="top">GPT-4.0: 67.5</td></tr><tr><td align="left" valign="top">Weng et al [<xref ref-type="bibr" rid="ref5">5</xref>]</td><td align="left" valign="top">China</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">Taiwan&#x2019;s 2022 Family Medicine Board Exam</td><td align="left" valign="top">125</td><td align="left" valign="top">Failed</td><td align="left" valign="top">GPT-3.5: 41.6</td></tr><tr><td align="left" valign="top">Kumah-Crystal et al [<xref ref-type="bibr" rid="ref29">29</xref>]</td><td align="left" valign="top">United States</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">The Clinical Informatics Board Examination</td><td align="left" valign="top">254</td><td align="left" valign="top">60%, Pass</td><td align="left" valign="top">GPT-3.5: 74</td></tr><tr><td align="left" valign="top">Mihalache et al [<xref ref-type="bibr" rid="ref30">30</xref>]</td><td align="left" valign="top">Canada</td><td align="left" valign="top">GPT-4.0</td><td align="left" valign="top">OphthoQuestions practice question bank for board certification examination</td><td align="left" valign="top">125</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">GPT-4.0: 84</td></tr><tr><td align="left" valign="top">Ali et al [<xref ref-type="bibr" rid="ref31">31</xref>]</td><td align="left" valign="top">United States</td><td align="left" 
valign="top">GPT-4.0 and 3.5</td><td align="left" valign="top">Self-Assessment Neurosurgery Examination Indications Examination</td><td align="left" valign="top">149</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">GPT-4.0: 82.6; GPT-3.5: 62.4</td></tr><tr><td align="left" valign="top">Oztermeli et al [<xref ref-type="bibr" rid="ref32">32</xref>]</td><td align="left" valign="top">Turkey</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">Turkey Medical Specialty Exams</td><td align="left" valign="top">1177</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">GPT-3.5: 54.3&#x2010;70.9</td></tr><tr><td align="left" valign="top">Fija&#x010D;ko et al [<xref ref-type="bibr" rid="ref4">4</xref>]</td><td align="left" valign="top">United States</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">American Heart Association Basic Life Support and Advanced Cardiovascular Life Support exams</td><td align="left" valign="top">126</td><td align="left" valign="top">84%, Failed</td><td align="left" valign="top">GPT-3.5: 64&#x2010;68.4</td></tr><tr><td align="left" valign="top">Su et al [<xref ref-type="bibr" rid="ref12">12</xref>]</td><td align="left" valign="top">China (Taiwan)</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">Taiwan&#x2019;s 2022 Nursing Licensing Exam</td><td align="left" valign="top">400</td><td align="left" valign="top">Pass</td><td align="left" valign="top">GPT-3.5: 80.75</td></tr><tr><td align="left" valign="top">Lewandowski et al [<xref ref-type="bibr" rid="ref33">33</xref>]</td><td align="left" valign="top">Poland</td><td align="left" valign="top">GPT-4.0 and 3.5</td><td align="left" valign="top">The Dermatology Specialty Certificate Examinations</td><td align="left" valign="top">120 &#x00D7; 3</td><td align="left" valign="top">GPT-4 Pass</td><td align="left" valign="top">GPT-4.0: &#x003E;70% better than GPT-3.5</td></tr><tr><td align="left" 
valign="top">Kung et al [<xref ref-type="bibr" rid="ref34">34</xref>]</td><td align="left" valign="top">United States</td><td align="left" valign="top">GPT-4.0 and 3.5</td><td align="left" valign="top">Orthopaedic In-Training Examination (2020&#x2010;2022)</td><td align="left" valign="top">360</td><td align="left" valign="top">GPT-4.0: &#x003E;PGY<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup>-5 level; GPT-3.5: PGY-1 level</td><td align="left" valign="top">GPT-4.0: 73.6; GPT-3.5: 54.3</td></tr><tr><td align="left" valign="top">Gencer and Aydin [<xref ref-type="bibr" rid="ref35">35</xref>]</td><td align="left" valign="top">Turkey</td><td align="left" valign="top">GPT-4.0 and 3.5</td><td align="left" valign="top">Turkish-language thoracic surgery exam</td><td align="left" valign="top">105</td><td align="left" valign="top">Surpass students&#x2019; scores</td><td align="left" valign="top">GPT-4.0: 93.3; GPT-3.5: 90.5</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Not available.</p></fn><fn id="table3fn2"><p><sup>b</sup>PGY: postgraduate year.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s4-3"><title>System Role for Accuracy</title><p>While it was expected that introducing a system role tailored for clinical subspecialties would enhance the reliability of ChatGPT&#x2019;s medical responses, this effect had not been systematically studied. Our research addressed this gap. Our findings revealed slight but noteworthy improvements in accuracy for both GPT-3.5 (1.3%&#x2010;4.5%) and GPT-4.0 (0.3%&#x2010;3.7%), although these gains were not statistically significant. This might imply that ChatGPT&#x2019;s inherent abilities are already robust enough to discern and address the medical inquiries without narrowing down its response scope.</p></sec><sec id="s4-4"><title>Response Variability</title><p>As an LLM, ChatGPT naturally exhibits variability in responses when the temperature hyperparameter is not zero. 
In this study, we adopted the default temperature of 0.7 to simulate real-world use conditions on the front end. Our results showed relatively high coherence between the initial and second responses for both GPT-4.0 and GPT-3.5. Therefore, the relatively high temperature of 0.7 is feasible and recommended when testing ChatGPT&#x2019;s performance on the CNMLE. Furthermore, our results highlighted that both model evolution and system roles contribute to ChatGPT&#x2019;s variability in scenarios such as the Chinese Medical Licensing Exams. This variability can be valuable for medical education, as ChatGPT not only provides answers to questions but also includes the rationale and references for its choices, which allows students to easily follow and comprehend [<xref ref-type="bibr" rid="ref16">16</xref>]. Repeatedly submitting questions allows groups or individuals to engage with the explanatory content generated by ChatGPT, which is particularly beneficial for open-ended case scenario discussions [<xref ref-type="bibr" rid="ref17">17</xref>].</p></sec><sec id="s4-5"><title>Subgroup and Multispecialty Analysis</title><p>Our subgroup analysis revealed that ChatGPT demonstrated consistent accuracy across different types of questions. This indicated that ChatGPT was capable of understanding and analyzing complex medical cases and scenarios (A2, A3/A4 questions), which can be challenging even for humans, and making correct decisions. This decision-making ability was equally proficient when addressing more straightforward, common-sense questions that did not require reasoning (A1, B1 questions).</p><p>In comparisons among unit subgroups representing different subspecialties, significant performance variations were observed in GPT-4.0 across CNMLE units. GPT-4.0 exhibited higher accuracy in units 2&#x2010;3, which predominantly featured questions from subspecialties such as cardiovascular, urinary, digestive, and respiratory systems. 
This was further corroborated by our multispecialty analysis results. GPT-4.0 achieved an accuracy rate of over 75% for these 4 subspecialties, surpassing its overall accuracy rate of 72.7%. Given that these 4 subspecialties accounted for a substantial proportion (34.5%) of all 15 subspecialties, such a disparity might have been advantageous. However, this disparity disappeared upon the introduction of system roles as prompts, with the overall accuracy of GPT-4.0 increasing to 78.6%. This might suggest that the appropriate use of system roles could compensate for individual subspecialty question accuracy, thereby enhancing the overall accuracy of ChatGPT.</p><p>Furthermore, we used CNMLE questions, divided into 15 medical subspecialties, to comprehensively assess the medical expertise of ChatGPT models. This approach provided a robust framework for evaluating model proficiency across a variety of medical fields. Notably, GPT-4.0 surpassed the 60% passing threshold in 14 of the 15 distinct clinical subspecialties, in contrast to GPT-3.5, which only passed in 7 out of 15 subspecialties. This highlighted the superiority of GPT-4.0 and its potential in medical applications.</p></sec><sec id="s4-6"><title>Generalizability of Findings</title><p>Previous studies [<xref ref-type="bibr" rid="ref7">7</xref>] often excluded table and image-based questions when evaluating ChatGPT&#x2019;s performance in medical exams. This approach limited the generalizability of these findings due to ChatGPT&#x2019;s lack of multimodal data processing. In contrast, our study, focusing on the CNMLE&#x2019;s multiple-choice format, which almost exclusively consists of nongraphical and nontabular questions, offers greater generalizability in real exam settings. 
Zhu et al [<xref ref-type="bibr" rid="ref17">17</xref>] suggested that ChatGPT, as a chatbot, had advantages in responding to open-ended questions, corresponding more closely with real-world scenarios where users sought medical support knowledge from ChatGPT. The potential of ChatGPT in exams with open-ended questions merits further exploration.</p></sec><sec id="s4-7"><title>Limitations</title><p>First, this study assessed ChatGPT&#x2019;s ability to answer questions from the Chinese version of the CNMLE. As ChatGPT is mainly trained on English data, Chinese questions could have underestimated its capabilities. Second, the CNMLE questions were multiple-choice, introducing the chance factor in selecting correct answers. Limited by paper length, we did not evaluate the logic behind ChatGPT&#x2019;s choices, although this aspect is critical and merits deeper investigation. Third, real-world medical questions often have open-ended, multiple, or uncertain answers. Therefore, the CNMLE may not represent the full scope of challenges ChatGPT might face in clinical settings. Consequently, GPT-4.0&#x2019;s success on the CNMLE may only indicate its partial competence in clinical decision-making. Future studies should broaden the range of question types to better assess ChatGPT&#x2019;s medical performance. Despite these limitations, we believe this study provided valuable insights into ChatGPT&#x2019;s capabilities in medicine.</p></sec><sec id="s4-8"><title>Conclusions</title><p>This study comprehensively evaluated the performance of GPT-4.0 and GPT-3.5 in the context of the CNMLE. Our findings indicated that GPT-4.0 not only met the CNMLE passing criteria but also significantly outperformed GPT-3.5 in key areas such as accuracy, consistency, and medical subspecialty expertise. Furthermore, the implementation of system roles served as a pivotal factor in enhancing the model&#x2019;s reliability and answer coherence. 
These results collectively underscored GPT-4.0&#x2019;s promising potential as a valuable tool for medical professionals, educators, and students, warranting further research and application in the medical field.</p></sec></sec></body><back><ack><p>This research was supported by the Medical Science and Technology Tackling Plan of Henan Province (LHGJ20210078).</p></ack><fn-group><fn fn-type="con"><p>SM and BL conceived the study and share the corresponding authorship. SM, QG, and WC collected all relevant data and assisted in results interpretation. SM designed the study, carried out data analysis, and drafted the manuscript. BL participated in the design and reviewed the manuscript. All authors contributed to the article and approved the submitted version.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">CNMLE</term><def><p>Chinese National Medical Licensing Examination</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb3">NMEC</term><def><p>National Medical Examination Center</p></def></def-item><def-item><term id="abb4">USMLE</term><def><p>United States Medical Licensing Examination</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gilson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Safranek</surname><given-names>CW</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>T</given-names> </name><etal/></person-group><article-title>How does ChatGPT perform on the United States Medical Licensing Examination (USMLE)? 
The implications of large language models for medical education and knowledge assessment</article-title><source>JMIR Med Educ</source><year>2023</year><month>02</month><day>8</day><volume>9</volume><fpage>e45312</fpage><pub-id pub-id-type="doi">10.2196/45312</pub-id><pub-id pub-id-type="medline">36753318</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kung</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Cheatham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Medenilla</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title><source>PLOS Digit Health</source><year>2023</year><month>02</month><volume>2</volume><issue>2</issue><fpage>e0000198</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id><pub-id pub-id-type="medline">36812645</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Seghier</surname><given-names>ML</given-names> </name></person-group><article-title>ChatGPT: not all languages are equal</article-title><source>Nature</source><year>2023</year><month>03</month><volume>615</volume><issue>7951</issue><fpage>216</fpage><pub-id pub-id-type="doi">10.1038/d41586-023-00680-3</pub-id><pub-id pub-id-type="medline">36882613</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fija&#x010D;ko</surname><given-names>N</given-names> </name><name name-style="western"><surname>Gosak</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>&#x0160;tiglic</surname><given-names>G</given-names> </name><name name-style="western"><surname>Picard</surname><given-names>CT</given-names> </name><name name-style="western"><surname>John Douma</surname><given-names>M</given-names> </name></person-group><article-title>Can ChatGPT pass the life support exams without entering the American Heart Association course?</article-title><source>Resuscitation</source><year>2023</year><month>04</month><volume>185</volume><fpage>109732</fpage><pub-id pub-id-type="doi">10.1016/j.resuscitation.2023.109732</pub-id><pub-id pub-id-type="medline">36775020</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Weng</surname><given-names>TL</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>YM</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Hwang</surname><given-names>SJ</given-names> </name></person-group><article-title>ChatGPT failed Taiwan&#x2019;s Family Medicine Board Exam</article-title><source>J Chin Med Assoc</source><year>2023</year><month>08</month><day>1</day><volume>86</volume><issue>8</issue><fpage>762</fpage><lpage>766</lpage><pub-id pub-id-type="doi">10.1097/JCMA.0000000000000946</pub-id><pub-id pub-id-type="medline">37294147</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Morreel</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mathysen</surname><given-names>D</given-names> </name><name name-style="western"><surname>Verhoeven</surname><given-names>V</given-names> 
</name></person-group><article-title>Aye, AI! ChatGPT passes multiple-choice family medicine exam</article-title><source>Med Teach</source><year>2023</year><month>06</month><day>3</day><volume>45</volume><issue>6</issue><fpage>665</fpage><lpage>666</lpage><pub-id pub-id-type="doi">10.1080/0142159X.2023.2187684</pub-id><pub-id pub-id-type="medline">36905610</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Takagi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Watari</surname><given-names>T</given-names> </name><name name-style="western"><surname>Erabi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sakaguchi</surname><given-names>K</given-names> </name></person-group><article-title>Performance of GPT-3.5 and GPT-4 on the Japanese Medical Licensing Examination: comparison study</article-title><source>JMIR Med Educ</source><year>2023</year><month>06</month><day>29</day><volume>9</volume><fpage>e48002</fpage><pub-id pub-id-type="doi">10.2196/48002</pub-id><pub-id pub-id-type="medline">37384388</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oh</surname><given-names>N</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>GS</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>WY</given-names> </name></person-group><article-title>ChatGPT goes to the operating room: evaluating GPT-4 performance and its potential in surgical education and training in the era of large language models</article-title><source>Ann Surg Treat Res</source><year>2023</year><month>05</month><volume>104</volume><issue>5</issue><fpage>269</fpage><lpage>273</lpage><pub-id 
pub-id-type="doi">10.4174/astr.2023.104.5.269</pub-id><pub-id pub-id-type="medline">37179699</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Currie</surname><given-names>G</given-names> </name><name name-style="western"><surname>Barry</surname><given-names>K</given-names> </name></person-group><article-title>ChatGPT in nuclear medicine education</article-title><source>J Nucl Med Technol</source><year>2023</year><month>09</month><volume>51</volume><issue>3</issue><fpage>247</fpage><lpage>254</lpage><pub-id pub-id-type="doi">10.2967/jnmt.123.265844</pub-id><pub-id pub-id-type="medline">37433676</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ali</surname><given-names>R</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>OY</given-names> </name><name name-style="western"><surname>Connolly</surname><given-names>ID</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT and GPT-4 on neurosurgery written board examinations</article-title><source>Neurosurgery</source><year>2023</year><month>12</month><day>1</day><volume>93</volume><issue>6</issue><fpage>1353</fpage><lpage>1365</lpage><pub-id pub-id-type="doi">10.1227/neu.0000000000002632</pub-id><pub-id pub-id-type="medline">37581444</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Antaki</surname><given-names>F</given-names> </name><name name-style="western"><surname>Touma</surname><given-names>S</given-names> </name><name name-style="western"><surname>Milad</surname><given-names>D</given-names> </name><name name-style="western"><surname>El-Khoury</surname><given-names>J</given-names> 
</name><name name-style="western"><surname>Duval</surname><given-names>R</given-names> </name></person-group><article-title>Evaluating the performance of ChatGPT in ophthalmology: an analysis of its successes and shortcomings</article-title><source>Ophthalmol Sci</source><year>2023</year><month>12</month><volume>3</volume><issue>4</issue><fpage>100324</fpage><pub-id pub-id-type="doi">10.1016/j.xops.2023.100324</pub-id><pub-id pub-id-type="medline">37334036</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Su</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Y</given-names> </name></person-group><article-title>Assessing question characteristic influences on ChatGPT's performance and response-explanation consistency: Insights from Taiwan's Nursing Licensing Exam</article-title><source>Int J Nurs Stud</source><year>2024</year><month>05</month><volume>153</volume><fpage>104717</fpage><pub-id pub-id-type="doi">10.1016/j.ijnurstu.2024.104717</pub-id><pub-id pub-id-type="medline">38401366</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ali</surname><given-names>K</given-names> </name><name name-style="western"><surname>Barhom</surname><given-names>N</given-names> </name><name name-style="western"><surname>Tamimi</surname><given-names>F</given-names> </name><name name-style="western"><surname>Duggal</surname><given-names>M</given-names> </name></person-group><article-title>ChatGPT-a double-edged sword for healthcare education? 
Implications for assessments of dental students</article-title><source>Eur J Dent Educ</source><year>2024</year><month>02</month><volume>28</volume><issue>1</issue><fpage>206</fpage><lpage>211</lpage><pub-id pub-id-type="doi">10.1111/eje.12937</pub-id><pub-id pub-id-type="medline">37550893</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Holmes</surname><given-names>J</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Evaluating large language models on a highly-specialized topic, radiation oncology physics</article-title><source>Front Oncol</source><year>2023</year><month>07</month><volume>13</volume><fpage>1219326</fpage><pub-id pub-id-type="doi">10.3389/fonc.2023.1219326</pub-id><pub-id pub-id-type="medline">37529688</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="web"><article-title>GPT-4</article-title><source>OpenAI</source><access-date>2023-11-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/research/gpt-4/">https://openai.com/research/gpt-4/</ext-link></comment></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>R</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name><name 
name-style="western"><surname>Liu</surname><given-names>N</given-names> </name></person-group><article-title>Large language models in health care: development, applications, and challenges</article-title><source>Health Care Sci</source><year>2023</year><month>08</month><volume>2</volume><issue>4</issue><fpage>255</fpage><lpage>263</lpage><pub-id pub-id-type="doi">10.1002/hcs2.61</pub-id><pub-id pub-id-type="medline">38939520</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Mou</surname><given-names>W</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>T</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>R</given-names> </name></person-group><article-title>ChatGPT can pass the AHA exams: open-ended questions outperform multiple-choice format</article-title><source>Resuscitation</source><year>2023</year><month>07</month><volume>188</volume><fpage>109783</fpage><pub-id pub-id-type="doi">10.1016/j.resuscitation.2023.109783</pub-id><pub-id pub-id-type="medline">37349064</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sarraju</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bruemmer</surname><given-names>D</given-names> </name><name name-style="western"><surname>Van Iterson</surname><given-names>E</given-names> </name><name name-style="western"><surname>Cho</surname><given-names>L</given-names> </name><name name-style="western"><surname>Rodriguez</surname><given-names>F</given-names> </name><name name-style="western"><surname>Laffin</surname><given-names>L</given-names> </name></person-group><article-title>Appropriateness of cardiovascular 
disease prevention recommendations obtained from a popular online chat-based artificial intelligence model</article-title><source>JAMA</source><year>2023</year><month>03</month><day>14</day><volume>329</volume><issue>10</issue><fpage>842</fpage><lpage>844</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.1044</pub-id><pub-id pub-id-type="medline">36735264</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Mou</surname><given-names>W</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>R</given-names> </name></person-group><article-title>Can the ChatGPT and other large language models with internet-connected database solve the questions and concerns of patient with prostate cancer and help democratize medical knowledge?</article-title><source>J Transl Med</source><year>2023</year><month>04</month><day>19</day><volume>21</volume><issue>1</issue><fpage>269</fpage><pub-id pub-id-type="doi">10.1186/s12967-023-04123-5</pub-id><pub-id pub-id-type="medline">37076876</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Strong</surname><given-names>E</given-names> </name><name name-style="western"><surname>DiGiammarino</surname><given-names>A</given-names> </name><name name-style="western"><surname>Weng</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Chatbot vs medical student performance on free-response clinical reasoning examinations</article-title><source>JAMA Intern Med</source><year>2023</year><month>09</month><day>1</day><volume>183</volume><issue>9</issue><fpage>1028</fpage><lpage>1030</lpage><pub-id pub-id-type="doi">10.1001/jamainternmed.2023.2909</pub-id><pub-id 
pub-id-type="medline">37459090</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="web"><article-title>National Clinical Practitioner Qualification Exam: past years&#x2019; real exam papers and detailed solutions [Article in Chinese]</article-title><source>JD</source><year>2022</year><access-date>2023-04-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://item.jd.com/30821733544.html/">https://item.jd.com/30821733544.html/</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="web"><article-title>Introduction of medical licensing examination</article-title><source>The Chinese National Medical Examination Center</source><access-date>2023-11-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www1.nmec.org.cn/Pages/ArticleInfo-13-10706.html/">https://www1.nmec.org.cn/Pages/ArticleInfo-13-10706.html/</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Gong</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>G</given-names> </name><etal/></person-group><article-title>ChatGPT performs on the Chinese National Medical Licensing Examination</article-title><source>J Med Syst</source><year>2023</year><month>08</month><day>15</day><volume>47</volume><issue>1</issue><fpage>86</fpage><pub-id pub-id-type="doi">10.1007/s10916-023-01961-0</pub-id><pub-id pub-id-type="medline">37581690</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>H</given-names> </name><name 
name-style="western"><surname>Wu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Dou</surname><given-names>Z</given-names> </name><name name-style="western"><surname>He</surname><given-names>L</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>L</given-names> </name></person-group><article-title>Performance and exploration of ChatGPT in medical examination, records and education in Chinese: pave the way for medical AI</article-title><source>Int J Med Inform</source><year>2023</year><month>09</month><volume>177</volume><fpage>105173</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2023.105173</pub-id><pub-id pub-id-type="medline">37549499</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guerra</surname><given-names>GA</given-names> </name><name name-style="western"><surname>Hofmann</surname><given-names>H</given-names> </name><name name-style="western"><surname>Sobhani</surname><given-names>S</given-names> </name><etal/></person-group><article-title>GPT-4 artificial intelligence model outperforms ChatGPT, medical students, and neurosurgery residents on neurosurgery written board-like questions</article-title><source>World Neurosurg</source><year>2023</year><month>11</month><volume>179</volume><fpage>e160</fpage><lpage>e165</lpage><pub-id pub-id-type="doi">10.1016/j.wneu.2023.08.042</pub-id><pub-id pub-id-type="medline">37597659</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cai</surname><given-names>LZ</given-names> </name><name name-style="western"><surname>Shaheen</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of 
generative large language models on ophthalmology board&#x2013;style questions</article-title><source>Am J Ophthalmol</source><year>2023</year><month>10</month><volume>254</volume><fpage>141</fpage><lpage>149</lpage><pub-id pub-id-type="doi">10.1016/j.ajo.2023.05.024</pub-id><pub-id pub-id-type="medline">37339728</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Skalidis</surname><given-names>I</given-names> </name><name name-style="western"><surname>Cagnina</surname><given-names>A</given-names> </name><name name-style="western"><surname>Luangphiphat</surname><given-names>W</given-names> </name><etal/></person-group><article-title>ChatGPT takes on the European exam in core cardiology: an artificial intelligence success story?</article-title><source>Eur Heart J Digit Health</source><year>2023</year><month>05</month><volume>4</volume><issue>3</issue><fpage>279</fpage><lpage>281</lpage><pub-id pub-id-type="doi">10.1093/ehjdh/ztad029</pub-id><pub-id pub-id-type="medline">37265864</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Saad</surname><given-names>A</given-names> </name><name name-style="western"><surname>Iyengar</surname><given-names>KP</given-names> </name><name name-style="western"><surname>Kurisunkal</surname><given-names>V</given-names> </name><name name-style="western"><surname>Botchu</surname><given-names>R</given-names> </name></person-group><article-title>Assessing ChatGPT&#x2019;s ability to pass the FRCS orthopaedic part A exam: a critical analysis</article-title><source>Surgeon</source><year>2023</year><month>10</month><volume>21</volume><issue>5</issue><fpage>263</fpage><lpage>266</lpage><pub-id pub-id-type="doi">10.1016/j.surge.2023.07.001</pub-id><pub-id 
pub-id-type="medline">37517980</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kumah-Crystal</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Mankowitz</surname><given-names>S</given-names> </name><name name-style="western"><surname>Embi</surname><given-names>P</given-names> </name><name name-style="western"><surname>Lehmann</surname><given-names>CU</given-names> </name></person-group><article-title>ChatGPT and the clinical informatics board examination: the end of unproctored maintenance of certification?</article-title><source>J Am Med Inform Assoc</source><year>2023</year><month>08</month><day>18</day><volume>30</volume><issue>9</issue><fpage>1558</fpage><lpage>1560</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocad104</pub-id><pub-id pub-id-type="medline">37335851</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mihalache</surname><given-names>A</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>RS</given-names> </name><name name-style="western"><surname>Popovic</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Muni</surname><given-names>RH</given-names> </name></person-group><article-title>Performance of an upgraded artificial intelligence chatbot for ophthalmic knowledge assessment</article-title><source>JAMA Ophthalmol</source><year>2023</year><month>08</month><day>1</day><volume>141</volume><issue>8</issue><fpage>798</fpage><lpage>800</lpage><pub-id pub-id-type="doi">10.1001/jamaophthalmol.2023.2754</pub-id><pub-id pub-id-type="medline">37440220</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Ali</surname><given-names>R</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>OY</given-names> </name><name name-style="western"><surname>Connolly</surname><given-names>ID</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT, GPT-4, and Google Bard on a neurosurgery oral boards preparation question bank</article-title><source>Neurosurgery</source><year>2023</year><month>11</month><day>1</day><volume>93</volume><issue>5</issue><fpage>1090</fpage><lpage>1098</lpage><pub-id pub-id-type="doi">10.1227/neu.0000000000002551</pub-id><pub-id pub-id-type="medline">37306460</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oztermeli</surname><given-names>AD</given-names> </name><name name-style="western"><surname>Oztermeli</surname><given-names>A</given-names> </name></person-group><article-title>ChatGPT performance in the medical specialty exam: an observational study</article-title><source>Medicine (Baltimore)</source><year>2023</year><month>08</month><day>11</day><volume>102</volume><issue>32</issue><fpage>e34673</fpage><pub-id pub-id-type="doi">10.1097/MD.0000000000034673</pub-id><pub-id pub-id-type="medline">37565917</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lewandowski</surname><given-names>M</given-names> </name><name name-style="western"><surname>&#x0141;ukowicz</surname><given-names>P</given-names> </name><name name-style="western"><surname>&#x015A;wietlik</surname><given-names>D</given-names> </name><name name-style="western"><surname>Bara&#x0144;ska-Rybak</surname><given-names>W</given-names> </name></person-group><article-title>An original study of ChatGPT-3.5 and ChatGPT-4 dermatological knowledge level based on the 
Specialty Certificate Examination in Dermatology</article-title><source>Clin Exp Dermatol</source><year>2024</year><month>06</month><day>25</day><volume>49</volume><issue>7</issue><fpage>686</fpage><lpage>691</lpage><pub-id pub-id-type="doi">10.1093/ced/llad255</pub-id><pub-id pub-id-type="medline">37540015</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kung</surname><given-names>JE</given-names> </name><name name-style="western"><surname>Marshall</surname><given-names>C</given-names> </name><name name-style="western"><surname>Gauthier</surname><given-names>C</given-names> </name><name name-style="western"><surname>Gonzalez</surname><given-names>TA</given-names> </name><name name-style="western"><surname>Jackson</surname><given-names>JB</given-names> </name></person-group><article-title>Evaluating ChatGPT performance on the orthopaedic in-training examination</article-title><source>JB JS Open Access</source><year>2023</year><month>09</month><day>8</day><volume>8</volume><issue>3</issue><fpage>e23.00056</fpage><pub-id pub-id-type="doi">10.2106/JBJS.OA.23.00056</pub-id><pub-id pub-id-type="medline">37693092</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gencer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Aydin</surname><given-names>S</given-names> </name></person-group><article-title>Can ChatGPT pass the thoracic surgery exam?</article-title><source>Am J Med Sci</source><year>2023</year><month>10</month><volume>366</volume><issue>4</issue><fpage>291</fpage><lpage>295</lpage><pub-id pub-id-type="doi">10.1016/j.amjms.2023.08.001</pub-id><pub-id pub-id-type="medline">37549788</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia 
Appendix 1</label><p>Question type explanations conveyed to ChatGPT via structured prompts.</p><media xlink:href="mededu_v10i1e52784_app1.docx" xlink:title="DOCX File, 12 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Detailed information for repeated responses and their &#x03BA; value under the ChatGPT default temperature of 0.7.</p><media xlink:href="mededu_v10i1e52784_app2.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material></app-group></back></article>