<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id><journal-id journal-id-type="publisher-id">mededu</journal-id><journal-id journal-id-type="index">20</journal-id><journal-title>JMIR Medical Education</journal-title><abbrev-journal-title>JMIR Med Educ</abbrev-journal-title><issn pub-type="epub">2369-3762</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v11i1e58897</article-id><article-id pub-id-type="doi">10.2196/58897</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Performance of ChatGPT-4 on Taiwanese Traditional Chinese Medicine Licensing Examinations: Cross-Sectional Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Tseng</surname><given-names>Liang-Wei</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lu</surname><given-names>Yi-Chin</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tseng</surname><given-names>Liang-Chi</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chen</surname><given-names>Yu-Chun</given-names></name><degrees>MD, MSc</degrees><xref ref-type="aff" 
rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Chen</surname><given-names>Hsing-Yu</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff7">7</xref></contrib></contrib-group><aff id="aff1"><institution>Division of Chinese Acupuncture and Traumatology, Center of Traditional Chinese Medicine, Chang Gung Memorial Hospital</institution><addr-line>Taoyuan</addr-line><country>Taiwan</country></aff><aff id="aff2"><institution>Division of Chinese Internal Medicine, Center for Traditional Chinese Medicine, Chang Gung Memorial Hospital</institution><addr-line>No. 123, Dinghu Rd, Gueishan Dist</addr-line><addr-line>Taoyuan</addr-line><country>Taiwan</country></aff><aff id="aff3"><institution>Google International LLC Taiwan Branch</institution><addr-line>Taipei</addr-line><country>Taiwan</country></aff><aff id="aff4"><institution>School of Medicine, Faculty of Medicine, National Yang-Ming Chiao Tung University</institution><addr-line>Taipei</addr-line><country>Taiwan</country></aff><aff id="aff5"><institution>Taipei Veterans General Hospital, Yuli Branch</institution><addr-line>Taipei</addr-line><country>Taiwan</country></aff><aff id="aff6"><institution>Institute of Hospital and Health Care Administration, National Yang-Ming Chiao Tung University</institution><addr-line>Taipei</addr-line><country>Taiwan</country></aff><aff id="aff7"><institution>School of Traditional Chinese Medicine, College of Medicine, Chang Gung University</institution><addr-line>Taoyuan</addr-line><country>Taiwan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Lesselroth</surname><given-names>Blake</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Shen</surname><given-names>Bairong</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Kikuchi</surname><given-names>Tomohiro</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Hou</surname><given-names>Zhen</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Hsing-Yu Chen, MD, PhD, Division of Chinese Internal Medicine, Center for Traditional Chinese Medicine, Chang Gung Memorial Hospital, No. 123, Dinghu Rd, Gueishan Dist, Taoyuan, 33378, Taiwan, 886 3-3196200 ext 2611, 886 3-3298995; <email>8705016@cgmh.org.tw</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>19</day><month>3</month><year>2025</year></pub-date><volume>11</volume><elocation-id>e58897</elocation-id><history><date date-type="received"><day>27</day><month>03</month><year>2024</year></date><date date-type="rev-recd"><day>27</day><month>07</month><year>2024</year></date><date date-type="accepted"><day>09</day><month>11</month><year>2024</year></date></history><copyright-statement>&#x00A9; Liang-Wei Tseng, Yi-Chin Lu, Liang-Chi Tseng, Yu-Chun Chen, Hsing-Yu Chen. Originally published in JMIR Medical Education (<ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org">https://mededu.jmir.org</ext-link>), 19.3.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org/">https://mededu.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mededu.jmir.org/2025/1/e58897"/><abstract><sec><title>Background</title><p>The integration of artificial intelligence (AI), notably ChatGPT, into medical education, has shown promising results in various medical fields. Nevertheless, its efficacy in traditional Chinese medicine (TCM) examinations remains understudied.</p></sec><sec><title>Objective</title><p>This study aims to (1) assess the performance of ChatGPT on the TCM licensing examination in Taiwan and (2) evaluate the model&#x2019;s explainability in answering TCM-related questions to determine its suitability as a TCM learning tool.</p></sec><sec sec-type="methods"><title>Methods</title><p>We used the GPT-4 model to respond to 480 questions from the 2022 TCM licensing examination. This study compared the performance of the model against that of licensed TCM doctors using 2 approaches, namely direct answer selection and provision of explanations before answer selection. The accuracy and consistency of AI-generated responses were analyzed. 
Moreover, a breakdown of question characteristics was performed based on the cognitive level, depth of knowledge, types of questions, vignette style, and polarity of questions.</p></sec><sec sec-type="results"><title>Results</title><p>ChatGPT achieved an overall accuracy of 43.9%, which was lower than that of 2 human participants (70% and 78.4%). The analysis did not reveal a significant correlation between the accuracy of the model and the characteristics of the questions. An in-depth examination indicated that errors predominantly resulted from a misunderstanding of TCM concepts (55.3%), emphasizing the limitations of the model with regard to its TCM knowledge base and reasoning capability.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Although ChatGPT shows promise as an educational tool, its current performance on TCM licensing examinations is lacking. This highlights the need for enhancing AI models with specialized TCM training and suggests a cautious approach to utilizing AI for TCM education. Future research should focus on model improvement and the development of tailored educational applications to support TCM learning.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>AI language understanding tools</kwd><kwd>ChatGPT</kwd><kwd>natural language processing</kwd><kwd>machine learning</kwd><kwd>Chinese medicine license exam</kwd><kwd>Chinese medical licensing examination</kwd><kwd>medical education</kwd><kwd>traditional Chinese medicine</kwd><kwd>large language model</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Traditional Chinese medicine (TCM), recognized as one of the most renowned traditional medical systems, boasts a history spanning thousands of years. 
In the modern era, TCM has evolved to form an integral part of the formal health care system in East Asian countries, particularly in China and Taiwan [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. TCM encompasses a wealth of theoretical knowledge and features unique diagnostic and treatment methods, such as acupuncture and herbal therapy. As a highly practical discipline, TCM learning traditionally relies on the accumulation of experience and the mentorship inherent in the master-apprentice system; hence, this education model may not be sufficiently reliable or comprehensive. However, with the emerging need for integrative medicine over time, TCM has been integrated into the modern medical education system. This integration has led to prominent changes in educational approaches. The incorporation of TCM into academic institutions resulted in the establishment of formal examination systems. For instance, in Taiwan, TCM practitioners must pass a biannual licensing examination, termed the National Senior Professional and Technical Examinations for Chinese Medicine Practitioners (hereinafter called the &#x201C;TCM licensing examinations&#x201D;), to practice as a licensed TCM doctor, similar to their Western medicine counterparts [<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>The advancements in technology and the development of artificial intelligence (AI) have begun to impact and challenge the medical field, with TCM being no exception [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. In the past year, significant progress has been made in AI language models, particularly those based on the generative pretrained transformer (GPT) architecture. ChatGPT, a conversational variant of the GPT model, has demonstrated its potential across various domains [<xref ref-type="bibr" rid="ref6">6</xref>]. 
Recognized for its foundational medical knowledge and conversational capabilities, ChatGPT is considered a valuable tool in medical education, aiding in the understanding and application of medical knowledge [<xref ref-type="bibr" rid="ref7">7</xref>], thereby facilitating student learning [<xref ref-type="bibr" rid="ref8">8</xref>]. However, its responses are not consistently reliable. Unlike humans who answer questions based on an understanding of the content, it generates replies by drawing from a vast database. Therefore, although it can produce human-like conversations and respond to inquiries, it cannot guarantee the accuracy of its responses [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>].</p><p>Discussions have emerged regarding the sufficiency of AI for clinical decision-making and basic medical consultation [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. In addition, to be a potential mentor for medical students, one benchmark is the ability of AI to pass national licensing examinations (the minimum standard for practicing physicians). Thus, the application of ChatGPT in medical examinations has opened a new research direction. Studies have shown that GPT models, especially GPT-4, can achieve commendable scores on a variety of standardized tests for multiple professions, such as physicians [<xref ref-type="bibr" rid="ref12">12</xref>-<xref ref-type="bibr" rid="ref14">14</xref>], pharmacists [<xref ref-type="bibr" rid="ref15">15</xref>], and nurses [<xref ref-type="bibr" rid="ref16">16</xref>]. 
This success in examination settings has sparked interest in the potential of ChatGPT as a self-learning tool, suggesting its use for examination preparation and knowledge enhancement [<xref ref-type="bibr" rid="ref17">17</xref>].</p><p>As previously mentioned, while TCM is a traditional medical system distinct from modern medicine, it has been integrated into modern medical education systems and subjected to formal examinations. The question arises: does ChatGPT possess the requisite knowledge level to assist TCM students in their learning? Only 1 study examined GPT&#x2019;s ability to answer TCM questions, but it focused on questions sourced from online TCM texts rather than formally recognized examination questions and utilized older GPT models (GPT-3 Turbo) [<xref ref-type="bibr" rid="ref18">18</xref>]. In contrast, a more rigorous study on traditional Korean medicine found that, due to the unique nature of traditional medicine, GPT models require specially optimized prompts, such as language-related adjustments, to pass examinations [<xref ref-type="bibr" rid="ref19">19</xref>]. However, considering the classical Chinese language barrier and different medical theories in TCM, whether GPT models would face challenges in TCM licensing examinations remains unexplored.</p><p>The aim of this study is to evaluate whether ChatGPT can accurately understand and respond to TCM questions by assessing its performance in simulated examination environments. By analyzing the accuracy of AI-generated answers, we sought to identify factors affecting their correctness. This study also aims to understand the consistency between AI-generated answers and their accompanying explanations, offering insights into the depth of understanding of this model. 
By analyzing the performance of ChatGPT in simulated TCM licensing examinations and comparing it with human performance, this study hopes to provide new insights and recommendations for innovation and development in TCM education.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p><xref ref-type="fig" rid="figure1">Figure 1</xref> shows the data processing flowchart of this study. The feasibility of using ChatGPT (GPT-4 model, with a knowledge cutoff date of September 2021), developed by OpenAI, with 2 different prompts on responding to the first National Senior Professional and Technical Examinations for Chinese Medicine Practitioners was assessed by comparing the responses of the model to those of licensed TCM resident doctors. A total of 480 questions from the 2022 examination were inputted into ChatGPT, and 2 different approaches were used to obtain responses from ChatGPT. The first step involved prompting AI to select the correct answer directly from the question options. The second step required ChatGPT to explain why each option was correct or incorrect before selecting the correct answer. For the second step, individual answers and explanations from ChatGPT were manually assessed for accuracy and consistency. Subsequently, accuracy was measured by comparing the AI-selected answers with the correct answers. Additionally, the performance of AI was benchmarked against that of human experts. Two individual TCM resident doctors took the same examination without preparation, and their answers were also evaluated for accuracy. Finally, consistency was evaluated by comparing explanations against a standard set of answers for logical coherence, and the reasons for inconsistency were also verified by the 2 TCM doctors.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Flowchart of this study. 
GPT: generative pretrained transformer; TCM: traditional Chinese medicine.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v11i1e58897_fig01.png"/></fig></sec><sec id="s2-2"><title>The TCM Licensing Examination in Taiwan</title><p>In Taiwan, TCM doctors are qualified through 2 stages of licensing examinations after graduation from their TCM course at the university. The contents and answers are freely downloadable after each examination from the following website [<xref ref-type="bibr" rid="ref20">20</xref>]. The examinations contain 2 stages corresponding to 10 subjects. The first stage consists of basic theory, including &#x9EC3;&#x5E1D;&#x5167;&#x7D93; (Huangdi Neijing), &#x96E3;&#x7D93; (Nanjing) (domain I), and basic pharmacology and formulation (domain II). The second stage consists of principles of diagnosis and treatment, including &#x50B7;&#x5BD2;&#x8AD6; (Shanghanlun) and &#x91D1;&#x5331;&#x8981;&#x7565; (Jinguiyaolue) (domain III), TCM internal medicine (domain IV), TCM gynecology and obstetrics (domain IV), TCM pediatrics (domain IV), TCM dermatology (domain V), TCM otorhinolaryngology (domain V, including questions regarding the specialty concerning ears, nose, and throat [ENT]) and ophthalmology (domain V), TCM traumatology (domain V), and acupuncture (domain VI). Each domain contains 80 multiple-choice questions with single answers. The full score of each domain is 100. The examination score is calculated by dividing the total score by the number of subjects. Only examinees obtaining average scores &#x2265;60 pass the examination. TCM students are eligible to take the first-stage examination when they have earned the requisite fourth-year university credits. 
Before the second-stage examination, TCM students must first pass the first-stage examination and graduate from the 7-year university course.</p></sec><sec id="s2-3"><title>Question Characteristics</title><p>A total of 5 factors were used to characterize the examination questions, including the cognitive level, depth of knowledge (DOK), type of questions, vignette style, and polarity of questions (Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). LWT and YCL independently reviewed and classified all questions according to the definitions of these 5 factors. In case of disagreement, HYC was consulted, and the disagreement was resolved by reaching a consensus among all authors. Bloom&#x2019;s taxonomy was modified to classify the questions into lower-order thinking skills (LOTS) and higher-order thinking skills (HOTS). LOTS include remembering, understanding, and applying knowledge to questions, while HOTS include further analyzing, evaluating, and creating after learning [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. For the DOK, 3 levels, ranging from low to high based on Webb&#x2019;s framework on science, were defined as recall, concept, and strategic thinking. Questions with higher levels of DOK indicate the recruitment of sophisticated thinking [<xref ref-type="bibr" rid="ref23">23</xref>]. Furthermore, the licensing examinations in Taiwan are presented as single-choice questions, adhering to the 1 stem, 4 choices policy. However, 2 types of questions were used to add variety to examination questions, including single-answer multiple-choice (SAMC) and single-answer, multiple-response multiple-choice (SAMRMC) questions. 
SAMC questions had only 1 most appropriate answer, while SAMRMC questions require the tester to choose the most appropriate answer composed of multiple correct options provided in each question (Table S2 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). Moreover, if the content of a question presents clinical scenarios, this question would be categorized as the clinical vignette type. This type of question typically aims to examine the ability of the tester to analyze the clinical conditions and corresponding actions. The polarity of a question depended on whether the question was positively or negatively framed. A &#x201C;positive-choice question&#x201D; solicits the correct or affirmative answer, whereas a &#x201C;negative choice question&#x201D; demands the identification of the incorrect or negative answer.</p></sec><sec id="s2-4"><title>Prompt for AI-Generated Answers</title><p>To enhance the precision and brevity of responses obtained from ChatGPT (GPT-4 model), we strategically added &#x201C;think step-by-step&#x201D; to our queries. This approach aimed to guide the model toward a methodical and sequential problem-solving process. Subsequently, by integrating the command &#x201C;but show me only the answer, do not explain it,&#x201D; we aimed to extract a more refined and consolidated answer, significantly boosting the response accuracy of the model. An example of a prompt with response is demonstrated in Table S3 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>. We created a collection of unique prompts derived from an equal number of questions in the question database, submitting them sequentially to the AI model. To solve the issue of memory retention between submissions, we used a specialized application designed to initiate separate application programming interface requests for each prompt. 
This approach guaranteed that each application programming interface interaction would be initiated separately. This ensures that the processing of each prompt and the generation of its answer were conducted in isolation, thereby preserving the integrity of the responses without interference from a prior response [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>].</p></sec><sec id="s2-5"><title>Prompt for Explanations Provided By AI Through Step-By-Step and Human-Curated Answers</title><p>Furthermore, to understand the thinking process of GPT and evaluate the accuracy of its interpretation of our inquiries, we prompted ChatGPT to &#x201C;explain each item&#x201D; for each question. This prompt directed the AI to furnish exhaustive explanations for each item [<xref ref-type="bibr" rid="ref26">26</xref>] (Table S4 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>). LWT and YCL reviewed all explanations to items and reached decisive responses based on AI-generated explanations. This process was termed &#x201C;human-curated responses.&#x201D; To authentically represent the logic of AI, we refrained from making any human amendments, even if the explanations provided by AI were incorrect. The answer would be marked as &#x201C;wrong&#x201D; if the AI-generated explanations were incorrect.</p></sec><sec id="s2-6"><title>Outcome Assessment</title><p>We evaluated the accuracy of answers generated by the GPT, those made by humans, and explanations provided by the GPT and curated by humans. This was achieved by calculating the ratio of accurate responses to the total number of questions and representing the results as a percentage. This measure of accuracy underwent comparative analysis across different attributes of the questions. 
The human-curated answers, which encapsulated the interpretation of questions by AI, were evaluated by LWT, YCL, and HYC, who reached a consensus to identify instances of misinterpretation of the question (GPT cannot understand the question and does not provide an answer), misunderstanding of concepts (GPT can understand the question, but lacks knowledge of the topic), and incorrect application of principles (the responses GPT provides are correct in general but fail to answer the question).</p></sec><sec id="s2-7"><title>Statistical Analysis</title><p>Proportions and percentages were used to present categorical data. A logistic regression approach was adopted to assess the effect of various attributes of questions on the correctness of responses generated by GPT-4. The cognitive complexity of the questions, their structural format, the inclusion of clinical vignettes, the overall polarity of questions, and the subjects were used as covariates in the logistic regression with univariable and multivariable models. The influence of each variable on the probability of the AI producing accurate answers was quantified using the adjusted odds ratio, accompanied by 95% CIs. Additionally, the &#x03BA; statistic was used to evaluate the agreement between responses generated by GPT and curated by humans. This represented the different viewpoints concerning the same explanation between GPT and humans. <italic>P</italic>&#x003C;.05 was used as the threshold for statistical significance. All statistical evaluation was performed utilizing Stata 17 (StataCorp LLC).</p></sec><sec id="s2-8"><title>Ethical Considerations</title><p>This study did not require ethical approval, as it analyzed data obtained from a publicly available database. The test questions and answers used were originally created and copyrighted by the Taiwan Ministry of Examination and made accessible for academic research purposes. 
The Ministry retains full copyright over the examination content and confirmed that this research adhered to copyright regulations without any infringement.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Question Characteristics</title><p>The examination encompassed a total of 480 questions spanning 10 specialties. Four image-related questions were excluded. Our findings indicated that most questions were HOTS, SAMC, negative-choice, and without a clinical vignette. According to Bloom&#x2019;s taxonomy of cognitive learning, the majority of questions across all subjects required HOTS (263/476, 55.3%; LOTS: 213/476, 44.7%). In particular, principles of diagnosis and treatment, TCM internal medicine, TCM dermatology, and TCM traumatology predominantly featured HOTS (58/80, 72.5%; 37/48, 77.1%; 13/19, 68.4%; and 17/20, 85%, respectively), while TCM pediatrics mainly involved LOTS (11/16, 68.8%). Within the LOTS category, &#x201C;remembering&#x201D; was the most common type (121/213, 56.8%), while &#x201C;analyzing&#x201D; dominated the HOTS category (255/263, 97%). In terms of Webb&#x2019;s DOK analysis of question types, the basic application of skill/concept represented the largest proportion (248/476, 52.1%), surpassing recall (85/476, 17.9%) and strategic thinking (143/476, 30%). A large portion of the questions were formatted as SAMC (439/476, 92.2%). 
Negative-choice questions comprised 62.2% (296/476) of the total, while 23.9% (180/476) of the questions included a clinical vignette (<xref ref-type="table" rid="table1">Table 1</xref>, <xref ref-type="fig" rid="figure2">Figures 2</xref> and <xref ref-type="fig" rid="figure3">3</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Characteristics of TCM<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> licensing examinations in Taiwan, 2022.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Cognitive level</td><td align="left" valign="bottom">Total (n=476)</td><td align="left" valign="bottom">Basic theory<break/>(n=80)</td><td align="left" valign="bottom">Basic pharmacology and formulation (n=80)</td><td align="left" valign="bottom">Principle of diagnosis and treatment (n=80)</td><td align="left" valign="bottom">TCM internal medicine (n=48)</td><td align="left" valign="bottom">TCM GYN/OBS<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> (n=16)</td><td align="left" valign="bottom">TCM pediatrics (n=16)</td><td align="left" valign="bottom">TCM dermatology (n=19)</td><td align="left" valign="bottom">TCM ENT, ophthalmology (n=37)</td><td align="left" valign="bottom">TCM traumatology (n=20)</td><td align="left" valign="bottom">TCM acupuncture (n=80)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="12"><bold>LOTS</bold><sup><xref ref-type="table-fn" rid="table1fn3"><bold>c</bold></xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Remembering</td><td align="left" valign="top">121 (25.4)</td><td align="left" valign="top">28 (35)</td><td align="left" valign="top">19 (23.8)</td><td align="left" valign="top">7 (8.8)</td><td align="left" valign="top">1 (2.1)</td><td align="left" valign="top">6 (37.5)</td><td align="left" valign="top">8 (50)</td><td align="left" 
valign="top">5 (26.3)</td><td align="left" valign="top">13 (35.1)</td><td align="left" valign="top">3 (15)</td><td align="left" valign="top">31 (38.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Understanding</td><td align="left" valign="top">41 (8.6)</td><td align="left" valign="top">10 (12.5)</td><td align="left" valign="top">10 (12.5)</td><td align="left" valign="top">7 (8.8)</td><td align="left" valign="top">2 (4.2)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">5 (13.5)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">7 (8.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Applying</td><td align="left" valign="top">51 (10.7)</td><td align="left" valign="top">7 (8.8)</td><td align="left" valign="top">9 (11.3)</td><td align="left" valign="top">8 (10)</td><td align="left" valign="top">8 (16.7)</td><td align="left" valign="top">3 (18.8)</td><td align="left" valign="top">3 (18.8)</td><td align="left" valign="top">1 (5.3)</td><td align="left" valign="top">4 (10.8)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">8 (10)</td></tr><tr><td align="left" valign="top" colspan="12"><bold>HOTS</bold><sup><xref ref-type="table-fn" rid="table1fn4"><bold>d</bold></xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Analyzing</td><td align="left" valign="top">255 (53.6)</td><td align="left" valign="top">34 (42.5)</td><td align="left" valign="top">40 (50)</td><td align="left" valign="top">54 (67.5)</td><td align="left" valign="top">37 (77.1)</td><td align="left" valign="top">7 (43.8)</td><td align="left" valign="top">5 (31.3)</td><td align="left" valign="top">13 (68.4)</td><td 
align="left" valign="top">15 (40.5)</td><td align="left" valign="top">17 (85)</td><td align="left" valign="top">33 (41.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Evaluating</td><td align="left" valign="top">8 (1.7)</td><td align="left" valign="top">1 (1.3)</td><td align="left" valign="top">2 (2.5)</td><td align="left" valign="top">4 (5.0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">1 (1.3)</td></tr><tr><td align="left" valign="top" colspan="12"><bold>Depth of knowledge</bold></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Recall</td><td align="left" valign="top">85 (17.9)</td><td align="left" valign="top">20 (25)</td><td align="left" valign="top">18 (22.5)</td><td align="left" valign="top">6 (7.5)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">4 (25)</td><td align="left" valign="top">5 (31.3)</td><td align="left" valign="top">3 (15.8)</td><td align="left" valign="top">3 (8.1)</td><td align="left" valign="top">4 (20)</td><td align="left" valign="top">22 (27.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Basic application of skill/concept</td><td align="left" valign="top">248 (52.1)</td><td align="left" valign="top">34 (42.5)</td><td align="left" valign="top">44 (55)</td><td align="left" valign="top">44 (55)</td><td align="left" valign="top">28 (58.3)</td><td align="left" valign="top">7 (43.8)</td><td align="left" valign="top">6 (37.5)</td><td align="left" valign="top">8 (42.1)</td><td align="left" valign="top">25 (67.6)</td><td align="left" valign="top">8 
(40)</td><td align="left" valign="top">44 (55)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Strategic thinking</td><td align="left" valign="top">143 (30)</td><td align="left" valign="top">26 (32.5)</td><td align="left" valign="top">18 (22.5)</td><td align="left" valign="top">30 (37.5)</td><td align="left" valign="top">20 (41.7)</td><td align="left" valign="top">5 (31.3)</td><td align="left" valign="top">5 (31.3)</td><td align="left" valign="top">8 (42.1)</td><td align="left" valign="top">9 (24.3)</td><td align="left" valign="top">8 (40)</td><td align="left" valign="top">14 (17.5)</td></tr><tr><td align="left" valign="top" colspan="12"><bold>Type of question options and choices</bold></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SAMC<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup></td><td align="left" valign="top">439 (92.2)</td><td align="left" valign="top">78 (97.5)</td><td align="left" valign="top">76 (95)</td><td align="left" valign="top">75 (93.8)</td><td align="left" valign="top">48 (100)</td><td align="left" valign="top">11 (68.8)</td><td align="left" valign="top">13 (81.3)</td><td align="left" valign="top">19 (100)</td><td align="left" valign="top">30 (81.1)</td><td align="left" valign="top">20 (100)</td><td align="left" valign="top">69 (86.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SAMRMC<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup></td><td align="left" valign="top">37 (7.8)</td><td align="left" valign="top">2 (2.5)</td><td align="left" valign="top">4 (5)</td><td align="left" valign="top">5 (6.3)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">5 (31.3)</td><td align="left" valign="top">3 (18.8)</td><td align="left" valign="top">0 (0)</td><td align="left" 
valign="top">7 (18.9)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">11 (13.8)</td></tr><tr><td align="left" valign="top" colspan="12"><bold>Clinical vignette</bold></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Without clinical vignette</td><td align="left" valign="top">362 (76.1)</td><td align="left" valign="top">63 (78.8)</td><td align="left" valign="top">63 (78.8)</td><td align="left" valign="top">61 (76.3)</td><td align="left" valign="top">22 (45.8)</td><td align="left" valign="top">7 (43.8)</td><td align="left" valign="top">14 (87.5)</td><td align="left" valign="top">13 (68.4)</td><td align="left" valign="top">29 (78.4)</td><td align="left" valign="top">16 (80)</td><td align="left" valign="top">74 (92.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>With clinical vignette</td><td align="left" valign="top">114 (23.9)</td><td align="left" valign="top">17 (21.3)</td><td align="left" valign="top">17 (21.3)</td><td align="left" valign="top">19 (23.8)</td><td align="left" valign="top">26 (54.2)</td><td align="left" valign="top">9 (56.3)</td><td align="left" valign="top">2 (12.5)</td><td align="left" valign="top">6 (31.6)</td><td align="left" valign="top">8 (21.6)</td><td align="left" valign="top">4 (20)</td><td align="left" valign="top">6 (7.5)</td></tr><tr><td align="left" valign="top" colspan="12"><bold>Polarity of question options</bold></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Positive</td><td align="left" valign="top">180 (37.8)</td><td align="left" valign="top">22 (27.5)</td><td align="left" valign="top">27 (33.8)</td><td align="left" valign="top">36 (45)</td><td align="left" valign="top">21 (43.8)</td><td align="left" valign="top">3 (18.8)</td><td align="left" valign="top">5 
(31.3)</td><td align="left" valign="top">9 (47.4)</td><td align="left" valign="top">8 (21.6)</td><td align="left" valign="top">13 (65)</td><td align="left" valign="top">36 (45)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Negative</td><td align="left" valign="top">296 (62.2)</td><td align="left" valign="top">58 (72.5)</td><td align="left" valign="top">53 (66.3)</td><td align="left" valign="top">44 (55)</td><td align="left" valign="top">27 (56.3)</td><td align="left" valign="top">13 (81.3)</td><td align="left" valign="top">11 (68.8)</td><td align="left" valign="top">10 (52.6)</td><td align="left" valign="top">29 (78.4)</td><td align="left" valign="top">7 (35)</td><td align="left" valign="top">44 (55)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>TCM: traditional Chinese medicine.</p></fn><fn id="table1fn2"><p><sup>b</sup>GYN/OBS: gynecology/obstetrics.</p></fn><fn id="table1fn3"><p><sup>c</sup>LOTS: lower-order thinking skills.</p></fn><fn id="table1fn4"><p><sup>d</sup>HOTS: higher-order thinking skills.</p></fn><fn id="table1fn5"><p><sup>e</sup>SAMC: single-answer multiple-choice.</p></fn><fn id="table1fn6"><p><sup>f</sup>SAMRMC: single-answer, multiple-response multiple-choice.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Distribution of subjects in TCM licensing examinations. The detailed numbers and proportion of each subject&#x2019;s question types can be seen in <xref ref-type="table" rid="table1">Table 1</xref>. 
ENT: ears, nose, and throat; GYN/OBS: gynecology/obstetrics; HOTS: higher-order thinking skills; LOTS: lower-order thinking skills; SAMC: single-answer multiple-choice; SAMRMC: single-answer, multiple-response multiple-choice; TCM: traditional Chinese medicine.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v11i1e58897_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Analysis of question types according to Bloom&#x2019;s cognitive level in TCM licensing examinations. ENT: ears, nose, and throat; TCM: traditional Chinese medicine.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v11i1e58897_fig03.png"/></fig></sec><sec id="s3-2"><title>GPT-4 Model Performance and Accuracy Across Different Question Characteristics</title><p>We observed that the performance of the GPT-4 model was inferior to that of humans and did not demonstrate significant variation across different categories of examination questions. The GPT-4 model demonstrated an overall accuracy of only 43.9% (209/476). In comparison, 2 human evaluators achieved accuracy rates of 70% (333/476) and 78.4% (373/476), respectively (<xref ref-type="table" rid="table2">Table 2</xref>). The performance of ChatGPT across various variables is shown in <xref ref-type="table" rid="table3">Table 3</xref>. The accuracy of AI-generated answers did not show a significant correlation with the characteristics of the questions, regardless of the classification method used (<xref ref-type="fig" rid="figure4">Figure 4</xref>). The GPT-4 model demonstrated a performance close to that of humans in TCM dermatology and TCM traumatology. The accuracy of AI-generated answers varied among the test subjects, ranging from 31.3% in TCM pediatrics to 73.7% in TCM dermatology. 
Notably, only TCM internal medicine (adjusted odds ratio [aOR] 3.07, 95% CI 1.41&#x2010;6.68; <italic>P</italic>=.005), TCM dermatology (aOR 5.11, 95% CI 1.65&#x2010;15.85; <italic>P</italic>=.005), and TCM acupuncture (aOR 2.14, 95% CI 1.12&#x2010;4.11; <italic>P</italic>=.02) showed statistically significant better performance (<xref ref-type="fig" rid="figure4">Figure 4</xref>). On the other hand, GPT had a higher, but not statistically significant, accuracy rate for questions categorized as LOTS (96/213, 45.1%), SAMC (197/439, 44.9%), strategic thinking (66/143, 46.2%), with clinical vignette (52/114, 45.6%), and positive-choice (85/180, 47.2%).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Accuracy rates of testers and ChatGPT-4 for TCM<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> licensing examinations.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Number of questions</td><td align="left" valign="bottom">Number of correct responses</td><td align="left" valign="bottom">Accuracy, %</td></tr></thead><tbody><tr><td align="left" valign="top">Human-made 1</td><td align="char" char="." valign="top">476</td><td align="char" char="." valign="top">333</td><td align="char" char="." valign="top">70</td></tr><tr><td align="left" valign="top">Human-made 2</td><td align="char" char="." valign="top">476</td><td align="char" char="." valign="top">373</td><td align="char" char="." valign="top">78.4</td></tr><tr><td align="left" valign="top">ChatGPT-4<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="char" char="." valign="top">476</td><td align="char" char="." valign="top">209</td><td align="char" char="." valign="top">43.9</td></tr><tr><td align="left" valign="top">Human-curated answer 1</td><td align="char" char="." valign="top">476</td><td align="char" char="." valign="top">192</td><td align="char" char="." 
valign="top">40.3</td></tr><tr><td align="left" valign="top">Human-curated answer 2</td><td align="char" char="." valign="top">476</td><td align="char" char="." valign="top">186</td><td align="char" char="." valign="top">39.1</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>TCM: traditional Chinese medicine.</p></fn><fn id="table2fn2"><p><sup>b</sup>ChatGPT did not show answers to 7 questions although an explanation was provided.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Accuracy rates of testers and ChatGPT-4 across different types and subjects of questions.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="5">Accuracy, %</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Human-made 1</td><td align="left" valign="bottom">Human-made 2</td><td align="left" valign="bottom">ChatGPT-4</td><td align="left" valign="bottom">Human-curated 1</td><td align="left" valign="bottom">Human-curated 2</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="6"><bold>Bloom&#x2019;s cognitive level</bold></td></tr><tr><td align="left" valign="top">&#x2003;LOTS<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="top">150 (70.4)</td><td align="left" valign="top">164 (77)</td><td align="left" valign="top">96 (45.1)</td><td align="left" valign="top">78 (36.6)</td><td align="left" valign="top">75 (35.2)</td></tr><tr><td align="left" valign="top">&#x2003;HOTS<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">183 (69.6)</td><td align="left" valign="top">209 (79.5)</td><td align="left" valign="top">113 (43)</td><td align="left" valign="top">114 (43.3)</td><td align="left" valign="top">111 (42.2)</td></tr><tr><td align="left" valign="top" colspan="6"><bold>Depth of 
knowledge</bold></td></tr><tr><td align="left" valign="top">&#x2003;Recall</td><td align="left" valign="top">57 (67.1)</td><td align="left" valign="top">65 (76.5)</td><td align="left" valign="top">34 (40)</td><td align="left" valign="top">27 (31.8)</td><td align="left" valign="top">22 (25.9)</td></tr><tr><td align="left" valign="top">&#x2003;Basic application of skill/concept</td><td align="left" valign="top">172 (69.4)</td><td align="left" valign="top">193 (77.8)</td><td align="left" valign="top">109 (44)</td><td align="left" valign="top">103 (41.5)</td><td align="left" valign="top">102 (41.1)</td></tr><tr><td align="left" valign="top">&#x2003;Strategic thinking</td><td align="left" valign="top">104 (72.7)</td><td align="left" valign="top">115 (80.4)</td><td align="left" valign="top">66 (46.2)</td><td align="left" valign="top">62 (43.4)</td><td align="left" valign="top">62 (43.4)</td></tr><tr><td align="left" valign="top" colspan="6"><bold>Type of questions</bold></td></tr><tr><td align="left" valign="top">&#x2003;SAMC<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">312 (71.1)</td><td align="left" valign="top">346 (78.8)</td><td align="left" valign="top">197 (44.9)</td><td align="left" valign="top">180 (41)</td><td align="left" valign="top">176 (40.1)</td></tr><tr><td align="left" valign="top">&#x2003;SAMRMC<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td><td align="left" valign="top">21 (56.8)</td><td align="left" valign="top">27 (73)</td><td align="left" valign="top">12 (32.4)</td><td align="left" valign="top">12 (32.4)</td><td align="left" valign="top">10 (27)</td></tr><tr><td align="left" valign="top" colspan="6"><bold>Vignette style question</bold></td></tr><tr><td align="left" valign="top">&#x2003;Without clinical vignette</td><td align="left" valign="top">248 (68.5)</td><td align="left" valign="top">283 (78.2)</td><td align="left" valign="top">157 (43.4)</td><td align="left" valign="top">143 
(39.5)</td><td align="left" valign="top">137 (37.8)</td></tr><tr><td align="left" valign="top">&#x2003;With clinical vignette</td><td align="left" valign="top">85 (74.6)</td><td align="left" valign="top">90 (78.9)</td><td align="left" valign="top">52 (45.6)</td><td align="left" valign="top">49 (43)</td><td align="left" valign="top">49 (43)</td></tr><tr><td align="left" valign="top" colspan="6"><bold>Polarity of question</bold></td></tr><tr><td align="left" valign="top">&#x2003;Positive</td><td align="left" valign="top">129 (71.7)</td><td align="left" valign="top">142 (78.9)</td><td align="left" valign="top">85 (47.2)</td><td align="left" valign="top">78 (43.3)</td><td align="left" valign="top">76 (42.2)</td></tr><tr><td align="left" valign="top">&#x2003;Negative</td><td align="left" valign="top">204 (68.9)</td><td align="left" valign="top">231 (78)</td><td align="left" valign="top">124 (41.9)</td><td align="left" valign="top">114 (38.5)</td><td align="left" valign="top">110 (37.2)</td></tr><tr><td align="left" valign="top" colspan="6"><bold>Subjects</bold></td></tr><tr><td align="left" valign="top">&#x2003;Basic theory</td><td align="left" valign="top">51 (63.7)</td><td align="left" valign="top">63 (78.8)</td><td align="left" valign="top">29 (36.3)</td><td align="left" valign="top">29 (36.3)</td><td align="left" valign="top">28 (35)</td></tr><tr><td align="left" valign="top">&#x2003;Basic pharmacology and formulation</td><td align="left" valign="top">63 (78.8)</td><td align="left" valign="top">66 (82.5)</td><td align="left" valign="top">30 (37.5)</td><td align="left" valign="top">32 (40)</td><td align="left" valign="top">28 (35)</td></tr><tr><td align="left" valign="top">&#x2003;Principle of diagnosis and treatment</td><td align="left" valign="top">57 (71.3)</td><td align="left" valign="top">58 (72.5)</td><td align="left" valign="top">29 (36.3)</td><td align="left" valign="top">29 (36.3)</td><td align="left" valign="top">29 (36.3)</td></tr><tr><td align="left" 
valign="top">&#x2003;TCM<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup> internal medicine</td><td align="left" valign="top">41 (85.4)</td><td align="left" valign="top">44 (91.7)</td><td align="left" valign="top">30 (62.5)</td><td align="left" valign="top">24 (50)</td><td align="left" valign="top">24 (50)</td></tr><tr><td align="left" valign="top">&#x2003;TCM gynecology and obstetrics</td><td align="left" valign="top">10 (62.5)</td><td align="left" valign="top">12 (75)</td><td align="left" valign="top">8 (50)</td><td align="left" valign="top">4 (25)</td><td align="left" valign="top">4 (25)</td></tr><tr><td align="left" valign="top">&#x2003;TCM pediatrics</td><td align="left" valign="top">11 (68.8)</td><td align="left" valign="top">13 (81.3)</td><td align="left" valign="top">5 (31.3)</td><td align="left" valign="top">7 (43.8)</td><td align="left" valign="top">7 (43.8)</td></tr><tr><td align="left" valign="top">&#x2003;TCM dermatology</td><td align="left" valign="top">14 (73.7)</td><td align="left" valign="top">17 (89.5)</td><td align="left" valign="top">14 (73.7)</td><td align="left" valign="top">12 (63.2)</td><td align="left" valign="top">12 (63.2)</td></tr><tr><td align="left" valign="top">&#x2003;TCM ENT<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup>, ophthalmology</td><td align="left" valign="top">21 (56.8)</td><td align="left" valign="top">26 (70.3)</td><td align="left" valign="top">12 (32.4)</td><td align="left" valign="top">13 (35.1)</td><td align="left" valign="top">13 (35.1)</td></tr><tr><td align="left" valign="top">&#x2003;TCM traumatology</td><td align="left" valign="top">9 (45)</td><td align="left" valign="top">14 (70)</td><td align="left" valign="top">9 (45)</td><td align="left" valign="top">8 (40)</td><td align="left" valign="top">8 (40)</td></tr><tr><td align="left" valign="top">&#x2003;TCM acupuncture</td><td align="left" valign="top">56 (70)</td><td align="left" valign="top">60 (75)</td><td align="left" valign="top">43 
(53.8)</td><td align="left" valign="top">34 (42.5)</td><td align="left" valign="top">33 (41.3)</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>LOTS: lower-order thinking skills.</p></fn><fn id="table3fn2"><p><sup>b</sup>HOTS: higher-order thinking skills.</p></fn><fn id="table3fn3"><p><sup>c</sup>SAMC: single-answer multiple-choice.</p></fn><fn id="table3fn4"><p><sup>d</sup>SAMRMC: single-answer, multiple-response multiple-choice.</p></fn><fn id="table3fn5"><p><sup>e</sup>TCM: traditional Chinese medicine.</p></fn><fn id="table3fn6"><p><sup>f</sup>ENT: ears, nose, and throat.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Factors associated with correct answers provided by ChatGPT-4. aOR: adjusted odds ratio; ENT: ears, nose, and throat; GYN/OBS: gynecology/obstetrics; HOTS: higher-order thinking skills; LOTS: lower-order thinking skills; SAMC: single-answer multiple-choice; SAMRMC: single-answer, multiple-response multiple-choice; TCM: traditional Chinese medicine.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v11i1e58897_fig04.png"/></fig></sec><sec id="s3-3"><title>Consistency Between AI-Generated Answers and Human-Curated Answers and Analysis of Incorrect Responses Provided by the GPT-4 Model</title><p>The consistency between AI-generated and human-curated results was low (&#x03BA;=0.504; <xref ref-type="fig" rid="figure5">Figure 5</xref>). After human review, the accuracy of the human-curated answers showed an overall trend of slight decrease, except for some minor increases in basic pharmacology and formulation, TCM pediatrics, and TCM otorhinolaryngology and ophthalmology. 
The accuracies for the remaining specialties were slightly lower, ranging from 43.9% to 40.3% (<xref ref-type="table" rid="table2">Table 2</xref>, <xref ref-type="fig" rid="figure5">Figures 5</xref> and <xref ref-type="fig" rid="figure6">6</xref>). For human reviewer 1, discrepancies were observed between AI-generated responses and those reviewed by humans, with 23.96% (115 of 480 questions) of the answers provided by AI conflicting with its own explanations. For 33% of correctly answered questions (69 of 209 questions), the AI provided an incorrect explanation, indicating a scenario of &#x201C;correct answer, incorrect explanation.&#x201D; Conversely, for 17% of incorrectly answered questions (46 of 267 questions), the AI provided a correct explanation, suggesting a case of &#x201C;incorrect answer, correct explanation.&#x201D; This reduced the overall accuracy of the AI model to 43.9%.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Accuracy rates of humans and ChatGPT-4 for TCM licensing examinations. The passing standard is an average score of 60. With 476 questions, the threshold is at least 286 correct answers (red dashed line). AI: artificial intelligence; TCM: traditional Chinese medicine.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v11i1e58897_fig05.png"/></fig><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Performance of humans and ChatGPT-4 across various subjects. ENT: ears, nose, and throat; GYN/OBS: gynecology/obstetrics; TCM: traditional Chinese medicine.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v11i1e58897_fig06.png"/></fig><p>We further analyzed the reasons responsible for the incorrect answers provided by the GPT. 
For this purpose, we categorized the potential reasons for these errors into 3 types: misinterpretation of the question (failing to understand the question), misunderstanding of concepts (lacking knowledge of the topic), and incorrect application of principles (the content is correct, but it does not answer the question). The results revealed that most of the errors (263/476, 55.3%) were attributed to the misunderstanding of concepts (<xref ref-type="table" rid="table4">Table 4</xref>, <xref ref-type="fig" rid="figure7">Figure 7</xref>). However, a closer examination of the different characteristics of the questions indicated that misunderstanding of concepts was more common in LOTS, recall, and SAMRMC compared to their counterparts. The second most common cause of error was incorrect application of principles (20/476, 4.2%), followed by misinterpretation of questions (7/476, 1.5%).</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Reasons responsible for incorrect artificial intelligence&#x2013;generated responses (human-curated).</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Correct (n=186)</td><td align="left" valign="bottom">Misinterpretation of the question (n=7)</td><td align="left" valign="bottom">Misunderstanding of concepts (n=263)</td><td align="left" valign="bottom">Incorrect application of principles (n=20)</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="5"><bold>Bloom&#x2019;s cognitive level</bold></td><td align="char" char="." valign="top">.25</td></tr><tr><td align="left" valign="top">&#x2003;LOTS<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="char" char="." valign="top">75 (40.3)</td><td align="char" char="." valign="top">5 (71.4)</td><td align="char" char="." valign="top">124 (47.1)</td><td align="char" char="." 
valign="top">9 (45)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;HOTS<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="char" char="." valign="top">111 (59.7)</td><td align="char" char="." valign="top">2 (28.6)</td><td align="char" char="." valign="top">139 (52.9)</td><td align="char" char="." valign="top">11 (55)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="5"><bold>Depth of knowledge</bold></td><td align="char" char="." valign="top">.06</td></tr><tr><td align="left" valign="top">&#x2003;Recall</td><td align="char" char="." valign="top">22 (11.8)</td><td align="char" char="." valign="top">0 (0)</td><td align="char" char="." valign="top">60 (22.8)</td><td align="char" char="." valign="top">3 (15)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Basic application of skill/concept</td><td align="char" char="." valign="top">102 (54.8)</td><td align="char" char="." valign="top">5 (71.4)</td><td align="char" char="." valign="top">132 (50.2)</td><td align="char" char="." valign="top">9 (45)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Strategic thinking</td><td align="char" char="." valign="top">62 (33.3)</td><td align="char" char="." valign="top">2 (28.6)</td><td align="char" char="." valign="top">71 (27)</td><td align="char" char="." valign="top">8 (40)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="5"><bold>Type of questions</bold></td><td align="char" char="." valign="top">.16</td></tr><tr><td align="left" valign="top">&#x2003;SAMC<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="char" char="." valign="top">176 (94.6)</td><td align="char" char="." valign="top">6 (85.7)</td><td align="char" char="." valign="top">237 (90.1)</td><td align="char" char="." 
valign="top">20 (100)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;SAMRMC<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td><td align="char" char="." valign="top">10 (5.4)</td><td align="char" char="." valign="top">1 (14.3)</td><td align="char" char="." valign="top">26 (9.9)</td><td align="char" char="." valign="top">0 (0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="5"><bold>Vignette style question</bold></td><td align="char" char="." valign="top">.39</td></tr><tr><td align="left" valign="top">&#x2003;Without clinical vignette</td><td align="char" char="." valign="top">137 (73.7)</td><td align="char" char="." valign="top">6 (85.7)</td><td align="char" char="." valign="top">206 (78.3)</td><td align="char" char="." valign="top">13 (65)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;With clinical vignette</td><td align="char" char="." valign="top">49 (26.3)</td><td align="char" char="." valign="top">1 (14.3)</td><td align="char" char="." valign="top">57 (21.7)</td><td align="char" char="." valign="top">7 (35)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="5"><bold>Polarity of question</bold></td><td align="char" char="." valign="top">.28</td></tr><tr><td align="left" valign="top">&#x2003;Positive</td><td align="char" char="." valign="top">76 (40.9)</td><td align="char" char="." valign="top">1 (14.3)</td><td align="char" char="." valign="top">98 (37.3)</td><td align="char" char="." valign="top">5 (25)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Negative</td><td align="char" char="." valign="top">110 (59.1)</td><td align="char" char="." valign="top">6 (85.7)</td><td align="char" char="." valign="top">165 (62.7)</td><td align="char" char="." 
valign="top">15 (75)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="5"><bold>Subjects</bold></td><td align="char" char="." valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">&#x2003;Basic theory</td><td align="char" char="." valign="top">28 (15.1)</td><td align="char" char="." valign="top">0 (0)</td><td align="char" char="." valign="top">35 (13.3)</td><td align="char" char="." valign="top">17 (85)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Basic pharmacology and formulation</td><td align="char" char="." valign="top">28 (15.1)</td><td align="char" char="." valign="top">1 (14.3)</td><td align="char" char="." valign="top">49 (18.6)</td><td align="char" char="." valign="top">2 (10)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Principle of diagnosis and treatment</td><td align="char" char="." valign="top">29 (15.6)</td><td align="char" char="." valign="top">3 (42.9)</td><td align="char" char="." valign="top">48 (18.3)</td><td align="char" char="." valign="top">0 (0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;TCM<sup><xref ref-type="table-fn" rid="table4fn5">e</xref></sup> internal medicine</td><td align="char" char="." valign="top">24 (12.9)</td><td align="char" char="." valign="top">0 (0)</td><td align="char" char="." valign="top">24 (9.1)</td><td align="char" char="." valign="top">0 (0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;TCM gynecology and obstetrics</td><td align="char" char="." valign="top">4 (2.2)</td><td align="char" char="." valign="top">0 (0)</td><td align="char" char="." valign="top">12 (4.6)</td><td align="char" char="." valign="top">0 (0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;TCM pediatrics</td><td align="char" char="." valign="top">7 (3.8)</td><td align="char" char="." 
valign="top">1 (14.3)</td><td align="char" char="." valign="top">8 (3.0)</td><td align="char" char="." valign="top">0 (0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;TCM dermatology</td><td align="char" char="." valign="top">12 (6.5)</td><td align="char" char="." valign="top">0 (0)</td><td align="char" char="." valign="top">7 (2.7)</td><td align="char" char="." valign="top">0 (0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;TCM ENT<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup>, ophthalmology</td><td align="char" char="." valign="top">13 (7)</td><td align="char" char="." valign="top">1 (14.3)</td><td align="char" char="." valign="top">22 (8.4)</td><td align="char" char="." valign="top">1 (5)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;TCM traumatology</td><td align="char" char="." valign="top">8 (4.3)</td><td align="char" char="." valign="top">0 (0)</td><td align="char" char="." valign="top">12 (4.6)</td><td align="char" char="." valign="top">0 (0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;TCM acupuncture</td><td align="char" char="." valign="top">33 (17.7)</td><td align="char" char="." valign="top">1 (14.3)</td><td align="char" char="." valign="top">46 (17.5)</td><td align="char" char="." 
valign="top">0 (0)</td><td align="left" valign="top"/></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>LOTS: lower-order thinking skills.</p></fn><fn id="table4fn2"><p><sup>b</sup>HOTS: higher-order thinking skills.</p></fn><fn id="table4fn3"><p><sup>c</sup>SAMC: single-answer multiple-choice.</p></fn><fn id="table4fn4"><p><sup>d</sup>SAMRMC: single-answer, multiple-response multiple-choice.</p></fn><fn id="table4fn5"><p><sup>e</sup>TCM: traditional Chinese medicine.</p></fn><fn id="table4fn6"><p><sup>f</sup>ENT: ears, nose, and throat.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure7"><label>Figure 7.</label><caption><p>Distribution of reasons for incorrect answers provided by ChatGPT-4. ENT: ears, nose, and throat; GYN/OBS: gynecology/obstetrics; HOTS: higher-order thinking skills; LOTS: lower-order thinking skills; SAMC: single-answer multiple-choice; SAMRMC: single-answer, multiple-response multiple-choice; TCM: traditional Chinese medicine.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v11i1e58897_fig07.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Performance of ChatGPT in Medical Examinations</title><p>This is the first study to test the capabilities of ChatGPT in TCM examinations. ChatGPT has undergone rigorous testing for its proficiency in medical examinations. Nonetheless, its effectiveness in TCM licensing examinations remains unexplored. Hence, this study fills a research void by examining the capability of an advanced language model like ChatGPT in the context of TCM. Generally, most studies indicate ChatGPT can meet the medical examination pass standards. 
For example, ChatGPT-3.5 scored around the pass mark on the United States Medical Licensing Examination [<xref ref-type="bibr" rid="ref14">14</xref>] and exhibited strong performance in specialties such as radiation oncology and neurosurgery [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>]. GPT-4 surpassed 70% in its score for UK medical licensing examinations [<xref ref-type="bibr" rid="ref12">12</xref>], and its competency extends to examinations in different languages. For example, GPT-3.5 typically scored around the passing mark on the Japanese nursing examinations [<xref ref-type="bibr" rid="ref16">16</xref>] and Korean medical student parasitology examinations [<xref ref-type="bibr" rid="ref29">29</xref>]. Although GPT-3.5 Turbo was not yet capable of passing, GPT-4 passed the medical licensing examinations of China [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>] and achieved 88.6% accuracy in the equivalent examinations of Saudi Arabia [<xref ref-type="bibr" rid="ref32">32</xref>]. Interestingly, it even outperformed human residents in the residency training examinations of Japan [<xref ref-type="bibr" rid="ref33">33</xref>].</p><p>Published research has identified 2 trends in this setting. First, GPT-4 surpasses GPT-3.5 in identical medical examinations, as demonstrated in medical student finals in Poland [<xref ref-type="bibr" rid="ref34">34</xref>] and the medical licensing examinations of Peru [<xref ref-type="bibr" rid="ref35">35</xref>]. A systematic review and meta-analysis of ChatGPT use in medical licensing examinations worldwide observed similar results [<xref ref-type="bibr" rid="ref36">36</xref>]. Second, ChatGPT models showed higher accuracy when answering questions translated into English compared with the original language [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref37">37</xref>]. 
In Taiwan, traditional Chinese is the language used for medical licensing examinations. Despite this disadvantage, ChatGPT performed near the pass threshold for the nursing [<xref ref-type="bibr" rid="ref38">38</xref>] and pharmacy licensing examinations in Taiwan [<xref ref-type="bibr" rid="ref15">15</xref>]; translating pharmacy examination questions into English indeed improved scores across all subjects [<xref ref-type="bibr" rid="ref15">15</xref>]. Thus, it was hypothesized that GPT-4 would perform similarly in TCM licensing examinations. However, the results were surprising. The study used the first 2022 TCM licensing examinations in Taiwan as a case study to assess the performance of the model. GPT-4 failed the exam with an overall accuracy of 43.9%; following human revision of AI-provided explanations, the accuracy further decreased to 40.3% (human 1) and 39.1% (human 2). These results underscore the need for further research and development on the application of AI models to TCM examination preparation and highlight the existing knowledge gap. The reasons behind these outcomes merit further investigation.</p></sec><sec id="s4-2"><title>Challenges Encountered by ChatGPT When Answering Medical Questions</title><p>Previous literature has discussed the shortcomings and challenges of ChatGPT in answering examination questions, including a decreased proficiency in languages other than English [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref37">37</xref>], AI &#x201C;hallucinations&#x201D; originating from erroneous data [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref38">38</xref>], and proficiency limited to certain types of questions [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. 
The tendency for ChatGPT to be less proficient in answering questions posed in languages other than English stems from the fact that ChatGPT is an LLM trained primarily on English language data, which includes a wide variety of sources such as books, websites, and news articles [<xref ref-type="bibr" rid="ref6">6</xref>]. The questions for TCM licensing examinations are not presented in English. Although ChatGPT can fluently interact in traditional Chinese, its responses to medical examination questions, which require specific expertise and have standard answers, may reveal its inadequacies. The term AI &#x201C;hallucinations&#x201D; refers to the tendency of these models to produce factually incorrect content as a result of erroneous training data. This poses the risk of generating misleading or fabricated information, which complicates the use of AI as a reliable self-learning tool [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. We also encountered seemingly plausible but incorrect content in AI-generated responses in our research. We even found that verifying the authenticity of these answers is more time-consuming and requires deeper professional knowledge than the questions themselves. Our study also showed that ChatGPT had higher, albeit not statistically significant, accuracy rates for questions posed in the SAMC format (n=197, 44.9%) and presented with clinical vignettes (n=52, 45.6%). This trend aligns with findings of previous studies, such as a lower proficiency in multiple-choice questions [<xref ref-type="bibr" rid="ref13">13</xref>] and a poorer aptitude for conceptual questions compared with clinical scenarios [<xref ref-type="bibr" rid="ref39">39</xref>]. Despite these limitations, which we have also encountered, other research has shown that ChatGPT can pass examinations. 
Therefore, the use of ChatGPT in the context of TCM may pose its own unique set of challenges and necessitates further investigation.</p></sec><sec id="s4-3"><title>Challenges Encountered by ChatGPT When Answering TCM Examination Questions</title><p>We identified 3 main reasons for incorrect answers according to AI-generated responses, namely misinterpretation of the question, misunderstanding of concepts, and incorrect application of principles. Misunderstanding of concepts was the most prevalent, especially in questions with lower cognitive demand such as recall and LOTS, as well as in questions where a single item encompasses multiple questions (eg, SAMRMC), indicating either a lack of knowledge or incorrect knowledge. We believe that this primarily stems from 2 factors. First, the database for TCM is currently incomplete. Second, compared with Western medicine, TCM is often considered alternative medicine. If an LLM such as ChatGPT answers questions based solely on the Western medical knowledge system, then TCM content may be ignored. Additionally, TCM focuses on personalized treatment without a gold standard, leading to the absence of definitive answers for the same disease.</p><p>The incomplete TCM database is due to challenges such as insufficient data, lack of standardization, and unrepresentative data sources. Although the specific TCM data that ChatGPT uses for training are unclear, it is evident that the current online data for TCM are significantly less comprehensive than those for Western medicine. For instance, a bibliometric analysis over the past 20 years did not show a significant presence of TCM-related keywords in the context of pediatric allergic rhinitis [<xref ref-type="bibr" rid="ref40">40</xref>]. However, the usage rate of TCM for allergic diseases in Taiwan is approximately 30%&#x2010;50% [<xref ref-type="bibr" rid="ref41">41</xref>]. Therefore, a model constructed based on such a database is likely to exhibit discrepancies with reality. 
Furthermore, online data often contain inaccuracies or incomplete information. Previous research has shown that uncleaned training texts can affect performance and could underpin the subpar performance of the trained model [<xref ref-type="bibr" rid="ref42">42</xref>].</p><p>It is important to note that, due to challenges in translation and cross-cultural adaptation, certain medical terms have different connotations in the TCM and Western medical systems. However, ChatGPT tends to interpret these terms with a preference for their meanings within Western medicine. For instance, in some AI-generated responses, the TCM term for &#x201C;&#x809D;&#x201D; was mistakenly translated and described as the physical organ &#x201C;liver&#x201D; in Western medicine. Similarly, the term for &#x201C;&#x7627;&#x201D; in TCM was translated and described as &#x201C;malaria&#x201D; in some AI-generated responses. The understanding of &#x201C;&#x809D;&#x201D; in TCM is not entirely the same as in modern medicine, and &#x201C;&#x7627;&#x201D; in TCM refers to a broad category of symptoms similar to malaria but not restricted to infections caused by <italic>Plasmodium</italic>.</p><p>The crux of TCM is personalized treatment, which is antithetical to gold-standard treatments. Hence, multiple therapeutic approaches may exist for the same disease. If the examination questions do not specify a particular scope or clear criteria, there may be no standard answer or multiple possible solutions. This study revealed that the decrease in the overall accuracy rate after human review was primarily driven by a reduction in accuracy for LOTS questions, whereas the accuracy rate for HOTS remained stable or even increased. Regarding DOK, the decrease in accuracy following human review was primarily in recall, with less of a decrease noted in more advanced DOK (eg, basic application of skill/concept, strategic thinking). 
This suggests that GPT-4 is more adept at providing detailed explanations for complex logical reasoning questions, as opposed to simple memorization, which might be influenced by incorrect information. In addition, if users intend to use GPT to answer TCM questions, they should be particularly cautious of potential hallucinations in lower cognitive demand questions.</p><p>Our study revealed that the GPT-4 model is currently unable to pass the TCM licensing examinations. This research underscores the limitations of the performance of AI in TCM licensing examinations, as well as illuminates broader challenges within the realm of integrating TCM knowledge into AI development.</p></sec><sec id="s4-4"><title>Limitations</title><p>Although this study provides valuable insights into the use of the GPT-4 model for TCM licensing examination preparation, several limitations have been identified. The focus solely on the GPT-4 model of ChatGPT might neglect the complexities and potential capabilities of other recently developed AI-driven language models, such as Claude 3 by Anthropic, Bard (Gemini Pro) by Google, or LLaMA 2 by Meta. Notably, we did not use expert-level AI, such as Med-PaLM by Google [<xref ref-type="bibr" rid="ref43">43</xref>]. Moreover, we did not use other traditional Chinese-language LLMs, such as Taiwan-LLM [<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref45">45</xref>]. Nevertheless, GPT models are the most widely used and studied models, and it is necessary to use the same tool to facilitate comparisons with other research studies [<xref ref-type="bibr" rid="ref36">36</xref>].</p><p>Considering the cultural context specific to the TCM licensing examination of Taiwan, the generalizability of our findings to different regions or educational systems may be limited. Notably, model performance may change over time, indicating that our results may not be replicated in the future. 
This study also did not account for potential inconsistencies in responses provided by ChatGPT to identical queries during different sessions. However, this issue could be minimized by explicitly setting the parameters of ChatGPT.</p><p>Additionally, the difficulty of each exam can vary, which might affect ChatGPT&#x2019;s performance. However, the difficulty is generally controlled and, as a national exam, the pass rates have been stable over the years [<xref ref-type="bibr" rid="ref46">46</xref>]. Previous exam questions could potentially be part of the GPT model&#x2019;s training data (with a knowledge cutoff date of September 2021), introducing bias. Therefore, we only used the first exam of 2022 to mitigate this issue.</p></sec><sec id="s4-5"><title>Implications for Practice and Future Research</title><p>This study investigated the use of the GPT-4 model for TCM licensing examination preparation. The findings revealed that AI-driven tools are not yet valuable assets for TCM educators and students. The observed limitations (ie, often providing responses based on incorrect facts) highlight the need for further development before this model can be effectively used as a self-learning tool. As the AI field continues to advance with the introduction of new models, educators must stay informed and utilize the most effective tools while being cognizant of their limitations. This study sets the stage for 2 potential research directions. In terms of TCM, considering the suboptimal examination results, we speculate that the primary drawback lies in the quality of the front-end data. Future improvements may include incorporating ancient TCM texts and customizing training for LLMs.</p><p>We must deliberately incorporate relevant resources into our training database materials, such as textbooks on TCM in Chinese and ancient TCM texts. Currently, the majority of descriptions and knowledge regarding TCM are in Chinese. 
When these data are published in journals or translated into English, they often adopt the framework and language of modern medicine as a medium for knowledge transmission. This approach tends to underemphasize the original content of TCM, which is mostly documented in Chinese literature. Therefore, the inclusion of TCM materials in LLM training and the standardization of TCM should be targeted for improvement.</p><p>Tailoring training data for LLMs presents another promising avenue for improvement. TCM comprises different schools, suggesting that narrowing the knowledge domain could be more advantageous. Hence, to excel in TCM, developing specialized ChatGPT models or custom LLMs might be a beneficial strategy. Considering the current limitations in enhancing the database, integrating specific prompts offers an alternative solution. For example, the chain-of-thought method, used in LLMs for complex problem-solving, articulates intermediate steps in reasoning. This approach is particularly effective for models with extensive parameters, enhancing their ability to manage multistep tasks [<xref ref-type="bibr" rid="ref26">26</xref>]. It has been confirmed that this method can also improve the performance of ChatGPT in medical examinations [<xref ref-type="bibr" rid="ref47">47</xref>]. Hence, the adoption of chain-of-thought prompting may be a viable strategy to address the complexity of TCM examinations. Additionally, previous research indicated that restricting ChatGPT to a single response in a Basic Life Support examination may introduce bias. When ChatGPT generates 3 responses per question, it successfully passes the examination. Moreover, rephrasing incorrectly answered questions as open-ended questions significantly boosts the accuracy of ChatGPT. 
This implies that open-ended questioning or multiple inquiries might be more effective than single-choice formats [<xref ref-type="bibr" rid="ref48">48</xref>].</p></sec><sec id="s4-6"><title>Conclusion</title><p>Our study represents the first comprehensive assessment of the performance of ChatGPT in TCM licensing examinations. Despite advances in AI and its success in various medical licensing tests, ChatGPT demonstrated a limited ability to accurately respond to TCM examination questions, achieving an overall accuracy rate significantly lower than that of its human counterparts. This shortfall underscores the challenges posed by the unique concepts and terminologies of TCM, highlighting a significant knowledge gap in the understanding of TCM principles by AI. Our findings call for further advancements in AI training, specifically tailored toward the intricate domain of TCM, to enhance its utility in this specialized field of medicine.</p></sec></sec></body><back><ack><p>This study was partially supported by Chang Gung Medical Foundation (grant CGRPG1Q0011), the Ministry of Health and Welfare (grants MOHW112-CMAP-M-113-000006-D, MOHW113-CMAP-M-113-000002-D, and MOHW113-CMAP-M-113-000003-B), and the National Science and Technology Council in Taiwan (grant MOST111-2320-B-182-035-MY3).</p></ack><fn-group><fn fn-type="con"><p>LWT contributed to manuscript writing. HYC and YCC were responsible for the statistical analysis, project administration, funding acquisition, manuscript revision, and study design. Results were interpreted by LCT and YCL. 
HYC and YCC contributed equally as co-corresponding authors.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">aOR</term><def><p>adjusted odds ratio</p></def></def-item><def-item><term id="abb3">DOK</term><def><p>depth of knowledge</p></def></def-item><def-item><term id="abb4">ENT</term><def><p>ears, nose, and throat</p></def></def-item><def-item><term id="abb5">GPT</term><def><p>generative pretrained transformer</p></def></def-item><def-item><term id="abb6">GYN/OBS</term><def><p>gynecology/obstetrics</p></def></def-item><def-item><term id="abb7">HOTS</term><def><p>higher-order thinking skills</p></def></def-item><def-item><term id="abb8">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb9">LOTS</term><def><p>lower-order thinking skills</p></def></def-item><def-item><term id="abb10">SAMC</term><def><p>single-answer multiple-choice</p></def></def-item><def-item><term id="abb11">SAMRMC</term><def><p>single-answer, multiple-response multiple-choice</p></def></def-item><def-item><term id="abb12">TCM</term><def><p>traditional Chinese medicine</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chi</surname><given-names>C</given-names> </name></person-group><article-title>Integrating traditional medicine into modern health care systems: examining the role of Chinese medicine in Taiwan</article-title><source>Soc Sci Med</source><year>1994</year><month>08</month><volume>39</volume><issue>3</issue><fpage>307</fpage><lpage>321</lpage><pub-id pub-id-type="doi">10.1016/0277-9536(94)90127-9</pub-id><pub-id pub-id-type="medline">7939847</pub-id></nlm-citation></ref><ref 
id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chi</surname><given-names>C</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Lai</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>CY</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>SK</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>SC</given-names> </name></person-group><article-title>The practice of Chinese medicine in Taiwan</article-title><source>Soc Sci Med</source><year>1996</year><month>11</month><volume>43</volume><issue>9</issue><fpage>1329</fpage><lpage>1348</lpage><pub-id pub-id-type="doi">10.1016/0277-9536(95)00429-7</pub-id><pub-id pub-id-type="medline">8913003</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Park</surname><given-names>YL</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>CW</given-names> </name><name name-style="western"><surname>Sasaki</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ko</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Park</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ko</surname><given-names>SG</given-names> </name></person-group><article-title>Comparative study on the education system of traditional medicine in China, Japan, Korea, and Taiwan</article-title><source>Expl NY</source><year>2016</year><volume>12</volume><issue>5</issue><fpage>375</fpage><lpage>383</lpage><pub-id pub-id-type="doi">10.1016/j.explore.2016.06.004</pub-id><pub-id pub-id-type="medline">27546589</pub-id></nlm-citation></ref><ref 
id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>X</given-names> </name><name name-style="western"><surname>Li</surname><given-names>L</given-names> </name><name name-style="western"><surname>Efferth</surname><given-names>T</given-names> </name><name name-style="western"><surname>Shang</surname><given-names>D</given-names> </name></person-group><article-title>The impact of artificial intelligence on traditional Chinese medicine</article-title><source>Am J Chin Med</source><year>2021</year><volume>49</volume><issue>6</issue><fpage>1297</fpage><lpage>1314</lpage><pub-id pub-id-type="doi">10.1142/S0192415X21500622</pub-id><pub-id pub-id-type="medline">34247564</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>W</given-names> </name><name name-style="western"><surname>Ge</surname><given-names>X</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Zhai</surname><given-names>X</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>L</given-names> </name></person-group><article-title>Opportunities and challenges of traditional Chinese medicine doctors in the era of artificial intelligence</article-title><source>Front Med (Lausanne)</source><year>2023</year><volume>10</volume><fpage>1336175</fpage><pub-id pub-id-type="doi">10.3389/fmed.2023.1336175</pub-id><pub-id pub-id-type="medline">38274445</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="confproc"><person-group 
person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>TB</given-names> </name><name name-style="western"><surname>Mann</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ryder</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Language models are few-shot learners</article-title><year>2020</year><conf-name>Proceedings of the 34th International Conference on Neural Information Processing Systems</conf-name><conf-date>Dec 6-12, 2020</conf-date><conf-loc>Vancouver, BC, Canada</conf-loc></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSJ</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name></person-group><article-title>Large language models in medicine</article-title><source>N Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Han</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Park</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>H</given-names> 
</name></person-group><article-title>Analysis of the effect of an artificial intelligence chatbot educational program on non-face-to-face classes: a quasi-experimental study</article-title><source>BMC Med Educ</source><year>2022</year><month>12</month><day>1</day><volume>22</volume><issue>1</issue><fpage>830</fpage><pub-id pub-id-type="doi">10.1186/s12909-022-03898-3</pub-id><pub-id pub-id-type="medline">36457086</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Branum</surname><given-names>C</given-names> </name><name name-style="western"><surname>Schiavenato</surname><given-names>M</given-names> </name></person-group><article-title>Can ChatGPT accurately answer a PICOT question? Assessing AI response to a clinical question</article-title><source>Nurse Educ</source><year>2023</year><volume>48</volume><issue>5</issue><fpage>231</fpage><lpage>233</lpage><pub-id pub-id-type="doi">10.1097/NNE.0000000000001436</pub-id><pub-id pub-id-type="medline">37130197</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Borji</surname><given-names>A</given-names> </name></person-group><article-title>A categorical archive of ChatGPT failures</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 6, 2023</comment><pub-id pub-id-type="doi">10.21203/rs.3.rs-2895792/v1</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dave</surname><given-names>T</given-names> </name><name name-style="western"><surname>Athaluri</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>S</given-names> </name></person-group><article-title>ChatGPT in medicine: an 
overview of its applications, advantages, limitations, future prospects, and ethical considerations</article-title><source>Front Artif Intell</source><year>2023</year><volume>6</volume><fpage>1169595</fpage><pub-id pub-id-type="doi">10.3389/frai.2023.1169595</pub-id><pub-id pub-id-type="medline">37215063</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lai</surname><given-names>UH</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>KS</given-names> </name><name name-style="western"><surname>Hsu</surname><given-names>TY</given-names> </name><name name-style="western"><surname>Kan</surname><given-names>JKC</given-names> </name></person-group><article-title>Evaluating the performance of ChatGPT-4 on the United Kingdom Medical Licensing Assessment</article-title><source>Front Med (Lausanne)</source><year>2023</year><volume>10</volume><fpage>1240915</fpage><pub-id pub-id-type="doi">10.3389/fmed.2023.1240915</pub-id><pub-id pub-id-type="medline">37795422</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alfertshofer</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hoch</surname><given-names>CC</given-names> </name><name name-style="western"><surname>Funk</surname><given-names>PF</given-names> </name><etal/></person-group><article-title>Sailing the seven seas: a multinational comparison of ChatGPT&#x2019;s performance on medical licensing examinations</article-title><source>Ann Biomed Eng</source><year>2024</year><month>06</month><volume>52</volume><issue>6</issue><fpage>1542</fpage><lpage>1545</lpage><pub-id pub-id-type="doi">10.1007/s10439-023-03338-3</pub-id><pub-id pub-id-type="medline">37553555</pub-id></nlm-citation></ref><ref 
id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kung</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Cheatham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Medenilla</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title><source>PLOS Dig Health</source><year>2023</year><month>02</month><volume>2</volume><issue>2</issue><fpage>e0000198</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id><pub-id pub-id-type="medline">36812645</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>YM</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>HW</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>TJ</given-names> </name></person-group><article-title>Performance of ChatGPT on the pharmacist licensing examination in Taiwan</article-title><source>J Chin Med Assoc</source><year>2023</year><month>07</month><day>1</day><volume>86</volume><issue>7</issue><fpage>653</fpage><lpage>658</lpage><pub-id pub-id-type="doi">10.1097/JCMA.0000000000000942</pub-id><pub-id pub-id-type="medline">37227901</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Taira</surname><given-names>K</given-names> </name><name name-style="western"><surname>Itaya</surname><given-names>T</given-names> </name><name name-style="western"><surname>Hanada</surname><given-names>A</given-names> </name></person-group><article-title>Performance of the large language 
model ChatGPT on the National Nurse Examinations in Japan: evaluation study</article-title><source>JMIR Nurs</source><year>2023</year><month>06</month><day>27</day><volume>6</volume><fpage>e47305</fpage><pub-id pub-id-type="doi">10.2196/47305</pub-id><pub-id pub-id-type="medline">37368470</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Dou</surname><given-names>Z</given-names> </name><name name-style="western"><surname>He</surname><given-names>L</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>L</given-names> </name></person-group><article-title>Performance and exploration of ChatGPT in medical examination, records and education in Chinese: pave the way for medical AI</article-title><source>Int J Med Inform</source><year>2023</year><month>09</month><volume>177</volume><fpage>105173</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2023.105173</pub-id><pub-id pub-id-type="medline">37549499</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Yizhen</surname><given-names>L</given-names> </name><name name-style="western"><surname>Shaohan</surname><given-names>H</given-names> </name><name name-style="western"><surname>Jiaxing</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Lei</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Dongran</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhongzhi</surname><given-names>L</given-names> </name></person-group><article-title>Exploring the comprehension of ChatGPT in Traditional 
Chinese Medicine knowledge</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 14, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2403.09164</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jang</surname><given-names>D</given-names> </name><name name-style="western"><surname>Yun</surname><given-names>TR</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>CY</given-names> </name><name name-style="western"><surname>Kwon</surname><given-names>YK</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>CE</given-names> </name></person-group><article-title>GPT-4 can pass the Korean National Licensing Examination for Korean Medicine Doctors</article-title><source>PLOS Dig Health</source><year>2023</year><month>12</month><volume>2</volume><issue>12</issue><fpage>e0000416</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000416</pub-id><pub-id pub-id-type="medline">38100393</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="web"><source>Taiwan Ministry of Examination - Examination Question Inquiry Platform [Website in Mandarin]</source><access-date>2025-01-04</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://wwwq.moex.gov.tw/exam/wFrmExamQandASearch.aspx">https://wwwq.moex.gov.tw/exam/wFrmExamQandASearch.aspx</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zaidi</surname><given-names>NB</given-names> </name><name name-style="western"><surname>Hwang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Scott</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Stallard</surname><given-names>S</given-names> </name><name name-style="western"><surname>Purkiss</surname><given-names>J</given-names> </name><name name-style="western"><surname>Hortsch</surname><given-names>M</given-names> </name></person-group><article-title>Climbing Bloom&#x2019;s taxonomy pyramid: lessons from a graduate histology course</article-title><source>Anat Sci Ed</source><year>2017</year><month>09</month><volume>10</volume><issue>5</issue><fpage>456</fpage><lpage>464</lpage><pub-id pub-id-type="doi">10.1002/ase.1685</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Krathwohl</surname><given-names>DR</given-names> </name></person-group><article-title>A revision of Bloom&#x2019;s taxonomy: an overview</article-title><source>Theor Pract</source><year>2002</year><month>11</month><day>1</day><volume>41</volume><issue>4</issue><fpage>212</fpage><lpage>218</lpage><pub-id pub-id-type="doi">10.1207/s15430421tip4104_2</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Webb</surname><given-names>NL</given-names> </name></person-group><article-title>Depth-of-knowledge levels for four content areas</article-title><source>LA</source><year>2002</year><volume>28</volume><issue>March</issue><fpage>1</fpage><lpage>9</lpage></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Giannos</surname><given-names>P</given-names> </name><name name-style="western"><surname>Delardas</surname><given-names>O</given-names> </name></person-group><article-title>Performance of ChatGPT on UK standardized admission tests: insights from the BMAT, TMUA, LNAT, and TSA 
examinations</article-title><source>JMIR Med Educ</source><year>2023</year><month>04</month><day>26</day><volume>9</volume><fpage>e47737</fpage><pub-id pub-id-type="doi">10.2196/47737</pub-id><pub-id pub-id-type="medline">37099373</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gilson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Safranek</surname><given-names>CW</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>T</given-names> </name><etal/></person-group><article-title>How does ChatGPT perform on the United States Medical Licensing Examination (USMLE)? The implications of large language models for medical education and knowledge assessment</article-title><source>JMIR Med Educ</source><year>2023</year><month>02</month><day>8</day><volume>9</volume><fpage>e45312</fpage><pub-id pub-id-type="doi">10.2196/45312</pub-id><pub-id pub-id-type="medline">36753318</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Schuurmans</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Chain-of-thought prompting elicits reasoning in large language models</article-title><year>2022</year><conf-name>36th Conference on Neural Information Processing Systems</conf-name><conf-date>Nov 28 to Dec 9, 2022</conf-date><conf-loc>New Orleans, LA, USA</conf-loc><fpage>24824</fpage><lpage>24837</lpage></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Huang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Gomaa</surname><given-names>A</given-names> </name><name name-style="western"><surname>Semrau</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Benchmarking ChatGPT-4 on a radiation oncology in-training exam and Red Journal Gray Zone cases: potentials and challenges for AI-assisted medical education and decision making in radiation oncology</article-title><source>Front Oncol</source><year>2023</year><volume>13</volume><fpage>1265024</fpage><pub-id pub-id-type="doi">10.3389/fonc.2023.1265024</pub-id><pub-id pub-id-type="medline">37790756</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ali</surname><given-names>R</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>OY</given-names> </name><name name-style="western"><surname>Connolly</surname><given-names>ID</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT and GPT-4 on neurosurgery written board examinations</article-title><source>Neurosurgery</source><year>2023</year><month>12</month><day>1</day><volume>93</volume><issue>6</issue><fpage>1353</fpage><lpage>1365</lpage><pub-id pub-id-type="doi">10.1227/neu.0000000000002632</pub-id><pub-id pub-id-type="medline">37581444</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huh</surname><given-names>S</given-names> </name></person-group><article-title>Are ChatGPT&#x2019;s knowledge and interpretation ability comparable to those of medical students in Korea for taking a parasitology examination?: a descriptive study</article-title><source>J Educ Eval Health 
Prof</source><year>2023</year><volume>20</volume><fpage>1</fpage><pub-id pub-id-type="doi">10.3352/jeehp.2023.20.1</pub-id><pub-id pub-id-type="medline">36627845</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>W</given-names> </name><etal/></person-group><article-title>How does ChatGPT-4 preform on non-English national medical licensing examination? An evaluation in Chinese language</article-title><source>PLOS Dig Health</source><year>2023</year><month>12</month><volume>2</volume><issue>12</issue><fpage>e0000397</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000397</pub-id><pub-id pub-id-type="medline">38039286</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zong</surname><given-names>H</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>E</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>R</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>B</given-names> </name></person-group><article-title>Performance of ChatGPT on Chinese national medical licensing examinations: a five-year examination evaluation study for physicians, pharmacists and nurses</article-title><source>BMC Med Educ</source><year>2024</year><month>02</month><day>14</day><volume>24</volume><issue>1</issue><fpage>143</fpage><pub-id pub-id-type="doi">10.1186/s12909-024-05125-7</pub-id><pub-id 
pub-id-type="medline">38355517</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Aljindan</surname><given-names>FK</given-names> </name><name name-style="western"><surname>Al Qurashi</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Albalawi</surname><given-names>IAS</given-names> </name><etal/></person-group><article-title>ChatGPT conquers the Saudi Medical Licensing Exam: exploring the accuracy of artificial intelligence in medical knowledge assessment and implications for modern medical education</article-title><source>Cureus</source><year>2023</year><month>09</month><volume>15</volume><issue>9</issue><fpage>e45043</fpage><pub-id pub-id-type="doi">10.7759/cureus.45043</pub-id><pub-id pub-id-type="medline">37829968</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Watari</surname><given-names>T</given-names> </name><name name-style="western"><surname>Takagi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sakaguchi</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Performance comparison of ChatGPT-4 and Japanese medical residents in the General Medicine In-Training Examination: comparison study</article-title><source>JMIR Med Educ</source><year>2023</year><month>12</month><day>6</day><volume>9</volume><fpage>e52202</fpage><pub-id pub-id-type="doi">10.2196/52202</pub-id><pub-id pub-id-type="medline">38055323</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Roso&#x0142;</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>G&#x0105;sior</surname><given-names>JS</given-names> </name><name name-style="western"><surname>&#x0141;aba</surname><given-names>J</given-names> </name><name name-style="western"><surname>Korzeniewski</surname><given-names>K</given-names> </name><name name-style="western"><surname>M&#x0142;y&#x0144;czak</surname><given-names>M</given-names> </name></person-group><article-title>Evaluation of the performance of GPT-3.5 and GPT-4 on the Polish Medical Final Examination</article-title><source>Sci Rep</source><year>2023</year><month>11</month><day>22</day><volume>13</volume><issue>1</issue><fpage>20512</fpage><pub-id pub-id-type="doi">10.1038/s41598-023-46995-z</pub-id><pub-id pub-id-type="medline">37993519</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Flores-Cohaila</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Garc&#x00ED;a-Vicente</surname><given-names>A</given-names> </name><name name-style="western"><surname>Vizcarra-Jim&#x00E9;nez</surname><given-names>SF</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on the Peruvian National Licensing Medical Examination: cross-sectional study</article-title><source>JMIR Med Educ</source><year>2023</year><month>09</month><day>28</day><volume>9</volume><fpage>e48039</fpage><pub-id pub-id-type="doi">10.2196/48039</pub-id><pub-id pub-id-type="medline">37768724</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Okuhara</surname><given-names>T</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Performance of 
ChatGPT across different versions in medical licensing examinations worldwide: systematic review and meta-analysis</article-title><source>J Med Internet Res</source><year>2024</year><month>07</month><day>25</day><volume>26</volume><fpage>e60807</fpage><pub-id pub-id-type="doi">10.2196/60807</pub-id><pub-id pub-id-type="medline">39052324</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Gong</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>G</given-names> </name><etal/></person-group><article-title>ChatGPT performs on the Chinese National Medical Licensing Examination</article-title><source>J Med Syst</source><year>2023</year><month>08</month><day>15</day><volume>47</volume><issue>1</issue><fpage>86</fpage><pub-id pub-id-type="doi">10.1007/s10916-023-01961-0</pub-id><pub-id pub-id-type="medline">37581690</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>H</given-names> </name></person-group><article-title>Performance of ChatGPT on Registered Nurse License Exam in Taiwan: a descriptive study</article-title><source>Healthcare (Basel)</source><year>2023</year><month>10</month><day>30</day><volume>11</volume><issue>21</issue><fpage>2855</fpage><pub-id pub-id-type="doi">10.3390/healthcare11212855</pub-id><pub-id pub-id-type="medline">37958000</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Scaioli</surname><given-names>G</given-names> </name><name name-style="western"><surname>Lo Moro</surname><given-names>G</given-names> 
</name><name name-style="western"><surname>Conrado</surname><given-names>F</given-names> </name><name name-style="western"><surname>Rosset</surname><given-names>L</given-names> </name><name name-style="western"><surname>Bert</surname><given-names>F</given-names> </name><name name-style="western"><surname>Siliquini</surname><given-names>R</given-names> </name></person-group><article-title>Exploring the potential of ChatGPT for clinical reasoning and decision-making: a cross-sectional study on the Italian Medical Residency Exam</article-title><source>Ann Ist Super Sanita</source><year>2023</year><volume>59</volume><issue>4</issue><fpage>267</fpage><lpage>270</lpage><pub-id pub-id-type="doi">10.4415/ANN_23_04_05</pub-id><pub-id pub-id-type="medline">38088393</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>F</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>N</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>R</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name></person-group><article-title>Visual analysis of allergic rhinitis in children based on web of science and CiteSpace software</article-title><source>Front Pediatr</source><year>2022</year><volume>10</volume><fpage>911293</fpage><pub-id pub-id-type="doi">10.3389/fped.2022.911293</pub-id><pub-id pub-id-type="medline">36245734</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>PY</given-names> </name><name name-style="western"><surname>Chu</surname><given-names>CH</given-names> </name><name 
name-style="western"><surname>Chang</surname><given-names>FY</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>YW</given-names> </name><name name-style="western"><surname>Tsai</surname><given-names>HJ</given-names> </name><name name-style="western"><surname>Yao</surname><given-names>TC</given-names> </name></person-group><article-title>Trends and prescription patterns of traditional Chinese medicine use among subjects with allergic diseases: a nationwide population-based study</article-title><source>World Allergy Organ J</source><year>2019</year><volume>12</volume><issue>2</issue><fpage>100001</fpage><pub-id pub-id-type="doi">10.1016/j.waojou.2018.11.001</pub-id><pub-id pub-id-type="medline">30937136</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Rejeleene</surname><given-names>R</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Talburt</surname><given-names>J</given-names> </name></person-group><article-title>Towards trustable language models: investigating information quality of large language models</article-title><source>arXiv</source><comment>Preprint posted online in 2024</comment></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id 
pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>PH</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>WL</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>YT</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>YN</given-names> </name></person-group><article-title>Measuring Taiwanese Mandarin language understanding</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 29, 2024</comment><comment>arXiv:2403.20180</comment></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>YT</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>YN</given-names> </name></person-group><article-title>Taiwan LLM: bridging the linguistic divide with a culturally aligned language model</article-title><source>arXiv</source><comment>Preprint posted online on  Nov 29, 2023</comment><comment>arXiv:2311.17487</comment></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="web"><source>Taiwan Ministry of Examination - Examination Statistics [Website in Mandarin]</source><access-date>2025-01-04</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://wwwc.moex.gov.tw/main/examreport/wfrmexamstatistics.aspx?menu_id=158">https://wwwc.moex.gov.tw/main/examreport/wfrmexamstatistics.aspx?menu_id=158</ext-link></comment></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Ting</surname><given-names>YT</given-names> </name><name name-style="western"><surname>Hsieh</surname><given-names>TC</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>YF</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT incorporated chain-of-thought method in bilingual nuclear medicine physician board examinations</article-title><source>Digit Health</source><year>2024</year><volume>10</volume><fpage>20552076231224074</fpage><pub-id pub-id-type="doi">10.1177/20552076231224074</pub-id><pub-id pub-id-type="medline">38188855</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Mou</surname><given-names>W</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>T</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>R</given-names> </name></person-group><article-title>ChatGPT can pass the AHA exams: open-ended questions outperform multiple-choice format</article-title><source>Resuscitation</source><year>2023</year><month>07</month><volume>188</volume><fpage>109783</fpage><pub-id pub-id-type="doi">10.1016/j.resuscitation.2023.109783</pub-id><pub-id pub-id-type="medline">37349064</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>List of the 5 factors, data definitions, and source citations.</p><media xlink:href="mededu_v11i1e58897_app1.docx" xlink:title="DOCX File, 18 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Examples of single-answer multiple-choice and single-answer, multiple-response multiple-choice questions.</p><media 
xlink:href="mededu_v11i1e58897_app2.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Examples of the prompt used to generate responses from questions.</p><media xlink:href="mededu_v11i1e58897_app3.docx" xlink:title="DOCX File, 57 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Examples of the prompts used to generate responses from questions with explanations for each item.</p><media xlink:href="mededu_v11i1e58897_app4.docx" xlink:title="DOCX File, 179 KB"/></supplementary-material></app-group></back></article>