<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id><journal-id journal-id-type="publisher-id">mededu</journal-id><journal-id journal-id-type="index">20</journal-id><journal-title>JMIR Medical Education</journal-title><abbrev-journal-title>JMIR Med Educ</abbrev-journal-title><issn pub-type="epub">2369-3762</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e56762</article-id><article-id pub-id-type="doi">10.2196/56762</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Evaluating AI Competence in Specialized Medicine: Comparative Analysis of ChatGPT and Neurologists in a Neurology Specialist Examination in Spain</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Ros-Arlanz&#x00F3;n</surname><given-names>Pablo</given-names></name><degrees>MSc, MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Perez-Sempere</surname><given-names>Angel</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Neurology, Dr. 
Balmis General University Hospital</institution>, <addr-line>C/ Pintor Baeza, N&#x00BA; 11</addr-line><addr-line>Alicante</addr-line>, <country>Spain</country></aff><aff id="aff2"><institution>Department of Neuroscience, Instituto de Investigaci&#x00F3;n Sanitaria y Biom&#x00E9;dica de Alicante</institution>, <addr-line>Alicante</addr-line>, <country>Spain</country></aff><aff id="aff3"><institution>Department of Clinical Medicine, Miguel Hern&#x00E1;ndez University</institution>, <addr-line>Alicante</addr-line>, <country>Spain</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Lesselroth</surname><given-names>Blake</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Chisci</surname><given-names>Glauco</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Chatzimina</surname><given-names>Maria</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Rizvi</surname><given-names>Mohammed</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Gupta</surname><given-names>Pratima</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Pablo Ros-Arlanz&#x00F3;n, MSc, MD, Department of Neurology, Dr. 
Balmis General University Hospital, C/ Pintor Baeza, N&#x00BA; 11, Alicante, 03010, Spain, 34 965933000; <email>ros_pabarl@gva.es</email></corresp></author-notes><pub-date pub-type="collection"><year>2024</year></pub-date><pub-date pub-type="epub"><day>14</day><month>11</month><year>2024</year></pub-date><volume>10</volume><elocation-id>e56762</elocation-id><history><date date-type="received"><day>26</day><month>01</month><year>2024</year></date><date date-type="rev-recd"><day>29</day><month>07</month><year>2024</year></date><date date-type="accepted"><day>07</day><month>10</month><year>2024</year></date></history><copyright-statement>&#x00A9; Pablo Ros-Arlanz&#x00F3;n, Angel Perez-Sempere. Originally published in JMIR Medical Education (<ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org">https://mededu.jmir.org</ext-link>), 14.11.2024. </copyright-statement><copyright-year>2024</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org/">https://mededu.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mededu.jmir.org/2024/1/e56762"/><abstract><sec><title>Background</title><p>With the rapid advancement of artificial intelligence (AI) in various fields, evaluating its application in specialized medical contexts becomes crucial. 
ChatGPT, a large language model developed by OpenAI, has shown potential in diverse applications, including medicine.</p></sec><sec><title>Objective</title><p>This study aims to compare the performance of ChatGPT with that of attending neurologists in a real neurology specialist examination conducted in the Valencian Community, Spain, assessing the AI&#x2019;s capabilities and limitations in medical knowledge.</p></sec><sec sec-type="methods"><title>Methods</title><p>We conducted a comparative analysis using the 2022 neurology specialist examination results from 120 neurologists and responses generated by ChatGPT versions 3.5 and 4. The examination consisted of 80 multiple-choice questions, with a focus on clinical neurology and health legislation. Questions were classified according to Bloom&#x2019;s Taxonomy. Statistical analysis of performance, including the &#x03BA; coefficient for response consistency, was performed.</p></sec><sec sec-type="results"><title>Results</title><p>Human participants exhibited a median score of 5.91 (IQR: 4.93-6.76), with 32 neurologists failing to pass. ChatGPT-3.5 ranked 116th out of 122, answering 54.5% of questions correctly (score 3.94). ChatGPT-4 showed marked improvement, ranking 17th with 81.8% of correct answers (score 7.57), surpassing several human specialists. No significant variations were observed in the performance on lower-order questions versus higher-order questions. Additionally, ChatGPT-4 demonstrated greater response consistency across repeated attempts, as reflected by a higher &#x03BA; coefficient of 0.73, compared to ChatGPT-3.5&#x2019;s coefficient of 0.69.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This study underscores the evolving capabilities of AI in medical knowledge assessment, particularly in specialized fields. 
ChatGPT-4&#x2019;s performance, outperforming the median score of human participants in a rigorous neurology examination, represents a significant milestone in AI development, suggesting its potential as an effective tool in specialized medical education and assessment.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>ChatGPT</kwd><kwd>clinical decision-making</kwd><kwd>medical education</kwd><kwd>medical knowledge assessment</kwd><kwd>OpenAI</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Recent advancements in natural language processing, particularly the development of large language models (LLMs), have markedly transformed the capabilities of computational linguistics. Among these, ChatGPT, developed by OpenAI, stands out as a leading example, leveraging advanced deep learning techniques to emulate humanlike text generation. Introduced in late 2022, ChatGPT has quickly gained recognition for its ability to produce coherent and contextually relevant responses, owing to its training on a broad dataset [<xref ref-type="bibr" rid="ref1">1</xref>]. This versatility has made ChatGPT a valuable tool in numerous fields, including medicine.</p><p>In the medical field, ChatGPT&#x2019;s potential has been explored through its application in clinical settings and medical examinations, where it has demonstrated a notable proficiency in addressing complex medical and dental queries [<xref ref-type="bibr" rid="ref2">2</xref>-<xref ref-type="bibr" rid="ref9">9</xref>]. This has sparked interest in its role in improving medical education and training and supporting clinical decision-making.</p><p>In Spain, the process of obtaining a public position as a medical specialist in the public health service involves a competitive examination, which is administered independently across various regions. 
This is exemplified in the Valencian Community, where the selection of neurology specialists depends on an examination, encompassing both health legislation and clinical neurology questions. The examination is a critical component for securing a position in the public health care system, similar to a civil service examination, and is highly competitive. The candidates are already accredited neurologists with a minimum of 4 years of residency and at least 1 year of professional experience.</p><p>Despite numerous studies examining the performance of ChatGPT in various medical examinations, a significant gap remains in comparing its capabilities with the real performance and results of highly qualified and specialized clinicians in regional specialty examinations. This study specifically addresses this gap by comparing ChatGPT&#x2019;s performance with that of practicing neurologists in the Valencian Community&#x2019;s neurology specialist examination. The primary objective is to evaluate whether ChatGPT can match or surpass human expertise in this context. Additionally, we aim to assess the consistency and improvement in responses between ChatGPT versions 3.5 and 4. Our a priori hypotheses are as follows: (1) ChatGPT-4 will outperform ChatGPT-3.5, demonstrating improved accuracy and reliability, and (2) ChatGPT-4 will perform comparably to human neurologists. This analysis seeks to provide insights into the potential and limitations of artificial intelligence (AI) in specialized medical knowledge assessment and its implications for medical education and practice.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>We conducted a detailed comparative analysis to evaluate the performance of ChatGPT against board-certified neurologists in the 2022 Valencian Community neurology specialist examination [<xref ref-type="bibr" rid="ref10">10</xref>]. 
This examination is a credentialing examination that grants a job position in the public health system as a neurology specialist within the Valencian Community, rather than a medical licensing examination. Candidates who sit for this examination are already certified neurologists, having completed a minimum of 4 years of residency and at least 1 year of professional experience. Therefore, this examination is more specialized and competitive compared to typical specialty board examinations that grant the initial permission to practice. The 2022 examination employed a multiple-choice format, with 77 out of the original 80 questions considered for scoring, as 3 were invalidated due to errors in question formulation. A total of 120 practicing neurologists took the examination, competing for only 38 available job positions. The results of the individual examinations of each participating neurologist are publicly available on the Department of Health&#x2019;s website [<xref ref-type="bibr" rid="ref11">11</xref>].</p><p>The Valencian Health Service is one of the 17 regional health services in Spain, providing universal health care to both residents and travelers in the Valencian Community. This region, located on the eastern Mediterranean coast of Spain, has a population of more than 5.2 million inhabitants and attracts around 28.5 million tourists annually. The scope and geographic reach of the Valencian Health Service include all health care facilities within this region, making the credentialing examination crucial for those seeking to work in these public health care institutions.</p></sec><sec id="s2-2"><title>Multiple-Choice Question Examination</title><p>The examination adopted a scoring system where the maximum attainable score was 10, achievable by correctly answering all questions. Unanswered questions were not penalized. The scoring system penalized wrong answers: for every 3 wrong answers, the score for 1 correct answer was subtracted. 
Score = (N<sub>correct</sub> &#x2013; 1/3 N<sub>wrong</sub>) &#x00D7; 10/N<sub>total</sub>, where &#x201C;N&#x201D; represents the numbers of correct (N<sub>correct</sub>) and wrong (N<sub>wrong</sub>) answers and the total number of questions (N<sub>total</sub>). The test began with 12 questions on general public and health legislation topics, followed by 65 questions focused on clinical neurology, assessing both theoretical knowledge and clinical reasoning. Participants with a score higher than 4.5 points passed the examination [<xref ref-type="bibr" rid="ref10">10</xref>].</p></sec><sec id="s2-3"><title>Data Collection and Assessment</title><p>We compiled the scores of the 120 participating neurologists, which are publicly available (Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). To assess the performance of GPT-3.5 and GPT-4, we used their respective application programming interfaces (APIs). Two independent researchers, PRA and APS, tasked the ChatGPT versions 3.5 and 4 with answering the examination&#x2019;s multiple-choice questions. This study was conducted in December 2023 and used the LLM versions available at that time.</p><sec id="s2-3-1"><title>Prompt Engineering</title><p>For consistency, each version of ChatGPT was given the same set of prompts. The initial prompt provided a brief context of the examination question and instructed the AI to select the best answer (see Supplement 1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s2-3-2"><title>Interface Version</title><p>We utilized the paid subscription API for both ChatGPT-3.5 and ChatGPT-4, ensuring access to the most advanced features available. 
We used the default temperature setting to maintain consistency and comparability between responses.</p></sec><sec id="s2-3-3"><title>Language Settings</title><p>Both input and output languages were set to Spanish to match the language of the original examination. This ensured that the AI models processed and responded to the questions in the same language as the neurologists.</p></sec><sec id="s2-3-4"><title>Trial Repetitions</title><p>Each ChatGPT version was tested twice independently to account for any variability in responses. This involved rerunning the entire set of examination questions with the same prompts. For each trial, the responses were recorded and analyzed separately to evaluate consistency and performance.</p></sec><sec id="s2-3-5"><title>Efforts to Chain Prompts</title><p>No prompt chaining was employed in this study. Each question was presented individually, and the AI&#x2019;s responses were based solely on the information provided in the individual prompts.</p></sec><sec id="s2-3-6"><title>Details of Trials</title><p>In total, 4 sets of responses were generated (2 for each version of ChatGPT). Each trial was conducted independently by the researchers to avoid memory bias or influence from previous attempts. The answers were then compiled and compared against the correct answers to calculate the scores.</p></sec></sec><sec id="s2-4"><title>Question Complexity Classification</title><p>Questions in the examination were categorized according to the principles of Bloom&#x2019;s Taxonomy [<xref ref-type="bibr" rid="ref12">12</xref>], a framework for learning and evaluation. This classification differentiated between questions testing lower-order thinking skills, such as recall and basic understanding, and those measuring higher-order thinking skills, such as application, analysis, and evaluation. The classification process involved the following steps. 
Two independent researchers, PRA and APS, assigned Bloom&#x2019;s Taxonomy classifications to each examination question. To ensure consistency and accuracy in the classification, the initial assignments by both researchers were compared. Any discrepancies in classification were discussed in consensus meetings between the researchers until an agreement was reached. After resolving discrepancies, the final classifications were used in the analysis. These classifications were then used to evaluate the performance of ChatGPT-3.5 and ChatGPT-4 across different levels of cognitive tasks.</p></sec><sec id="s2-5"><title>Statistical Analysis</title><p>The statistical analysis of the data was conducted using R software, version 4.2.1 (R Foundation for Statistical Computing) [<xref ref-type="bibr" rid="ref13">13</xref>].</p><p>We checked the data&#x2019;s normality using the Kolmogorov-Smirnov test. To assess the consistency of responses within each ChatGPT version across different trials, we calculated the &#x03BA; coefficient for each model. Specifically, we compared the responses given by ChatGPT-3.5 in its two trials and separately compared the responses given by ChatGPT-4 in its two trials. The &#x03BA; coefficient measures the agreement between these two sets of responses, providing an indication of the reliability of the AI&#x2019;s performance across different attempts.</p></sec><sec id="s2-6"><title>Ethical Considerations</title><p>Members of the Dr. Balmis General University Hospital Ethics Review Board evaluated this project and stated that this committee was not competent to evaluate studies of this type, as they do not encompass human subjects, the use of biological samples, or personal data. 
Therefore, ethics committee approval was not required for the execution of this study.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Neurologists&#x2019; Performance</title><p>In the examination under study, 120 neurologists participated. Their median score was 5.91 (IQR: 4.93-6.76) out of 10, with an SD of 1.40. The Kolmogorov-Smirnov test confirmed the normal distribution of these scores. Of these 120 neurologists, 32 did not pass the examination.</p></sec><sec id="s3-2"><title>ChatGPT-3.5 Performance</title><p>ChatGPT-3.5, acting as a hypothetical 121st participant, showed varying results in different attempts. In its first attempt, it answered 41 out of 77 questions correctly, and in another attempt, it managed 42 correct answers. ChatGPT-3.5&#x2019;s scores were 3.77 and 3.94, respectively, in these attempts. However, it failed to reach the examination&#x2019;s passing threshold. Specifically, it answered 32 out of 65 (49.2%) of the clinical neurology and 3.5 out of 12 (29.2%) of the health legislation questions incorrectly, leading to an overall error rate of 35.5 out of 77 (46.1%).</p></sec><sec id="s3-3"><title>ChatGPT-4 Performance</title><p>ChatGPT-4 demonstrated a more robust performance, correctly answering 62 and 63 out of 77 questions in its two attempts, achieving a score of 7.57 out of 10 on its best attempt. This score would have qualified it to pass the examination, ranking it 17th out of the 122 candidates (which includes the 120 neurologists and both ChatGPT versions). ChatGPT-4&#x2019;s error rate was 11.05 wrong answers out of 65 (17%) in clinical neurology questions and 3 out of 12 (25%) in legal questions. 
<xref ref-type="fig" rid="figure1">Figure 1</xref> compares the score distribution of the neurologists who took the examination with the performances of ChatGPT-3.5 and ChatGPT-4.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Distribution of neurologists&#x2019; examination scores. The graph shows the median performance of neurologists and the highest scores of ChatGPT-3.5 and ChatGPT-4 within the overall score distribution.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v10i1e56762_fig01.png"/></fig></sec><sec id="s3-4"><title>Concordance Analysis and Complexity-Based Performance</title><p>The &#x03BA; coefficient for ChatGPT-3.5 was 0.686, measuring the consistency of its responses across attempts. ChatGPT-4&#x2019;s &#x03BA; coefficient was slightly higher at 0.725. Both models showed a high level of consistency in their performances across different attempts, with a mere 1.25% variation in their scores. <xref ref-type="table" rid="table1">Table 1</xref> presents the performance data of each model and attempt, broken down by Bloom&#x2019;s Taxonomy question classifications.</p><p>Based on Bloom&#x2019;s Taxonomy, lower-order questions included tasks such as defining terms, recalling facts, and understanding basic concepts (eg, &#x201C;Which lesion causes ideomotor apraxia?&#x201D;). 
Higher-order questions required application, analysis, and evaluation (eg, &#x201C;Given the following symptoms, what is the most likely diagnosis?&#x201D;).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Comparative performance analysis of ChatGPT-3.5 and ChatGPT-4 models on the examination: accuracy across attempts and question difficulty levels.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2">Model and<break/>attempt</td><td align="left" valign="bottom">Overall accuracy (%)</td><td align="left" valign="bottom">Accuracy on lower-order questions (%)</td><td align="left" valign="bottom">Accuracy on higher-order questions (%)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="5">ChatGPT-3.5</td></tr><tr><td align="left" valign="top" colspan="2">&#x2003;Attempt 1</td><td align="left" valign="top">53.25</td><td align="left" valign="top">54.84</td><td align="left" valign="top">52.17</td></tr><tr><td align="left" valign="top" colspan="2">&#x2003;Attempt 2</td><td align="left" valign="top">54.55</td><td align="left" valign="top">54.84</td><td align="left" valign="top">54.35</td></tr><tr><td align="left" valign="top" colspan="5">ChatGPT-4</td></tr><tr><td align="left" valign="top" colspan="2">&#x2003;Attempt 1</td><td align="left" valign="top">81.82</td><td align="left" valign="top">77.42</td><td align="left" valign="top">84.78</td></tr><tr><td align="left" valign="top" colspan="2">&#x2003;Attempt 2</td><td align="left" valign="top">80.52</td><td align="left" valign="top">80.65</td><td align="left" valign="top">80.43</td></tr></tbody></table></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>This study&#x2019;s comparative analysis between ChatGPT and neurologists in a real medical examination offers valuable insights into the current abilities and limitations of AI in the assessment of medical knowledge. 
We selected ChatGPT, instead of other LLMs such as Gemini or Bard, for our study due to its well-documented performance in medical examinations, robust and user-friendly API facilitating easy integration and comprehensive testing, and its popularity and widespread usage, making it one of the most commonly used LLMs in the world as of December 2023.</p><p>ChatGPT has been able to pass the medical license examinations of several countries such as the United States [<xref ref-type="bibr" rid="ref14">14</xref>], Germany [<xref ref-type="bibr" rid="ref15">15</xref>], China [<xref ref-type="bibr" rid="ref16">16</xref>], Japan [<xref ref-type="bibr" rid="ref7">7</xref>], Saudi Arabia [<xref ref-type="bibr" rid="ref17">17</xref>], Poland [<xref ref-type="bibr" rid="ref18">18</xref>], and Spain [<xref ref-type="bibr" rid="ref19">19</xref>]. Furthermore, ChatGPT has been able to pass the medical examination of a growing list of different medical specialties: anesthesiology [<xref ref-type="bibr" rid="ref20">20</xref>], nuclear medicine [<xref ref-type="bibr" rid="ref21">21</xref>], ophthalmology [<xref ref-type="bibr" rid="ref22">22</xref>], otolaryngology [<xref ref-type="bibr" rid="ref23">23</xref>], radiology [<xref ref-type="bibr" rid="ref24">24</xref>], neurosurgery [<xref ref-type="bibr" rid="ref25">25</xref>], and neurology [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>].</p><p>A key strength of our study is its real-world setting&#x2014;an actual competitive examination undertaken by 120 practicing neurologists, who were competing for specialized positions within the Valencian Health Service. This examination provides a tough and high-pressure assessment of their expertise, reflecting the pressures and complexities encountered in highly specialized and competitive scenarios. The range of scores among the neurologists serves as a human benchmark, highlighting the variability in medical expertise. 
This variability underlines the dynamic and individual nature of medical knowledge, and provides a realistic benchmark for assessing the capabilities of AI tools such as ChatGPT in professional scenarios. However, the focus on the Valencian Community might limit the generalizability of the findings to other regions or countries.</p><p>ChatGPT-3.5&#x2019;s performance, though notable, reveals complexities. It accurately answered 42 (54.5%) of the questions in its best attempt, surpassing only 6 attending neurologists and failing to pass the examination. If ChatGPT-3.5 were a real examination participant, it would rank 116th out of 122 candidates&#x2014;indicating room for improvement. The disparity in its performance between legal and neurology questions prompts further investigation into its decision-making processes. In contrast, ChatGPT-4&#x2019;s performance shows significant improvement over ChatGPT-3.5. In the demanding neurology specialist examination, ChatGPT-4 not only surpassed its predecessor but also outperformed 103 of 120 human medical specialists. This marks a substantial advance in the model&#x2019;s handling of specialized medical knowledge and suggests its potential as a tool in medical education and decision-making.</p><p>The study design we implemented did not include mechanisms for ChatGPT to explain or reason its answers, which limits our ability to evaluate the types of errors made by the AI models, such as differentiating between content errors and question interpretation errors. We did not prompt ChatGPT to provide explanations for its responses, and thus, we cannot perform a detailed analysis of its reasoning processes. This limitation highlights a gap in our study, as we were unable to analyze the types of errors made by ChatGPT. 
Future research should incorporate prompts for AI models to explain their answers, which would enable a deeper analysis of content errors versus question interpretation errors.</p><p>We calculated &#x03BA; coefficients to assess the consistency of responses between trials for ChatGPT-3.5 and ChatGPT-4. The &#x03BA; coefficient was 0.686 for ChatGPT-3.5 and 0.725 for ChatGPT-4, both indicating substantial but not perfect agreement. The slightly higher &#x03BA; coefficient for ChatGPT-4 suggests improved reliability; however, the concordance is still not at a level that can be fully trusted without human oversight. This underscores the necessity for clinicians to critically evaluate AI responses and reasoning, reinforcing the principle that &#x201C;two heads are better than one.&#x201D; Future iterations should aim for even higher consistency, particularly in high-stakes fields such as neurology.</p><p>Unlike most existing literature that evaluates AI in English [<xref ref-type="bibr" rid="ref28">28</xref>], our study probes ChatGPT&#x2019;s performance in Spanish, a vital consideration for global medical applications given the variation in medical terminology and nuances across languages. The latest edition of the Cervantes Institute yearbook provides some data that reflect the magnitude of Spanish today [<xref ref-type="bibr" rid="ref29">29</xref>]. It is the fourth most commonly used language globally and the third most widely used language on the internet. Two studies have analyzed the performance of ChatGPT versions 3.5 and 4 in the Spanish examination akin to the United States Medical Licensing Examination (USMLE) [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. In the first study, ChatGPT-4 correctly answered 158 out of 182 (86.8%) of the questions, while in the second study, which focused solely on rheumatology questions, it correctly answered 134 out of 143 (93.7%) of the questions. 
In the first study, questions were prompted in both English and Spanish, with no significant differences observed. These data suggest that the performance of ChatGPT in Spanish in medical examinations is comparable to its performance in English.</p><p>ChatGPT sometimes provides confident answers that are meaningless when considered in the light of common knowledge in these areas. This phenomenon has been described as &#x201C;artificial hallucination&#x201D; [<xref ref-type="bibr" rid="ref31">31</xref>]. This overconfidence was also observed in a neurology board-style examination [<xref ref-type="bibr" rid="ref26">26</xref>] and in our study. Although the prompt for each question stated that &#x201C;The objective is to achieve the maximum score. The score is equal to the number of correct answers minus incorrect answers divided by 3. So, if you are unsure about a question is better not to answer it in order to achieve the maximum possible score,&#x201D; ChatGPT-3.5 and ChatGPT-4 answered all the questions. This behavior, known as &#x201C;artificial hallucination,&#x201D; poses serious risks in medical education, as overconfident yet wrong responses can mislead educators and students, potentially compromising patient safety and care quality. The AI&#x2019;s inability to accurately gauge its confidence level and the appropriateness of not responding raises ethical concerns, especially in high-stakes environments such as neurology where precise knowledge and cautious decision-making are critical. To mitigate these risks, it is crucial to ensure that AI complements rather than replaces human judgment, with safeguards to prevent overreliance on AI. 
Training AI to recognize its limitations and abstain from responding when uncertain is essential to maintaining the integrity and safety of medical practice.</p><p>In contrast to another study where both models demonstrated weaker performance in tasks requiring higher-order thinking compared with questions requiring only lower-order thinking [<xref ref-type="bibr" rid="ref26">26</xref>], our research revealed that ChatGPT&#x2019;s performance remained consistent across tasks demanding both higher-order and lower-order thinking.</p><p>The ability of AI models, such as ChatGPT, to successfully pass medical examinations raises significant questions about the nature and effectiveness of these examinations. It is not just about what AI can do, but also what these examinations are really testing. This leads us to consider whether these exams accurately measure the real-world skills and knowledge essential for medical professionals. To address this, we propose several key areas of focus:</p><list list-type="order"><list-item><p>Uniquely human skills: More emphasis should be placed on assessing skills unique to human practitioners, such as clinical reasoning (gathering information, developing differential diagnosis, and justifying decision-making process), ethical judgment, and empathetic communication. These are vital yet challenging to quantify aspects of medicine, such as empathy, ethics, and patient-centered care. Developing methods to evaluate these skills could greatly benefit the medical field. Specifically, we propose the use of interactive patient simulations in which candidates must gather information directly from the patient. 
While current AI models can imitate specialist performance in clinical reasoning and developing differential diagnoses, the information provided to these models should be obtained through interactions with human specialists.</p></list-item><list-item><p>Application in real-world scenarios: Examinations should evolve to test the practical application of medical knowledge in real-life situations. This includes assessing abilities in diagnosis and treatment planning within complex clinical contexts, ensuring that professionals are prepared for real-world challenges. Additionally, allowing the use of LLM interfaces and other search engines during some examinations can simulate real-world conditions where clinicians have access to various technological aids. This approach not only tests their knowledge but also evaluates their critical thinking and ability to effectively search for and apply relevant information. Integrating these technologies into examinations can help improve clinicians&#x2019; performance by fostering skills that are essential in modern medical practice.</p></list-item><list-item><p>Interdisciplinary skills: Given the interdisciplinary nature of modern health care, examinations should also focus on teamwork, collaboration, and communication skills. They should assess the ability of medical professionals to integrate information across various specialties, reflecting the collaborative environment of contemporary health care.</p></list-item><list-item><p>Focus on continual learning: To motivate and teach lifelong learning, we need to shift our focus from merely teaching information retrieval to fostering skills in critical appraisal, problem-solving, and continuous professional development. While GPT can efficiently retrieve information, it is essential for medical professionals to critically appraise and apply this information. 
Future examinations should include components where candidates review and critique recent research articles, identifying strengths, weaknesses, and the applicability of findings to clinical practice. This ensures clinicians develop the ability to evaluate the quality and relevance of the information they encounter. Additionally, presenting candidates with novel clinical guidelines or emerging evidence in examinations will require them to integrate new information into their practice. This scenario-based assessment evaluates their ability to stay current with ongoing advancements and incorporate new knowledge effectively into clinical decision-making. Emphasizing self-directed learning and the use of various educational resources will help clinicians remain adaptable and proficient throughout their careers.</p></list-item></list><p>In summary, while AI passing medical examinations is an impressive feat, it highlights the need for evolution in medical education and assessment, ensuring that they measure the skills and knowledge that future medical professionals will truly need.</p><sec id="s4-1"><title>Conclusion</title><p>Our study reveals the nuanced interplay between AI and human expertise in neurology, highlighting ChatGPT&#x2019;s potential as a medical knowledge resource. Despite its promising performance, the variability in both AI and human responses calls for a careful, measured integration of AI into medical practice.</p><p>The combination of AI and human expertise could significantly enhance medical education and practice. However, this integration must prioritize patient care and safety, ensuring that AI complements rather than replaces human judgment.</p><p>In summary, this research contributes to the ongoing narrative of AI in health care and sets the stage for further exploration into refining AI for specialized medical uses. 
The focus remains on harnessing AI to support, not supplant, the invaluable insights of medical professionals.</p></sec></sec></body><back><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb3">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb4">USMLE</term><def><p>United States Medical Licensing Examination</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="web"><article-title>Introducing ChatGPT</article-title><source>OpenAI</source><year>2023</year><access-date>2024-10-23</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/blog/chatgpt">https://openai.com/blog/chatgpt</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mesko</surname><given-names>B</given-names> </name></person-group><article-title>The ChatGPT (generative artificial intelligence) revolution has made artificial intelligence approachable for medical professionals</article-title><source>J Med Internet Res</source><year>2023</year><month>06</month><day>22</day><volume>25</volume><fpage>e48392</fpage><pub-id pub-id-type="doi">10.2196/48392</pub-id><pub-id pub-id-type="medline">37347508</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kung</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Cheatham</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Medenilla</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title><source>PLOS Digit Health</source><year>2023</year><month>02</month><volume>2</volume><issue>2</issue><fpage>e0000198</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id><pub-id pub-id-type="medline">36812645</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Hassan</surname><given-names>R</given-names> </name><name name-style="western"><surname>Mahmood</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Trialling a large language model (ChatGPT) in general practice with the applied knowledge test: observational study demonstrating opportunities and limitations in primary care</article-title><source>JMIR Med Educ</source><year>2023</year><month>04</month><day>21</day><volume>9</volume><fpage>e46599</fpage><pub-id pub-id-type="doi">10.2196/46599</pub-id><pub-id pub-id-type="medline">37083633</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Giannos</surname><given-names>P</given-names> </name><name name-style="western"><surname>Delardas</surname><given-names>O</given-names> </name></person-group><article-title>Performance of ChatGPT on UK standardized admission tests: insights from the BMAT, TMUA, LNAT, and TSA examinations</article-title><source>JMIR Med Educ</source><year>2023</year><month>04</month><day>26</day><volume>9</volume><fpage>e47737</fpage><pub-id pub-id-type="doi">10.2196/47737</pub-id><pub-id 
pub-id-type="medline">37099373</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Giannos</surname><given-names>P</given-names> </name></person-group><article-title>Evaluating the limits of AI in medical specialisation: ChatGPT&#x2019;s performance on the UK neurology specialty certificate examination</article-title><source>BMJ Neurol Open</source><year>2023</year><month>06</month><day>15</day><volume>5</volume><issue>1</issue><fpage>e000451</fpage><pub-id pub-id-type="doi">10.1136/bmjno-2023-000451</pub-id><pub-id pub-id-type="medline">37337531</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Takagi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Watari</surname><given-names>T</given-names> </name><name name-style="western"><surname>Erabi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sakaguchi</surname><given-names>K</given-names> </name></person-group><article-title>Performance of GPT-3.5 and GPT-4 on the Japanese medical licensing examination: comparison study</article-title><source>JMIR Med Educ</source><year>2023</year><month>06</month><day>29</day><volume>9</volume><fpage>e48002</fpage><pub-id pub-id-type="doi">10.2196/48002</pub-id><pub-id pub-id-type="medline">37384388</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hatia</surname><given-names>A</given-names> </name><name name-style="western"><surname>Doldo</surname><given-names>T</given-names> </name><name name-style="western"><surname>Parrini</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Accuracy and completeness of ChatGPT-generated 
information on interceptive orthodontics: a multicenter collaborative study</article-title><source>J Clin Med</source><year>2024</year><month>01</month><day>27</day><volume>13</volume><issue>3</issue><fpage>735</fpage><pub-id pub-id-type="doi">10.3390/jcm13030735</pub-id><pub-id pub-id-type="medline">38337430</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Frosolini</surname><given-names>A</given-names> </name><name name-style="western"><surname>Franz</surname><given-names>L</given-names> </name><name name-style="western"><surname>Benedetti</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Assessing the accuracy of ChatGPT references in head and neck and ENT disciplines</article-title><source>Eur Arch Otorhinolaryngol</source><year>2023</year><month>11</month><volume>280</volume><issue>11</issue><fpage>5129</fpage><lpage>5133</lpage><pub-id pub-id-type="doi">10.1007/s00405-023-08205-4</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="web"><article-title>Diario oficial de la generalitat valenciana [Article in Spanish]</article-title><source>Generalitat Valenciana</source><year>2020</year><access-date>2024-10-23</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://dogv.gva.es/datos/2020/11/04/pdf/2020_8784.pdf">https://dogv.gva.es/datos/2020/11/04/pdf/2020_8784.pdf</ext-link></comment></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="web"><article-title>Lista aprobados (fase oposici&#x00F3;n) [Article in Spanish]</article-title><source>Generalitat Valenciana</source><access-date>2024-10-23</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.gva.es/downloads/publicados/EP/54_FE_NEUROLOGIA_RES_NOTAS_DEF_casval_firmado.pdf">https://www.gva.es/downloads/publicados/EP/54_FE_NEUROLOGIA_RES_NOTAS_DEF_casval_firmado.pdf</ext-link></comment></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sawin</surname><given-names>EI</given-names> </name></person-group><article-title>Taxonomy of educational objectives: the classification of educational goals. Handbook 1. Committee of College and University Examiners, Benjamin S. Bloom</article-title><source>Elem Sch J</source><year>1957</year><month>03</month><volume>57</volume><issue>6</issue><fpage>343</fpage><lpage>344</lpage><pub-id pub-id-type="doi">10.1086/459563</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>R Core Team</collab></person-group><source>R: a language and environment for statistical computing</source><year>2022</year><access-date>2024-10-23</access-date><publisher-name>R Foundation for Statistical Computing</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.R-project.org/">https://www.R-project.org/</ext-link></comment></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mihalache</surname><given-names>A</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>RS</given-names> </name><name name-style="western"><surname>Popovic</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Muni</surname><given-names>RH</given-names> </name></person-group><article-title>ChatGPT-4: an assessment of an upgraded artificial intelligence chatbot in the United States Medical Licensing Examination</article-title><source>Med 
Teach</source><year>2024</year><month>03</month><day>3</day><volume>46</volume><issue>3</issue><fpage>366</fpage><lpage>372</lpage><pub-id pub-id-type="doi">10.1080/0142159X.2023.2249588</pub-id><pub-id pub-id-type="medline">37839017</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jung</surname><given-names>LB</given-names> </name><name name-style="western"><surname>Gudera</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Wiegand</surname><given-names>TLT</given-names> </name><name name-style="western"><surname>Allmendinger</surname><given-names>S</given-names> </name><name name-style="western"><surname>Dimitriadis</surname><given-names>K</given-names> </name><name name-style="western"><surname>Koerte</surname><given-names>IK</given-names> </name></person-group><article-title>ChatGPT passes German state examination in medicine with picture questions omitted</article-title><source>Dtsch Arztebl Int</source><year>2023</year><month>05</month><day>30</day><volume>120</volume><issue>21</issue><fpage>373</fpage><lpage>374</lpage><pub-id pub-id-type="doi">10.3238/arztebl.m2023.0113</pub-id><pub-id pub-id-type="medline">37530052</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>W</given-names> </name><etal/></person-group><article-title>How does ChatGPT-4 preform on non-English national medical licensing examination? 
An evaluation in Chinese language</article-title><source>PLOS Digit Health</source><year>2023</year><month>12</month><volume>2</volume><issue>12</issue><fpage>e0000397</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000397</pub-id><pub-id pub-id-type="medline">38039286</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Aljindan</surname><given-names>FK</given-names> </name><name name-style="western"><surname>Al Qurashi</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Albalawi</surname><given-names>IAS</given-names> </name><etal/></person-group><article-title>ChatGPT conquers the Saudi medical licensing exam: exploring the accuracy of artificial intelligence in medical knowledge assessment and implications for modern medical education</article-title><source>Cureus</source><year>2023</year><month>09</month><volume>15</volume><issue>9</issue><fpage>e45043</fpage><pub-id pub-id-type="doi">10.7759/cureus.45043</pub-id><pub-id pub-id-type="medline">37829968</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Roso&#x0142;</surname><given-names>M</given-names> </name><name name-style="western"><surname>G&#x0105;sior</surname><given-names>JS</given-names> </name><name name-style="western"><surname>&#x0141;aba</surname><given-names>J</given-names> </name><name name-style="western"><surname>Korzeniewski</surname><given-names>K</given-names> </name><name name-style="western"><surname>M&#x0142;y&#x0144;czak</surname><given-names>M</given-names> </name></person-group><article-title>Evaluation of the performance of GPT-3.5 and GPT-4 on the Polish medical final examination</article-title><source>Sci 
Rep</source><year>2023</year><month>11</month><day>22</day><volume>13</volume><issue>1</issue><fpage>20512</fpage><pub-id pub-id-type="doi">10.1038/s41598-023-46995-z</pub-id><pub-id pub-id-type="medline">37993519</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guillen-Grima</surname><given-names>F</given-names> </name><name name-style="western"><surname>Guillen-Aguinaga</surname><given-names>S</given-names> </name><name name-style="western"><surname>Guillen-Aguinaga</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Evaluating the efficacy of ChatGPT in navigating the Spanish medical residency entrance examination (MIR): promising horizons for AI in clinical medicine</article-title><source>Clin Pract</source><year>2023</year><month>11</month><day>20</day><volume>13</volume><issue>6</issue><fpage>1460</fpage><lpage>1487</lpage><pub-id pub-id-type="doi">10.3390/clinpract13060130</pub-id><pub-id pub-id-type="medline">37987431</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shay</surname><given-names>D</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>B</given-names> </name><name name-style="western"><surname>Redaelli</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Could ChatGPT-4 pass an anaesthesiology board examination? 
Follow-up assessment of a comprehensive set of board examination practice questions</article-title><source>Br J Anaesth</source><year>2024</year><month>01</month><volume>132</volume><issue>1</issue><fpage>172</fpage><lpage>174</lpage><pub-id pub-id-type="doi">10.1016/j.bja.2023.10.025</pub-id><pub-id pub-id-type="medline">37996275</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ting</surname><given-names>YT</given-names> </name><name name-style="western"><surname>Hsieh</surname><given-names>TC</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>YF</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT incorporated chain-of-thought method in bilingual nuclear medicine physician board examinations</article-title><source>Digit Health</source><year>2024</year><month>01</month><day>5</day><volume>10</volume><fpage>20552076231224074</fpage><pub-id pub-id-type="doi">10.1177/20552076231224074</pub-id><pub-id pub-id-type="medline">38188855</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sakai</surname><given-names>D</given-names> </name><name name-style="western"><surname>Maeda</surname><given-names>T</given-names> </name><name name-style="western"><surname>Ozaki</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kanda</surname><given-names>GN</given-names> </name><name name-style="western"><surname>Kurimoto</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Takahashi</surname><given-names>M</given-names> </name></person-group><article-title>Performance of ChatGPT in board examinations for specialists in the Japanese ophthalmology 
society</article-title><source>Cureus</source><year>2023</year><month>12</month><volume>15</volume><issue>12</issue><fpage>e49903</fpage><pub-id pub-id-type="doi">10.7759/cureus.49903</pub-id><pub-id pub-id-type="medline">38174202</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Revercomb</surname><given-names>L</given-names> </name><name name-style="western"><surname>Patel</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Choudhry</surname><given-names>HS</given-names> </name><name name-style="western"><surname>Filimonov</surname><given-names>A</given-names> </name></person-group><article-title>Performance of ChatGPT in otolaryngology knowledge assessment</article-title><source>Am J Otolaryngol</source><year>2024</year><volume>45</volume><issue>1</issue><fpage>104082</fpage><pub-id pub-id-type="doi">10.1016/j.amjoto.2023.104082</pub-id><pub-id pub-id-type="medline">37862879</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ariyaratne</surname><given-names>S</given-names> </name><name name-style="western"><surname>Jenko</surname><given-names>N</given-names> </name><name name-style="western"><surname>Mark Davies</surname><given-names>A</given-names> </name><name name-style="western"><surname>Iyengar</surname><given-names>KP</given-names> </name><name name-style="western"><surname>Botchu</surname><given-names>R</given-names> </name></person-group><article-title>Could ChatGPT pass the UK radiology fellowship examinations?</article-title><source>Acad Radiol</source><year>2024</year><month>05</month><volume>31</volume><issue>5</issue><fpage>2178</fpage><lpage>2182</lpage><pub-id pub-id-type="doi">10.1016/j.acra.2023.11.026</pub-id><pub-id 
pub-id-type="medline">38160089</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ali</surname><given-names>R</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>OY</given-names> </name><name name-style="western"><surname>Connolly</surname><given-names>ID</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT and GPT-4 on neurosurgery written board examinations</article-title><source>Neurosurgery</source><year>2023</year><month>12</month><day>1</day><volume>93</volume><issue>6</issue><fpage>1353</fpage><lpage>1365</lpage><pub-id pub-id-type="doi">10.1227/neu.0000000000002632</pub-id><pub-id pub-id-type="medline">37581444</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schubert</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Wick</surname><given-names>W</given-names> </name><name name-style="western"><surname>Venkataramani</surname><given-names>V</given-names> </name></person-group><article-title>Performance of large language models on a neurology board-style examination</article-title><source>JAMA Netw Open</source><year>2023</year><month>12</month><day>1</day><volume>6</volume><issue>12</issue><fpage>e2346721</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.46721</pub-id><pub-id pub-id-type="medline">38060223</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>TC</given-names> </name><name name-style="western"><surname>Multala</surname><given-names>E</given-names> </name><name name-style="western"><surname>Kearns</surname><given-names>P</given-names> 
</name><etal/></person-group><article-title>Assessment of ChatGPT&#x2019;s performance on neurology written board examination questions</article-title><source>BMJ Neurol Open</source><year>2023</year><month>11</month><day>2</day><volume>5</volume><issue>2</issue><fpage>e000530</fpage><pub-id pub-id-type="doi">10.1136/bmjno-2023-000530</pub-id><pub-id pub-id-type="medline">37936648</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Seghier</surname><given-names>ML</given-names> </name></person-group><article-title>ChatGPT: not all languages are equal</article-title><source>Nature</source><year>2023</year><month>03</month><day>9</day><volume>615</volume><issue>7951</issue><fpage>216</fpage><lpage>216</lpage><pub-id pub-id-type="doi">10.1038/d41586-023-00680-3</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="web"><article-title>El espa&#x00F1;ol: una lengua viva informe [Article in Spanish]</article-title><source>Centro Virtual Cervantes</source><year>2023</year><access-date>2024-10-23</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://cvc.cervantes.es/lengua/anuario/anuario_23/informes_ic/p01.htm">https://cvc.cervantes.es/lengua/anuario/anuario_23/informes_ic/p01.htm</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Madrid-Garc&#x00ED;a</surname><given-names>A</given-names> </name><name name-style="western"><surname>Rosales-Rosado</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Freites-Nu&#x00F1;ez</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Harnessing ChatGPT and GPT-4 for evaluating the rheumatology questions of the Spanish access exam to specialized medical 
training</article-title><source>Sci Rep</source><year>2023</year><month>12</month><day>13</day><volume>13</volume><issue>1</issue><fpage>22129</fpage><pub-id pub-id-type="doi">10.1038/s41598-023-49483-6</pub-id><pub-id pub-id-type="medline">38092821</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alkaissi</surname><given-names>H</given-names> </name><name name-style="western"><surname>McFarlane</surname><given-names>SI</given-names> </name></person-group><article-title>Artificial hallucinations in ChatGPT: implications in scientific writing</article-title><source>Cureus</source><year>2023</year><month>02</month><volume>15</volume><issue>2</issue><fpage>e35179</fpage><pub-id pub-id-type="doi">10.7759/cureus.35179</pub-id><pub-id pub-id-type="medline">36811129</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Initial prompt for each question and scores of the 120 participating neurologists.</p><media xlink:href="mededu_v10i1e56762_app1.docx" xlink:title="DOCX File, 25 KB"/></supplementary-material></app-group></back></article>