<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id><journal-id journal-id-type="publisher-id">mededu</journal-id><journal-id journal-id-type="index">20</journal-id><journal-title>JMIR Medical Education</journal-title><abbrev-journal-title>JMIR Med Educ</abbrev-journal-title><issn pub-type="epub">2369-3762</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v12i1e75452</article-id><article-id pub-id-type="doi">10.2196/75452</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>GPT-4o and OpenAI o1 Performance on the 2024 Spanish Competitive Medical Specialty Access Examination: Cross-Sectional Quantitative Evaluation Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Benito</surname><given-names>Pau</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Isla-Jover</surname><given-names>Mikel</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Gonz&#x00E1;lez-Castro</surname><given-names>Pablo</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Fern&#x00E1;ndez Esparcia</surname><given-names>Pedro 
Jos&#x00E9;</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Carpio</surname><given-names>Manuel</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Blay-Sim&#x00F3;n</surname><given-names>Iv&#x00E1;n</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Guti&#x00E9;rrez-Bedia</surname><given-names>Pablo</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lapastora</surname><given-names>Maria J</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff8">8</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Carratal&#x00E1;</surname><given-names>Beatriz</given-names></name><degrees>LLB, MEd</degrees><xref ref-type="aff" rid="aff9">9</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Carazo-Casas</surname><given-names>Carlos</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff10">10</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Preventive Medicine and Epidemiology, Clinical Institute of Medicine and Dermatology (ICMiD), Hospital Cl&#x00ED;nic de Barcelona</institution><addr-line>Rossell&#x00F3;, 138, ground floor</addr-line><addr-line>Barcelona</addr-line><country>Spain</country></aff><aff id="aff2"><institution>Department of Radiology, Hospital de Cruces</institution><addr-line>Barakaldo</addr-line><country>Spain</country></aff><aff id="aff3"><institution>Department of Plastic and Reconstructive Surgery, Hospital Universitario Virgen del Roc&#x00ED;o</institution><addr-line>Sevilla</addr-line><country>Spain</country></aff><aff 
id="aff4"><institution>Department of Dermatology, Hospital Universitario Ram&#x00F3;n y Cajal</institution><addr-line>Madrid</addr-line><country>Spain</country></aff><aff id="aff5"><institution>Department of Endocrinology and Nutrition, Santa Luc&#x00ED;a University General Hospital</institution><addr-line>Cartagena</addr-line><country>Spain</country></aff><aff id="aff6"><institution>Department of Dermatology, Hospital Universitario Doctor Peset</institution><addr-line>Valencia</addr-line><country>Spain</country></aff><aff id="aff7"><institution>Department of Neurology, Hospital Cl&#x00ED;nico San Carlos</institution><addr-line>Madrid</addr-line><country>Spain</country></aff><aff id="aff8"><institution>Department of Intensive Care Medicine, Hospital Universitario 12 De Octubre</institution><addr-line>Madrid</addr-line><country>Spain</country></aff><aff id="aff9"><institution>Innovation and Digital Projects Academic Department, Healthcademia</institution><addr-line>Madrid</addr-line><country>Spain</country></aff><aff id="aff10"><institution>Department of Otolaryngology, Hospital Universitario Ram&#x00F3;n y Cajal</institution><addr-line>Madrid</addr-line><country>Spain</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Lesselroth</surname><given-names>Blake</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Pellegrino</surname><given-names>Raffaele</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Watari</surname><given-names>Takashi</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Pau Benito, MD, Department of Preventive Medicine and Epidemiology, Clinical Institute of Medicine and Dermatology (ICMiD), Hospital Cl&#x00ED;nic de Barcelona, Rossell&#x00F3;, 138, ground floor, Barcelona, 08036, Spain, 34 932 27 54 00 ext 4046; 
<email>pabenito@clinic.cat</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>12</day><month>1</month><year>2026</year></pub-date><volume>12</volume><elocation-id>e75452</elocation-id><history><date date-type="received"><day>03</day><month>04</month><year>2025</year></date><date date-type="rev-recd"><day>17</day><month>11</month><year>2025</year></date><date date-type="accepted"><day>25</day><month>11</month><year>2025</year></date></history><copyright-statement>&#x00A9; Pau Benito, Mikel Isla-Jover, Pablo Gonz&#x00E1;lez-Castro, Pedro Jos&#x00E9; Fern&#x00E1;ndez Esparcia, Manuel Carpio, Iv&#x00E1;n Blay-Sim&#x00F3;n, Pablo Guti&#x00E9;rrez-Bedia, Maria J Lapastora, Beatriz Carratal&#x00E1;, Carlos Carazo-Casas. Originally published in JMIR Medical Education (<ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org">https://mededu.jmir.org</ext-link>), 12.1.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org/">https://mededu.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mededu.jmir.org/2026/1/e75452"/><abstract><sec><title>Background</title><p>In recent years, generative artificial intelligence and large language models (LLMs) have rapidly advanced, offering significant potential to transform medical education. Several studies have evaluated the performance of chatbots on multiple-choice medical examinations.</p></sec><sec><title>Objective</title><p>The study aims to assess the performance of two LLMs&#x2014;GPT-4o and OpenAI o1&#x2014;on the <italic>M&#x00E9;dico Interno Residente</italic> (MIR) 2024 examination, the Spanish national medical test that determines eligibility for competitive medical specialist training positions.</p></sec><sec sec-type="methods"><title>Methods</title><p>A total of 176 questions from the MIR 2024 examination were analyzed. Each question was presented individually to the chatbots to ensure independence and prevent memory retention bias. No additional prompts were introduced to minimize potential bias. For each LLM, response consistency under verification prompting was assessed by systematically asking, &#x201C;Are you sure?&#x201D; after each response. Accuracy was defined as the percentage of correct responses compared to the official answers provided by the Spanish Ministry of Health. It was assessed for GPT-4o, OpenAI o1, and, as a benchmark, for a consensus of medical specialists and for the average MIR candidate. 
Subanalyses included performance across different medical subjects, question difficulty (quintiles based on the percentage of examinees correctly answering each question), and question types (clinical cases vs theoretical questions; positive vs negative questions).</p></sec><sec sec-type="results"><title>Results</title><p>Overall accuracy was 89.8% (158/176) for GPT-4o and 90.9% (160/176) after verification prompting, 92.6% (163/176) for OpenAI o1 and 93.2% (164/176) after verification prompting, 94.3% (166/176) for the consensus of medical specialists, and 56.6% (100/176) for the average MIR candidate. Both LLMs and the consensus of medical specialists outperformed the average MIR candidate across all 20 medical subjects analyzed, with &#x2265;80% LLMs&#x2019; accuracy in most domains. A performance gradient was observed: LLMs&#x2019; accuracy gradually declined as question difficulty increased. Slightly higher accuracy was observed for clinical cases compared to theoretical questions, as well as for positive questions compared to negative ones. Both models demonstrated high response consistency, with near-perfect agreement between initial responses and those after the verification prompting.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>These findings highlight the excellent performance of GPT-4o and OpenAI o1 on the MIR 2024 examination, demonstrating consistent accuracy across medical subjects and question types. The integration of LLMs into medical education presents promising opportunities and is likely to reshape how students prepare for licensing examinations and change our understanding of medical education. 
Further research should explore how the wording, language, prompting techniques, and image-based questions can influence LLMs&#x2019; accuracy, as well as evaluate the performance of emerging artificial intelligence models in similar assessments.</p></sec></abstract><kwd-group><kwd>accuracy</kwd><kwd>artificial intelligence</kwd><kwd>GPT-4o</kwd><kwd>large language models</kwd><kwd>medical education</kwd><kwd>medical examination</kwd><kwd>M&#x00E9;dico Interno Residente</kwd><kwd>MIR 2024 examination</kwd><kwd>OpenAI o1</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>The continuous developments in recent years have positioned generative artificial intelligence (AI) as a topic of paramount public and scientific interest. These developments have resulted in the creation of gradually more sophisticated and efficient large language models (LLMs) [<xref ref-type="bibr" rid="ref1">1</xref>].</p><p>Some of the most prominent examples are the increasingly advanced models derived from the GPT family, developed by OpenAI, which rely on deep neural networks [<xref ref-type="bibr" rid="ref2">2</xref>]. In 2024, OpenAI released 2 highly promising models. One of them was GPT-4o, launched in May 2024, a multimodal model capable of processing text and image inputs and generating text outputs in real time. GPT-4o stands out in terms of rapid response times and efficiency [<xref ref-type="bibr" rid="ref3">3</xref>]. The other one was OpenAI o1, launched in September 2024, a model only capable of processing and generating text, but trained with large-scale reinforcement learning (RL) to reason using chain of thought (CoT) and so possessing advanced reasoning capabilities, surpassing GPT-4o in competitive programming, mathematics, and scientific reasoning [<xref ref-type="bibr" rid="ref4">4</xref>]. 
Despite the absence of comprehensive benchmark sets providing consistent evidence, it is reasonable to expect that the presence of sophisticated built-in reasoning-optimized mechanisms in LLMs such as OpenAI o1&#x2014;trained with RL and CoT&#x2014;diminishes the relative impact of complex prompting strategies. In such cases, simple zero-shot prompting may prove more effective, or at least equally effective, compared to few-shot and chain-of-thought prompting [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>Chatbots for daily use have emerged to provide virtual assistance, personalized solutions, and task automation in a wide range of fields, including medical education, which has also embraced this trend [<xref ref-type="bibr" rid="ref6">6</xref>]. Chatbots can be used as a learning aid to improve clinical skills at the undergraduate, residency training, and postgraduate levels of continuous medical education [<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>There are several previous experiences evaluating the performance of chatbots answering multiple-choice questions [<xref ref-type="bibr" rid="ref7">7</xref>], including medical board examinations like the United States Medical Licensing Examination (USMLE) [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref10">10</xref>]. These assessments are helpful to understand the state of the art regarding LLMs&#x2019; performance in medical examinations. Furthermore, they pose questions about and provide insightful information to shape the content and characteristics of medical education and examinations.</p><p>In Spain, similarly to the USMLE, doctors are examined prior to the beginning of their specialized training. The test is called the <italic>M&#x00E9;dico Interno Residente</italic> (MIR) examination and consists of a 4.5-hour-long examination that includes 210 multiple-choice questions with 4 options and only 1 correct answer. 
It is held on a yearly basis. The examination serves a double purpose. On the one hand, it is used to rank physicians to assess their eligibility for competitive medical specialist training positions. On the other hand, it ensures minimum requirements are met among candidates.</p><p>Evidence suggests a strong correlation between LLM performance across different input languages and the representativeness of each language in the pre-training corpus, a relationship that extends to retrieval-augmented generation LLMs [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. To our knowledge, no study has evaluated GPT-4o&#x2019;s performance on the Spanish MIR examination to date and, more importantly, no study has compared it to OpenAI o1, which possesses enhanced reasoning capabilities that could potentially be an advantage when taking the MIR examination [<xref ref-type="bibr" rid="ref4">4</xref>]. In a broader sense, there are few published studies that have evaluated the performance of LLMs when responding to medical questions in the Spanish language. Among them, the study by Guillen-Grima et al [<xref ref-type="bibr" rid="ref13">13</xref>] reported a remarkable accuracy rate of 87% for GPT-4 on the 2022 MIR examination, while the study by Flores-Cohaila et al [<xref ref-type="bibr" rid="ref14">14</xref>] showed an accuracy rate of 86% for GPT-4 on the 2022 Peruvian National Licensing Medical Examination. 
Other studies posing questions in Spanish from specific medical subjects have shown similar results, with performance rates of 83.7% for GPT-4o in anesthesiology [<xref ref-type="bibr" rid="ref15">15</xref>] and 93.7% for GPT-4 in rheumatology [<xref ref-type="bibr" rid="ref16">16</xref>].</p><p>The primary aim of this study is to assess the performance of GPT-4o and OpenAI o1 LLMs in passing the MIR examination and to compare them with the expert consensus from instructors of one of the largest MIR preparation academies (Academia AMIR) and the students&#x2019; mean results. The secondary aim of this study is to compare the performance of GPT-4o, OpenAI o1, expert consensus from AMIR instructors and students by medical subjects, question difficulty, and type of question (clinical case vs theoretical question and positive vs negative question) to better characterize AI chatbots&#x2019; capabilities, limitations, strengths, and weaknesses.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>This is a cross-sectional study assessing the performance of 2 LLMs (GPT-4o and OpenAI o1) in answering the MIR 2024 examination questions. The study compares the models&#x2019; performance against each other and against specifically trained humans (expert consensus from AMIR instructors and the mean results from MIR 2024 examination candidates).</p></sec><sec id="s2-2"><title>MIR 2024 Examination</title><p>In Spain, there are 46 medical specialties, each requiring a specific training period of 4 to 5 years as a resident physician (MIR) in an accredited health care institution. Access to each specialty training spot depends on the national ranking of candidates. 
The ranking is based on a final grade which comes from the MIR examination score (90%) and the candidate&#x2019;s academic record (10%) [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>].</p><p>A total of 15,114 candidates were admitted to the MIR 2024 examination, of whom 13,711 sat for the test, competing for 9007 specialty positions available in accredited healthcare institutions across Spain [<xref ref-type="bibr" rid="ref18">18</xref>].</p><p>The examination, held on January 25, 2025, consisted of 200 multiple-choice questions, each with 4 answer choices, with only 1 correct option. The first 25 questions included linked images that were part of the questions&#x2019; content and could help or be necessary to answer them. Additionally, 10 reserve questions were included to replace any disputed questions due to typographical errors, ambiguous wording, or issues with multiple or missing correct answers. Participants were given 4 hours and 30 minutes to complete the examination [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>].</p><p>As a safeguard against academic misconduct (ie, cheating), the MIR examination is administered in several different versions each year. Each version comprises an identical question set with a varied sequence. Version 0 is established as the canonical version for scoring and for the publication of the official answer key.</p><p>Version 0 of the MIR 2024 examination was obtained from the Spanish Ministry of Health website [<xref ref-type="bibr" rid="ref19">19</xref>]. In the final analysis, the 25 questions requiring image interpretation were excluded since one of the LLMs evaluated (OpenAI o1) does not accept image inputs. This decision aimed to ensure fair comparability across all study arms, as providing images only to human participants and the other LLM (GPT-4o) would have introduced a systematic advantage for them. 
The 5 questions whose objections were accepted by the Spanish Ministry of Health were also excluded from the final analysis in order to approximate the real examination as closely as possible (the Spanish Ministry of Health accepted the objection for 6 questions but one of them was linked to an image, so it was already eliminated from the analysis). Nonetheless, the performance of the 4 study arms on these questions is also reported and discussed later in the study. Of the reserve questions, the 4 questions that did not replace any challenged question were also discarded from the analysis. Final analysis included 176 questions.</p></sec><sec id="s2-3"><title>Study Arms</title><p>This study compares the 4 distinct arms as follows: GPT-4o, OpenAI o1, expert consensus from AMIR instructors (henceforth &#x201C;AMIR consensus&#x201D;), and mean results of the MIR 2024 examination candidates (henceforth &#x201C;students&#x201D;).</p><sec id="s2-3-1"><title>GPT-4o</title><p>This model, OpenAI&#x2019;s flagship in 2024, is characterized by rapid response times and efficiency. Although it is a multimodal model, in this study, only its text processing and generation capabilities were used. Image-linked questions were excluded to ensure comparability with OpenAI o1 model.</p></sec><sec id="s2-3-2"><title>OpenAI o1</title><p>Designed to use large-scale RL and CoT reasoning, this model exhibits advanced reasoning capabilities, which could be particularly useful for answering questions in the MIR examination.</p></sec><sec id="s2-3-3"><title>AMIR Consensus</title><p>Academia AMIR is a private, for-profit educational company operating in Spain, Portugal, and several Latin American countries, providing postgraduate health sciences training. Its core activities include preparing candidates for official examinations such as the MIR examination. The company employs faculty members who deliver these courses but are independent from the MIR examination process. 
They neither contribute to the examination&#x2019;s development nor belong to any public organization involved in its preparation. A panel of these faculty&#x2014;at least 2 per medical specialty&#x2014;collaboratively answered the entire MIR 2024 examination after its administration and official content release by the Spanish Ministry of Health. Meeting in a hybrid format (combining in-person and remote participation), they established a consensus through discussion answering within 4.5 hours, mirroring the time allotted to candidates. The goal was to give candidates prompt performance feedback before the official answer key was published. Therefore, the faculty had unrestricted access to textbooks, scientific literature, and LLMs, though reported use was minimal and reserved for clarifying ambiguous questions prior to group consensus. This process produced an expert consensus answer list for the MIR 2024 examination.</p></sec><sec id="s2-3-4"><title>Students</title><p>Candidates who took the examination were encouraged to submit their answer templates to EstimAMIR, an online platform developed by Academia AMIR. This platform provides students with a preliminary assessment of their results, initially using &#x201C;AMIR consensus&#x201D; answers and later incorporating the provisional and definitive correct answers published by the Spanish Ministry of Health. The platform also estimated each student&#x2019;s ranking position based on the sample of candidates available. The mean results of the &#x201C;students&#x201D;, as well as the percentage of correct answers for each question, were obtained from this platform (based on 5066 answer templates submitted). All data were appropriately anonymized and aggregated.</p></sec></sec><sec id="s2-4"><title>Data Collection</title><p>The 185 text-based questions from the MIR 2024 examination were collected and transcribed verbatim in Spanish into the dialogue interface of both GPT-4o and OpenAI o1. 
A ChatGPT Plus license was used to access the GPT-4o and OpenAI o1. The models were used with their default settings, with no modifications to parameters such as temperature or output variation. Each multiple-choice question was followed by the 4 possible answer choices (1, 2, 3, and 4), which were manually entered and separated by single spaces. No pretraining or standardized instructions were provided, adhering strictly to a zero-shot prompting approach to minimize potential bias. Henceforth, the results generated using this prompt will be designated as the first iteration results.</p><p>Questions were presented to the chatbots individually, with a new dialogue initiated for each question to ensure independence and prevent memory retention bias. To assess response consistency, chatbots were systematically challenged with the verification prompt &#x201C;Are you sure?&#x201D; after each answer, which served as a single CoT prompt. Hereafter, we referred to the results obtained with this prompt as the second iteration results. For GPT-4o, internet access was disabled during testing.</p><p>All responses were recorded in a spreadsheet. Once the definitive official answers were published by the Spanish Ministry of Health, any challenged or unused reserve questions were excluded from the final analysis.</p></sec><sec id="s2-5"><title>Main Endpoint and Additional Analysis</title><sec id="s2-5-1"><title>Overview</title><p>The primary endpoint was the percentage of correct answers per study arm. The definitive official answers published by the Spanish Ministry of Health served as the gold standard for determining accuracy within each study arm. The secondary endpoints included comparisons of study arm performance based on medical subjects, question difficulty, and question type (clinical case vs theoretical question and positive vs negative question). 
Categorization was conducted as described below.</p></sec><sec id="s2-5-2"><title>Medical Subject</title><p>Questions were classified into the following categories: gastroenterology and general surgery, endocrinology, infectious diseases and microbiology, miscellaneous and basic sciences, neurology and neurosurgery, cardiology and cardiovascular surgery, gynecology and obstetrics, orthopedic surgery, pediatrics, nephrology, respiratory medicine and thoracic surgery, rheumatology, hematology, psychiatry, immunology, urology, dermatology, ophthalmology, otorhinolaryngology, and statistics and epidemiology.</p></sec><sec id="s2-5-3"><title>Question Difficulty</title><p>Difficulty was categorized based on the percentage of examinees who correctly answered each question, using data from EstimAMIR (very difficult: 0%&#x2010;20% correct responses; difficult: 21%&#x2010;40% correct responses; intermediate: 41%&#x2010;60% correct responses; easy: 61%&#x2010;80% correct responses; very easy: 81%&#x2010;100% correct responses).</p></sec><sec id="s2-5-4"><title>Theoretical Question Versus Clinical Case</title><p>Questions were classified as theoretical (requiring a direct answer based exclusively on theoretical knowledge) and clinical case (presenting a clinical scenario from which the possible answers emerged).</p></sec><sec id="s2-5-5"><title>Positive Versus Negative Questions</title><p>Questions were classified based on whether they asked for the correct answer (or the next appropriate step) or the incorrect answer (or the step that should not be taken).</p></sec></sec><sec id="s2-6"><title>Statistical Analysis</title><p>Comparisons between study arms were performed using chi-squared tests or the Fisher exact test, where applicable. The Benjamini-Hochberg method was applied to statistically adjust for multiple comparisons. Differences between groups were considered statistically significant if <italic>P</italic>&#x003C;.05. 
We assessed the consistency of responses from GPT-4o and OpenAI o1 to a verification prompt (&#x201C;Are you sure?&#x201D;). Consistency was measured using both the simple agreement percentage and the Cohen &#x03BA; coefficient between the answers provided before and after the prompt. All statistical analyses were performed using <italic>R</italic> version 4.4.1 (R Foundation for Statistical Computing).</p></sec><sec id="s2-7"><title>Ethical Considerations</title><p>In the absence of a formal ethics committee at Academia AMIR, an ad hoc data ethical oversight panel approved a comprehensive data use protocol (internal reference: AMIR-ETH-2025-11-05-v1.0). At enrollment, all students provided data-use consent, acknowledging that their irreversibly anonymized and aggregated data could be used by Academia AMIR for statistical, commercial, educational, research, and product improvement purposes. The panel determined that the study qualified for exemption from institutional review board approval, as it involved secondary data that were aggregated, processed without human intervention, and contained no identifiable information, in accordance with principles of risk proportionality and data minimization. Participant privacy and confidentiality were safeguarded through irreversible anonymization and aggregation prior to investigator access, data minimization, role-based access controls, and encryption of data both in transit and at rest within corporate repositories. 
No participants received compensation for their participation in the study.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Global Performance</title><p>A flowchart of the exclusion criteria for the question selection process is shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Flowchart of the exclusion criteria for the question selection process.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v12i1e75452_fig01.png"/></fig><p><xref ref-type="table" rid="table1">Table 1</xref> presents the accuracy of GPT-4o, OpenAI o1, the AMIR consensus, and students when taking the entire examination, excluding the discarded questions. GPT-4o achieved an accuracy of 89.8% (158/176) in the first iteration, which slightly increased to 90.9% (160/176) in the second iteration. Similarly, OpenAI o1&#x2019;s accuracy was 92.6% (163/176) in the first iteration and improved to 93.2% (164/176) in the second iteration. The AMIR instructors&#x2019; consensus obtained the highest score among the study arms, with 94.3% (166/176). 
The mean score of the EstimAMIR-submitted templates was 56.6% (100/176).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Global performance of GPT-4o (first and second iteration), OpenAI o1 (first and second iteration), AMIR consensus, and students.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="2">Absolute and relative performance</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Correct answers, n (%)</td><td align="left" valign="top">Incorrect answers, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="3">GPT-4o</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>First iteration</td><td align="left" valign="top">158 (89.8)</td><td align="left" valign="top">18 (10.2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Second iteration</td><td align="left" valign="top">160 (90.9)</td><td align="left" valign="top">16 (9.1)</td></tr><tr><td align="left" valign="top" colspan="3">OpenAI o1</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>First iteration</td><td align="left" valign="top">163 (92.6)</td><td align="left" valign="top">13 (7.4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Second iteration</td><td align="left" valign="top">164 (93.2)</td><td align="left" valign="top">12 (6.8)</td></tr><tr><td align="left" valign="top">AMIR consensus</td><td align="left" valign="top">166 (94.3)</td><td align="left" valign="top">10 (5.7)</td></tr><tr><td align="left" valign="top">Students</td><td align="left" valign="top">100 (56.6)</td><td align="left" valign="top">76 
(43.4)</td></tr></tbody></table></table-wrap><p><xref ref-type="fig" rid="figure2">Figure 2</xref> compares the accuracy of GPT-4o and OpenAI o1 in their second iterations, along with the AMIR consensus and students. GPT-4o, OpenAI o1, and the AMIR consensus achieved significantly higher accuracy scores than the average student (in all cases <italic>P</italic>&#x003C;.001); however, differences between these 3 arms did not reach statistical significance (<italic>P</italic>=.22 for GPT-4o vs OpenAI o1; <italic>P</italic>=.07 for GPT-4o vs AMIR consensus; <italic>P</italic>=.75 for OpenAI o1 vs AMIR consensus).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Global performance of GPT-4o (2nd iteration), OpenAI o1 (2nd iteration), AMIR consensus, and students.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v12i1e75452_fig02.png"/></fig><p>From the final analysis, 5 challenged questions were excluded. On these items, GPT-4o achieved an accuracy of 100% (5/5) across both iterations. OpenAI o1 scored 80% (4/5) in both iterations, without modifying any of its responses. The AMIR consensus achieved an accuracy of 40% (2/5). When restricted to these 5 questions, the students&#x2019; mean score was 31.3% (SD 16.7%).</p></sec><sec id="s3-2"><title>Medical Subjects</title><p>The heatmap in <xref ref-type="fig" rid="figure3">Figure 3</xref> presents a comparative analysis of study arm performance across different medical subjects. 
Both LLMs (GPT-4o and OpenAI o1, in their second iterations) and the AMIR consensus outperformed the average student in all 20 medical subjects analyzed.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Heatmap comparing the performance of generative pre-trained transformer 4o (2nd iteration), OpenAI o1 (2nd iteration), and AMIR consensus and students by medical subject.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v12i1e75452_fig03.png"/></fig><p>GPT-4o achieved an accuracy below 80% in only 3 subjects: infectious diseases and microbiology (11/14, 79%), pediatrics (7/10, 70%), and statistics and epidemiology (3/4, 75%). OpenAI o1 fell below 80% accuracy only in statistics and epidemiology, with a single error (3/4, 75%). The AMIR consensus exhibited an accuracy lower than 80% in psychiatry (4/6, 67%). Average student performance, based on 5066 EstimAMIR-submitted templates, ranged from 44.1% in miscellaneous and basic sciences to 71.6% in immunology.</p></sec><sec id="s3-3"><title>Question Difficulty</title><p>As shown in <xref ref-type="table" rid="table2">Table 2</xref> and <xref ref-type="fig" rid="figure4">Figure 4</xref> the performance of GPT-4o, OpenAI o1, and the AMIR consensus across quintiles of question difficulty, defined based on student performance. A statistically significant gradient is observed among these 3 study arms, with accuracy decreasing as question difficulty increases (crude <italic>P</italic> values: GPT-4o, <italic>P</italic>=.003; OpenAI o1, <italic>P</italic>=.04; AMIR consensus, <italic>P</italic>=.008; Benjamini-Hochberg adjusted <italic>P</italic> values: GPT-4o, <italic>P</italic>=.03; OpenAI o1, <italic>P</italic>=.10&#x2014;the only case not reaching statistical significance; AMIR consensus, <italic>P</italic>=.03). The decline in performance is particularly pronounced in the highest difficulty quintile. 
Differences between study arms within each difficulty quintile do not reach statistical significance.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Absolute and relative performance of GPT-4o (second iteration), OpenAI o1 (second iteration), and AMIR consensus by quintiles of question difficulty defined by students&#x2019; performance.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Question difficulty</td><td align="left" valign="bottom">Number of questions</td><td align="left" valign="bottom">GPT-4o, n (%)</td><td align="left" valign="bottom">OpenAI o1, n (%)</td><td align="left" valign="bottom">AMIR consensus, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Very easy</td><td align="left" valign="top">32</td><td align="left" valign="top">32 (100)</td><td align="left" valign="top">32 (100)</td><td align="left" valign="top">32 (100)</td></tr><tr><td align="left" valign="top">Easy</td><td align="left" valign="top">56</td><td align="left" valign="top">53 (95)</td><td align="left" valign="top">54 (96)</td><td align="left" valign="top">53 (95)</td></tr><tr><td align="left" valign="top">Intermediate</td><td align="left" valign="top">39</td><td align="left" valign="top">35 (90)</td><td align="left" valign="top">36 (92)</td><td align="left" valign="top">38 (97)</td></tr><tr><td align="left" valign="top">Difficult</td><td align="left" valign="top">36</td><td align="left" valign="top">32 (89)</td><td align="left" valign="top">32 (89)</td><td align="left" valign="top">34 (94)</td></tr><tr><td align="left" valign="top">Very difficult</td><td align="left" valign="top">13</td><td align="left" valign="top">8 (62)</td><td align="left" valign="top">10 (77)</td><td align="left" valign="top">9 (69)</td></tr><tr><td align="left" valign="top">Total</td><td align="left" valign="top">176</td><td align="left" valign="top">160 (90.9)</td><td align="left" valign="top">164 (93.2)</td><td 
align="left" valign="top">166 (94.3)</td></tr></tbody></table></table-wrap><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Performance of GPT-4o (2nd iteration), OpenAI o1 (2nd iteration), and AMIR consensus by quintiles of question difficulty defined by students&#x2019; performance.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v12i1e75452_fig04.png"/></fig></sec><sec id="s3-4"><title>Clinical Cases versus Theoretical Questions</title><p>As shown in <xref ref-type="table" rid="table3">Table 3</xref> the performance of GPT-4o, OpenAI o1, the AMIR consensus, and students when answering clinical cases versus theoretical questions. Overall, a slightly higher accuracy was observed for clinical cases compared to theoretical questions, although these differences do not reach statistical significance in any study arm. Statistically significant differences between study arms both for clinical cases and theoretical questions were observed only when the students' arm was included in the analysis.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Performance of GPT-4o (second iteration), OpenAI o1 (second iteration), AMIR consensus, and students by questions being clinical cases or theoretical questions.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Clinical cases (n=105), n (%)</td><td align="left" valign="bottom">Theoretical questions (n=71), n (%)</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">GPT-4o</td><td align="char" char="." valign="top">98 (93.3)</td><td align="char" char="." valign="top">62 (87)</td><td align="char" char="." valign="top">.55</td></tr><tr><td align="left" valign="top">OpenAI o1</td><td align="char" char="." valign="top">99 (94.2)</td><td align="char" char="." 
valign="top">65 (92)</td><td align="char" char="." valign="top">.68</td></tr><tr><td align="left" valign="top">AMIR consensus</td><td align="char" char="." valign="top">99 (94.2)</td><td align="char" char="." valign="top">67 (94)</td><td align="char" char="." valign="top">.99</td></tr><tr><td align="left" valign="top">Students</td><td align="char" char="." valign="top">62 (59)</td><td align="char" char="." valign="top">38 (54)</td><td align="char" char="." valign="top">.68</td></tr></tbody></table></table-wrap></sec><sec id="s3-5"><title>Positive Versus Negative Questions</title><p><xref ref-type="table" rid="table4">Table 4</xref> shows the performance of GPT-4o, OpenAI o1, the AMIR consensus, and students when answering positive versus negative questions. Overall, accuracy was higher for positive questions than for negative ones, with the difference reaching statistical significance only for GPT-4o (<italic>P</italic>=.01). Statistically significant differences between study arms for both positive and negative questions are observed only when the students' arm is included in the analysis.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Performance of GPT-4o (second iteration), OpenAI o1 (second iteration), AMIR consensus, and students by questions being positive or negative.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Positive questions (n=140), n (%)</td><td align="left" valign="bottom">Negative questions (n=36), n (%)</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">GPT-4o</td><td align="left" valign="top">132 (94.2)</td><td align="left" valign="top">28 (78)</td><td align="left" valign="top">.01</td></tr><tr><td align="left" valign="top">OpenAI o1</td><td align="left" valign="top">132 (94.2)</td><td align="left" valign="top">32 (89)</td><td align="left" 
valign="top">.40</td></tr><tr><td align="left" valign="top">AMIR consensus</td><td align="left" valign="top">132 (94.2)</td><td align="left" valign="top">34 (94)</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">Students</td><td align="left" valign="top">82 (58.5)</td><td align="left" valign="top">18 (49)</td><td align="left" valign="top">.55</td></tr></tbody></table></table-wrap></sec><sec id="s3-6"><title>Response Consistency</title><p>Response consistency was assessed using the simple agreement percentage between the first and second iterations of GPT-4o (172/176, 97.7%) and OpenAI o1 (170/176, 96.6%). The Cohen &#x03BA; coefficient was 0.97 for GPT-4o and 0.95 for OpenAI o1, indicating almost perfect agreement (<italic>P</italic>&#x003C;.001 in both cases).</p><p>When analyzing individually the questions in which there was no concordance between the first and second iterations, it was observed that, for GPT-4o, 4 initially incorrect responses were modified: in 2 cases, the second response was also incorrect, while in the other 2 cases, the second response became correct. For OpenAI o1, 5 initially incorrect responses were modified: in 3 cases, the second response was again incorrect, and in 2 cases, the second response became correct. In addition, 1 initially correct response was modified, with the second response becoming incorrect.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>This study highlights the exceptional performance of both LLMs analyzed&#x2014;GPT-4o and OpenAI o1&#x2014;on the MIR 2024 examination. Both models achieved or exceeded a 90% accuracy rate, significantly outperforming the average human candidate as well as the top 10% of examinees [<xref ref-type="bibr" rid="ref18">18</xref>]. The expert consensus from AMIR instructors yielded even higher accuracy. 
Although this result should be interpreted in the context of unrestricted access to textbooks, scientific literature, and AI tools such as GPT, the reported use of these resources was minimal and reserved for clarifying ambiguous questions, never substituting for the group discussion and consensus process for each item. Results from the expert consensus suggest the added value of human expertise when synergistically combined with AI capabilities.</p><p>The challenged questions were excluded from the final analysis to remain faithful to the actual examination. Upon examination, these proved to be difficult items (on average, candidates answered them correctly in 31.3% of cases, compared with 56.6% for the other questions), which the LLMs managed more accurately than the human experts (5/5 for GPT-4o, 4/5 for OpenAI o1, and 2/5 for AMIR instructors).</p><p>These results were consistent across the different medical subjects analyzed. Interestingly, when question difficulty was assessed based on human performance, a similar trend was observed in the LLMs, with accuracy decreasing as question difficulty increased. Additionally, a slightly higher accuracy was observed for clinical cases compared to theoretical questions, as well as for positive questions compared to negative ones. This resemblance to human reasoning and performance could be rooted in the input used to train LLMs.</p><p>Both GPT-4o and OpenAI o1 demonstrated great consistency in their answers, with statistically significant near-perfect agreement between the first and second iterations. Furthermore, it is particularly interesting to note that, of the 10 responses that were altered between the first and second iterations (4 for GPT-4o and 6 for OpenAI o1), 9 were initially incorrect (with 4 of these changing to a correct response), and only 1 initially correct response was changed to an incorrect one. 
It is remarkable in favor of these LLMs that, considering their exceptional accuracy in the first iteration, the few changes occurring in the second iteration almost exclusively involved some of the few initially incorrect responses.</p></sec><sec id="s4-2"><title>Comparison With Prior Work</title><p>Several previous studies have evaluated the performance of GPT-3.5 and GPT-4 across different medical disciplines, as well as on national medical board examinations [<xref ref-type="bibr" rid="ref7">7</xref>]. For instance, Gilson et al assessed ChatGPT&#x2019;s performance on various sets of USMLE step 1 and step 2 questions, reporting an accuracy range of 42% to 64% [<xref ref-type="bibr" rid="ref8">8</xref>], which aligns with another study that found a 56% accuracy rate on a set of USMLE step 1-style questions [<xref ref-type="bibr" rid="ref9">9</xref>]. Knoedler et al examined ChatGPT-3.5 and ChatGPT-4 on USMLE step 3 questions, reporting 57% accuracy for GPT-3.5 and 85% for GPT-4 [<xref ref-type="bibr" rid="ref10">10</xref>]. Takagi et al evaluated these models on the Japanese Medical Licensing Examination, finding 51% accuracy for GPT-3.5 and 80% for GPT-4 [<xref ref-type="bibr" rid="ref20">20</xref>]. Meyer et al conducted a similar study on the written German medical licensing examination, with accuracy rates of 58% for GPT-3.5 and 85% for GPT-4 [<xref ref-type="bibr" rid="ref21">21</xref>]. A study by Prazeres [<xref ref-type="bibr" rid="ref22">22</xref>] on the Portuguese national examination for access to specialized training reported 54% accuracy for GPT-3.5 turbo and 65% for GPT-4o mini. Guillen-Grima et al [<xref ref-type="bibr" rid="ref13">13</xref>] published the perhaps most comparable study, as they compared GPT-3.5 and GPT-4 on the MIR 2022 examination. Accuracy rates were 63% for GPT-3.5 and 87% for GPT-4. 
Interestingly, our results show the highest accuracy for LLMs among all the aforementioned studies, almost matching the consensus from expert human instructors. This may be a result of the gradual development of LLMs with time. It poses relevant questions regarding how medical education and examinations should be shaped in the future, both in terms of content and the skills that are underscored. As stated in a previous editorial, these results make it important to consider the necessity of more emphasis on soft skills and critical thinking rather than plain memorization [<xref ref-type="bibr" rid="ref23">23</xref>].</p></sec><sec id="s4-3"><title>Strengths</title><p>This study offers new insights into the accuracy of GPT-4o and OpenAI o1 in a national medical specialty access examination. To date, the only comparable research published in an indexed journal that we have identified is a recent study by Liu et al [<xref ref-type="bibr" rid="ref24">24</xref>], which evaluated GPT-4o&#x2019;s performance on the Japanese national medical examination, reporting an accuracy of 89%.</p><p>Moreover, this study reinforces the trend that each newly developed LLM exhibits improved accuracy compared to its predecessors. Additionally, our secondary analysis proved that the accuracy of both GPT-4o and OpenAI o1 aligned with difficulty levels of questions based on human candidates&#x2019; performance.</p></sec><sec id="s4-4"><title>Limitations</title><p>This study has several limitations. First, increasing the sample size&#x2014;that is, including a larger number of questions in the analysis&#x2014;would have provided a more robust insight into the different subanalyses performed. For instance, it would have allowed us to investigate whether performance differences of LLMs compared to human experts&#x2014;such as those suggested in unique medical subjects like psychiatry&#x2014;are truly meaningful or just the results of random variation. 
Second, additional secondary analyses could have been of interest, such as examining the relationship between the number of words or characters in each question and the performance of LLMs; the influence of specific expressions or wording styles on model accuracy; the impact of different languages on performance; and the effect of alternative prompting formulas on accuracy. Third, although image-based questions are part of the MIR examination, they were not included in this study because OpenAI o1 does not support image inputs, and fair comparability between LLMs was prioritized. This decision reduces methodological bias&#x2014;LLMs are not artificially penalized for lacking multimodal capabilities&#x2014;and increases the internal validity of between-arms comparisons. However, it may reduce the representativeness of the performance evaluation and limit the generalizability of our findings to the actual test setting, where visual interpretation is an integral component of clinical reasoning. Previous studies suggest LLMs may exhibit reasonable performance on image-based questions even without access to the image itself [<xref ref-type="bibr" rid="ref25">25</xref>]. Fourth, the AMIR consensus may not represent a pure benchmark of human expert knowledge, as experts had access to textbooks and generative AI. However, the faculty use of these resources was minimal and strictly advisory, with all final answers determined by expert discussion and consensus, indicating that the potential for significant bias was low. 
Fifth, the field of LLMs is continuously evolving, and several new models have been released in recent months that were not analyzed in this study, including GPT-4.5 by OpenAI [<xref ref-type="bibr" rid="ref26">26</xref>], DeepSeek-R1 by DeepSeek-AI [<xref ref-type="bibr" rid="ref27">27</xref>], Qwen 2.5 by Alibaba [<xref ref-type="bibr" rid="ref28">28</xref>], LlaMa 3.2 by Meta AI [<xref ref-type="bibr" rid="ref29">29</xref>], and Claude 3.7 Sonnet by Anthropic [<xref ref-type="bibr" rid="ref30">30</xref>], among others. Sixth, student results were self-reported, which could be a source of bias. Finally, caution should be exercised when generalizing LLM accuracy on the MIR examination to other national medical licensing examinations or to different fields and tasks within medical education.</p></sec><sec id="s4-5"><title>Conclusions</title><p>This study highlights the excellent performance of the two analyzed LLMs&#x2014;GPT-4o and OpenAI o1&#x2014;on the MIR 2024 examination, demonstrating strong consistency across different medical subjects and types of questions, as well as between first and second iterations.</p><p>The integration of LLMs into medical education is promising and likely to revolutionize the field and change our understanding of medical education. Further research is needed to explore how wording, language, prompting techniques, and image-based questions influence LLM accuracy in national medical licensing examinations, as well as to assess the performance of other emerging models. More research is also needed to better understand the potential usefulness of these tools as learning assistants in broader educational contexts.</p></sec></sec></body><back><ack><p>The authors thank the students who entered their MIR examination templates into the EstimAMIR application, which provided a benchmark for assessing the performance of the LLMs evaluated in this study. 
The authors also appreciate the interest of the AMIR Academy faculty in this study, especially the instructors, whose responses to the MIR 2024 examination questions allowed us to design the comparison group called AMIR consensus.</p></ack><notes><sec><title>Funding</title><p>The expenses associated with the preparation and publication of this study were funded by Healthcademia.</p></sec><sec><title>Data Availability</title><p>The data from this study are available upon reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: CCC (lead); PB and MIJ (equal)</p><p>Data curation: PB and CCC (lead); MIJ, PGC, PJFE, MCP, IBS, PGB, MJL, and BCO (equal)</p><p>Formal analysis: PB (lead), MIJ (supporting)</p><p>Project administration: CCC (lead), PB (supporting)</p><p>Writing &#x2013; original draft: PB</p><p>Writing &#x2013; review &#x0026; editing: PB, CCC, MIJ, PGC, PJFE, MCP, IBS, PGB, MJL, BCO (equal)</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>ABBREVIATIONS</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">CoT</term><def><p>chain of thought</p></def></def-item><def-item><term id="abb3">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb4">MIR</term><def><p>M&#x00E9;dico Interno Residente</p></def></def-item><def-item><term id="abb5">RL</term><def><p>reinforcement learning</p></def></def-item><def-item><term id="abb6">USMLE</term><def><p>United States medical licensing examination</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhao</surname><given-names>WX</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>T</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Hou</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>A survey of large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 31, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.18223</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Radford</surname><given-names>A</given-names> </name><name name-style="western"><surname>Narasimhan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Salimans</surname><given-names>T</given-names> </name><name name-style="western"><surname>Sutskever</surname><given-names>I</given-names> </name></person-group><article-title>Improving language understanding by generative pre-training</article-title><year>2018</year><access-date>2025-12-10</access-date><publisher-name>OpenAI</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf">https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>OpenAI</collab><name name-style="western"><surname>Achiam</surname><given-names>J</given-names> </name><name name-style="western"><surname>Adler</surname><given-names>S</given-names> </name><name name-style="western"><surname>Agarwal</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Ahmad</surname><given-names>L</given-names> </name><name name-style="western"><surname>Akkaya</surname><given-names>I</given-names> </name><etal/></person-group><article-title>GPT-4 technical report</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 15, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.08774</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>OpenAI</collab><name name-style="western"><surname>Jaech</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kalai</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lerer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Richardson</surname><given-names>A</given-names> </name><name name-style="western"><surname>El-Kishky</surname><given-names>A</given-names> </name><etal/></person-group><article-title>OpenAI o1 system card</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 21, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2412.16720</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>G</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Ye</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Do advanced language models eliminate the need for prompt engineering in software engineering?</article-title><source>ACM Trans Softw Eng Methodol</source><year>2025</year><pub-id pub-id-type="doi">10.1145/3771933</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Caldarini</surname><given-names>G</given-names> </name><name name-style="western"><surname>Jaf</surname><given-names>S</given-names> </name><name name-style="western"><surname>McGarry</surname><given-names>K</given-names> </name></person-group><article-title>A literature survey of recent advances in chatbots</article-title><source>Information</source><year>2022</year><volume>13</volume><issue>1</issue><fpage>41</fpage><pub-id pub-id-type="doi">10.3390/info13010041</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hallquist</surname><given-names>E</given-names> </name><name name-style="western"><surname>Gupta</surname><given-names>I</given-names> </name><name name-style="western"><surname>Montalbano</surname><given-names>M</given-names> </name><name name-style="western"><surname>Loukas</surname><given-names>M</given-names> </name></person-group><article-title>Applications of artificial intelligence in medical education: a systematic review</article-title><source>Cureus</source><year>2025</year><month>03</month><volume>17</volume><issue>3</issue><fpage>e79878</fpage><pub-id pub-id-type="doi">10.7759/cureus.79878</pub-id><pub-id pub-id-type="medline">40034416</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gilson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Safranek</surname><given-names>CW</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>T</given-names> </name><etal/></person-group><article-title>How does ChatGPT perform on the United States Medical Licensing Examination (USMLE)? 
The implications of large language models for medical education and knowledge assessment</article-title><source>JMIR Med Educ</source><year>2023</year><month>02</month><day>8</day><volume>9</volume><fpage>e45312</fpage><pub-id pub-id-type="doi">10.2196/45312</pub-id><pub-id pub-id-type="medline">36753318</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Knoedler</surname><given-names>L</given-names> </name><name name-style="western"><surname>Knoedler</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hoch</surname><given-names>CC</given-names> </name><etal/></person-group><article-title>In-depth analysis of ChatGPT&#x2019;s performance based on specific signaling words and phrases in the question stem of 2377 USMLE step 1 style questions</article-title><source>Sci Rep</source><year>2024</year><month>06</month><day>12</day><volume>14</volume><issue>1</issue><fpage>13553</fpage><pub-id pub-id-type="doi">10.1038/s41598-024-63997-7</pub-id><pub-id pub-id-type="medline">38866891</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Knoedler</surname><given-names>L</given-names> </name><name name-style="western"><surname>Alfertshofer</surname><given-names>M</given-names> </name><name name-style="western"><surname>Knoedler</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Pure wisdom or Potemkin villages? 
A comparison of ChatGPT 3.5 and ChatGPT 4 on USMLE step 3 style questions: quantitative analysis</article-title><source>JMIR Med Educ</source><year>2024</year><month>01</month><day>5</day><volume>10</volume><fpage>e51148</fpage><pub-id pub-id-type="doi">10.2196/51148</pub-id><pub-id pub-id-type="medline">38180782</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>F</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>N</given-names> </name><name name-style="western"><surname>Du</surname><given-names>M</given-names> </name></person-group><article-title>Quantifying multilingual performance of large language models across languages</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 17, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2404.11553</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Gao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Xiong</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>X</given-names> </name><name name-style="western"><surname>Jia</surname><given-names>K</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bi</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Retrieval-augmented generation for large language models: a 
survey</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 18, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2312.10997</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guillen-Grima</surname><given-names>F</given-names> </name><name name-style="western"><surname>Guillen-Aguinaga</surname><given-names>S</given-names> </name><name name-style="western"><surname>Guillen-Aguinaga</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Evaluating the efficacy of ChatGPT in navigating the Spanish medical residency entrance examination (MIR): promising horizons for AI in clinical medicine</article-title><source>Clin Pract</source><year>2023</year><month>11</month><day>20</day><volume>13</volume><issue>6</issue><fpage>1460</fpage><lpage>1487</lpage><pub-id pub-id-type="doi">10.3390/clinpract13060130</pub-id><pub-id pub-id-type="medline">37987431</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Flores-Cohaila</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Garc&#x00ED;a-Vicente</surname><given-names>A</given-names> </name><name name-style="western"><surname>Vizcarra-Jim&#x00E9;nez</surname><given-names>SF</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on the Peruvian national licensing medical examination: cross-sectional study</article-title><source>JMIR Med Educ</source><year>2023</year><month>09</month><day>28</day><volume>9</volume><fpage>e48039</fpage><pub-id pub-id-type="doi">10.2196/48039</pub-id><pub-id pub-id-type="medline">37768724</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Altermatt</surname><given-names>FR</given-names> </name><name name-style="western"><surname>Neyem</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sumonte</surname><given-names>NI</given-names> </name><etal/></person-group><article-title>Evaluating GPT-4o in high-stakes medical assessments: performance and error analysis on a Chilean anesthesiology exam</article-title><source>BMC Med Educ</source><year>2025</year><month>10</month><day>27</day><volume>25</volume><issue>1</issue><fpage>1499</fpage><pub-id pub-id-type="doi">10.1186/s12909-025-08084-9</pub-id><pub-id pub-id-type="medline">41146119</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Madrid-Garc&#x00ED;a</surname><given-names>A</given-names> </name><name name-style="western"><surname>Rosales-Rosado</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Freites-Nu&#x00F1;ez</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Harnessing ChatGPT and GPT-4 for evaluating the rheumatology questions of the Spanish access exam to specialized medical training</article-title><source>Sci Rep</source><year>2023</year><month>12</month><day>13</day><volume>13</volume><issue>1</issue><fpage>22129</fpage><pub-id pub-id-type="doi">10.1038/s41598-023-49483-6</pub-id><pub-id pub-id-type="medline">38092821</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="report"><article-title>Order SND/888/2024 of 14 August, approving the number of available positions and announcing the 2024 competitive selection examinations for access in 2025 to specialized health training positions for university degree holders in medicine, pharmacy, nursing, and in the fields of psychology, chemistry, biology, and physics [Article in 
Spanish]</article-title><year>2024</year><access-date>2025-01-07</access-date><publisher-name>Bolet&#x00ED;n Oficial del Estado</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.boe.es/boe/dias/2024/08/23/pdfs/BOE-A-2024-17246.pdf">https://www.boe.es/boe/dias/2024/08/23/pdfs/BOE-A-2024-17246.pdf</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><article-title>The Ministry of Health publishes the final list of results of the specialized health training examinations [Website in Spanish]</article-title><source>Ministry of Health, Spain</source><access-date>2025-03-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.sanidad.gob.es/gabinete/notasPrensa.do?id=6632">https://www.sanidad.gob.es/gabinete/notasPrensa.do?id=6632</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="web"><article-title>Consulting previous examination booklets&#x2014;search by examination session [Website in Spanish]</article-title><source>Ministry of Health, Spain</source><access-date>2025-01-26</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://fse.sanidad.gob.es/fseweb/#/principal/datosAnteriores/cuadernosExamen">https://fse.sanidad.gob.es/fseweb/#/principal/datosAnteriores/cuadernosExamen</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Takagi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Watari</surname><given-names>T</given-names> </name><name name-style="western"><surname>Erabi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sakaguchi</surname><given-names>K</given-names> </name></person-group><article-title>Performance of GPT-3.5 and GPT-4 on the Japanese medical licensing examination: comparison 
study</article-title><source>JMIR Med Educ</source><year>2023</year><month>06</month><day>29</day><volume>9</volume><fpage>e48002</fpage><pub-id pub-id-type="doi">10.2196/48002</pub-id><pub-id pub-id-type="medline">37384388</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Meyer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Riese</surname><given-names>J</given-names> </name><name name-style="western"><surname>Streichert</surname><given-names>T</given-names> </name></person-group><article-title>Comparison of the performance of GPT-3.5 and GPT-4 with that of medical students on the written German medical licensing examination: observational study</article-title><source>JMIR Med Educ</source><year>2024</year><month>02</month><day>8</day><volume>10</volume><fpage>e50965</fpage><pub-id pub-id-type="doi">10.2196/50965</pub-id><pub-id pub-id-type="medline">38329802</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Prazeres</surname><given-names>F</given-names> </name></person-group><article-title>ChatGPT&#x2019;s performance on Portuguese medical examination questions: comparative analysis of ChatGPT-3.5 Turbo and ChatGPT-4o Mini</article-title><source>JMIR Med Educ</source><year>2025</year><month>03</month><day>5</day><volume>11</volume><fpage>e65108</fpage><pub-id pub-id-type="doi">10.2196/65108</pub-id><pub-id pub-id-type="medline">40043219</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mbakwe</surname><given-names>AB</given-names> </name><name name-style="western"><surname>Lourentzou</surname><given-names>I</given-names> </name><name 
name-style="western"><surname>Celi</surname><given-names>LA</given-names> </name><name name-style="western"><surname>Mechanic</surname><given-names>OJ</given-names> </name><name name-style="western"><surname>Dagan</surname><given-names>A</given-names> </name></person-group><article-title>ChatGPT passing USMLE shines a spotlight on the flaws of medical education</article-title><source>PLOS Digit Health</source><year>2023</year><month>02</month><volume>2</volume><issue>2</issue><fpage>e0000205</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000205</pub-id><pub-id pub-id-type="medline">36812618</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Okuhara</surname><given-names>T</given-names> </name><name name-style="western"><surname>Dai</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Evaluating the effectiveness of advanced large language models in medical knowledge: a comparative study using the Japanese national medical examination</article-title><source>Int J Med Inform</source><year>2025</year><month>01</month><volume>193</volume><fpage>105673</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2024.105673</pub-id><pub-id pub-id-type="medline">39471700</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gravina</surname><given-names>AG</given-names> </name><name name-style="western"><surname>Pellegrino</surname><given-names>R</given-names> </name><name name-style="western"><surname>Palladino</surname><given-names>G</given-names> </name><name name-style="western"><surname>Imperio</surname><given-names>G</given-names> </name><name 
name-style="western"><surname>Ventura</surname><given-names>A</given-names> </name><name name-style="western"><surname>Federico</surname><given-names>A</given-names> </name></person-group><article-title>Charting new AI education in gastroenterology: cross-sectional evaluation of ChatGPT and Perplexity AI in medical residency exam</article-title><source>Dig Liver Dis</source><year>2024</year><month>08</month><volume>56</volume><issue>8</issue><fpage>1304</fpage><lpage>1311</lpage><pub-id pub-id-type="doi">10.1016/j.dld.2024.02.019</pub-id><pub-id pub-id-type="medline">38503659</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="report"><article-title>OpenAI GPT-4.5 system card</article-title><year>2025</year><access-date>2026-01-06</access-date><publisher-name>OpenAI</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://cdn.openai.com/gpt-4-5-system-card-2272025.pdf">https://cdn.openai.com/gpt-4-5-system-card-2272025.pdf</ext-link></comment></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>DeepSeek-AI</collab><name name-style="western"><surname>Guo</surname><given-names>D</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>D</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Song</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>R</given-names> </name><etal/></person-group><article-title>DeepSeek-R1: incentivizing reasoning capability in LLMs via reinforcement learning</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 22, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2501.12948</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="other"><person-group 
person-group-type="author"><collab>Qwen</collab><name name-style="western"><surname>Yang</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>B</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Qwen2.5 technical report</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 19, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2412.15115</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Grattafiori</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dubey</surname><given-names>A</given-names> </name><etal/></person-group><article-title>The Llama 3 herd of models</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 31, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2407.21783</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="report"><article-title>Claude 3.7 Sonnet system card</article-title><access-date>2026-01-07</access-date><publisher-name>Anthropic</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://assets.anthropic.com/m/785e231869ea8b3b/original/claude-3-7-sonnet-system-card.pdf">https://assets.anthropic.com/m/785e231869ea8b3b/original/claude-3-7-sonnet-system-card.pdf</ext-link></comment></nlm-citation></ref></ref-list></back></article>