<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id><journal-id journal-id-type="publisher-id">mededu</journal-id><journal-id journal-id-type="index">20</journal-id><journal-title>JMIR Medical Education</journal-title><abbrev-journal-title>JMIR Med Educ</abbrev-journal-title><issn pub-type="epub">2369-3762</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v11i1e65108</article-id><article-id pub-id-type="doi">10.2196/65108</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>ChatGPT&#x2019;s Performance on Portuguese Medical Examination Questions: Comparative Analysis of ChatGPT-3.5 Turbo and ChatGPT-4o Mini</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Prazeres</surname><given-names>Filipe</given-names></name><degrees>MD, MSc, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib></contrib-group><aff id="aff1"><institution>Faculty of Health Sciences, University of Beira Interior</institution><addr-line>Av. Infante D. 
Henrique</addr-line><addr-line>Covilh&#x00E3;</addr-line><country>Portugal</country></aff><aff id="aff2"><institution>Family Health Unit Beira Ria</institution><addr-line>Gafanha da Nazar&#x00E9;</addr-line><country>Portugal</country></aff><aff id="aff3"><institution>CINTESIS@RISE, Department of Community Medicine, Information and Health Decision Sciences, Faculty of Medicine of the University of Porto</institution><addr-line>Porto</addr-line><country>Portugal</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Lesselroth</surname><given-names>Blake</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Thies</surname><given-names>Bill</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Digiampietri</surname><given-names>Luciano Antonio</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Pellegrino</surname><given-names>Raffaele</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Filipe Prazeres, MD, MSc, PhD, Faculty of Health Sciences, University of Beira Interior, Av. Infante D. Henrique, Covilh&#x00E3;, 6201-506, Portugal, 351 234393150; <email>filipeprazeresmd@gmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>5</day><month>3</month><year>2025</year></pub-date><volume>11</volume><elocation-id>e65108</elocation-id><history><date date-type="received"><day>05</day><month>08</month><year>2024</year></date><date date-type="rev-recd"><day>30</day><month>11</month><year>2024</year></date><date date-type="accepted"><day>12</day><month>12</month><year>2024</year></date></history><copyright-statement>&#x00A9; Filipe Prazeres. 
Originally published in JMIR Medical Education (<ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org">https://mededu.jmir.org</ext-link>), 5.3.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org/">https://mededu.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mededu.jmir.org/2025/1/e65108"/><abstract><sec><title>Background</title><p>Advancements in ChatGPT are transforming medical education by providing new tools for assessment and learning, potentially enhancing evaluations for doctors and improving instructional effectiveness.</p></sec><sec><title>Objective</title><p>This study evaluates the performance and consistency of ChatGPT-3.5 Turbo and ChatGPT-4o mini in solving European Portuguese medical examination questions (2023 National Examination for Access to Specialized Training; Prova Nacional de Acesso &#x00E0; Forma&#x00E7;&#x00E3;o Especializada [PNA]) and compares their performance to human candidates.</p></sec><sec sec-type="methods"><title>Methods</title><p>ChatGPT-3.5 Turbo was tested on the first part of the examination (74 questions) on July 18, 2024, and ChatGPT-4o mini on the second part (74 questions) on July 19, 2024. 
Each model generated an answer using its natural language processing capabilities. To test consistency, each model was asked, &#x201C;Are you sure?&#x201D; after providing an answer. Differences between the first and second responses of each model were analyzed using the McNemar test with continuity correction. A single-parameter <italic>t</italic> test compared the models&#x2019; performance to human candidates. Frequencies and percentages were used for categorical variables, and means and CIs for numerical variables. Statistical significance was set at <italic>P</italic>&#x003C;.05.</p></sec><sec sec-type="results"><title>Results</title><p>ChatGPT-4o mini achieved an accuracy rate of 65% (48/74) on the 2023 PNA examination, surpassing ChatGPT-3.5 Turbo. ChatGPT-4o mini outperformed medical candidates, while ChatGPT-3.5 Turbo had a more moderate performance.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This study highlights the advancements and potential of ChatGPT models in medical education, emphasizing the need for careful implementation with teacher oversight and further research.</p></sec></abstract><kwd-group><kwd>ChatGPT-3.5 Turbo</kwd><kwd>ChatGPT-4o mini</kwd><kwd>medical examination</kwd><kwd>European Portuguese</kwd><kwd>AI performance evaluation</kwd><kwd>Portuguese</kwd><kwd>evaluation</kwd><kwd>medical examination questions</kwd><kwd>examination question</kwd><kwd>chatbot</kwd><kwd>ChatGPT</kwd><kwd>model</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd><kwd>GPT</kwd><kwd>LLM</kwd><kwd>NLP</kwd><kwd>natural language processing</kwd><kwd>machine learning</kwd><kwd>large language model</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Generative artificial intelligence (AI) represents a branch of AI dedicated to the development of systems that can autonomously generate high-quality digital content on demand, and it can do so across various modalities, such as written text, 
images, audio, and video [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. Generative AI tools are trained on large datasets, enabling them to produce work that mirrors human-created content [<xref ref-type="bibr" rid="ref2">2</xref>]. Nowadays, there are several examples of generative AI tools, including ChatGPT (OpenAI Inc), Runway, Gemini (Google Inc), DALL-E (OpenAI Inc), Copilot (Microsoft Inc), Midjourney, NovelAI (Anlatan), Claude (Anthropic), and Jasper AI, among others. ChatGPT, the large language model (LLM) chatbot, developed by OpenAI [<xref ref-type="bibr" rid="ref4">4</xref>], that started the AI boom in November 2022, became the most popular AI tool of 2023, accounting for over 60.2% of visits between September 2022 and August 2023, with a total of 14.6 billion website visits [<xref ref-type="bibr" rid="ref5">5</xref>]. ChatGPT&#x2019;s availability as a free-to-use, low-bandwidth service may reduce disparities compared to paid versions or models by making advanced AI technology accessible to a broader and more diverse global population [<xref ref-type="bibr" rid="ref6">6</xref>], contributing to making it the most popular generative AI tool [<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>Recent literature reviews regarding AI have shown that this type of technology has potential applications in several fields, spanning from the architecture, engineering, and construction industry to health care [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref11">11</xref>]. 
The possible applications in medicine are substantial, ranging from diagnostic and treatment support (eg, clinical imaging improvement, classification of diseases, prediction of disease onset, development of treatment, and medication prescriptions) [<xref ref-type="bibr" rid="ref12">12</xref>] to facilitating communication and engagement between medical professionals and their patients [<xref ref-type="bibr" rid="ref13">13</xref>], and to improving medical education and its accessibility [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. For example, ChatGPT can be used as a study tool to clearly explain complex medical concepts [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>] (eg, radiology reports [<xref ref-type="bibr" rid="ref18">18</xref>]), create memory aids for challenging topics, clarify medical practice questions, summarize research articles, compile lists of differential diagnoses [<xref ref-type="bibr" rid="ref17">17</xref>], generate medical examination questions [<xref ref-type="bibr" rid="ref19">19</xref>], and simulate physician-patient interactions [<xref ref-type="bibr" rid="ref14">14</xref>].</p><p>Medical written examinations are an important part of evaluating the competence and knowledge of medical students and graduates (eg, access of physicians to specialized training, as is the case in Portugal). These examinations not only test factual knowledge but also evaluate the critical thinking and problem-solving skills of the candidates. With the recent growing interest in AI, an important question arises: Can AI, specifically ChatGPT, perform at a level comparable to human candidates in medical written examinations? By evaluating ChatGPT&#x2019;s ability to correctly answer medical questions, its medical proficiency and its potential role as an educational tool can be assessed. 
Successfully completing this task can demonstrate ChatGPT&#x2019;s capability to serve as a resource for medical students by providing continuous access to information, particularly benefiting students in remote or under-resourced areas [<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>ChatGPT is known for having the capability of performing near the passing threshold of 60% accuracy of the United States Medical Licensing Examination (USMLE) [<xref ref-type="bibr" rid="ref20">20</xref>] and for having knowledge approximately equivalent to that of a third-year medical student [<xref ref-type="bibr" rid="ref21">21</xref>]. ChatGPT&#x2019;s performance on medical examinations has been analyzed across different countries and questions. A 2023 systematic review with a meta-analysis of 19 articles found a mean performance of ChatGPT of around 61% [<xref ref-type="bibr" rid="ref22">22</xref>], and a more recent review published in 2024 concluded that, despite ChatGPT&#x2019;s satisfactory performance in examinations, further studies are necessary to fully explore its potential in medical education [<xref ref-type="bibr" rid="ref23">23</xref>].</p><p>Furthermore, ChatGPT struggles with non-English language assessments, possibly due to a limited understanding of linguistic nuances and Western-centric internet data, which may not fully represent the clinical and disease differences in some populations, such as African and Asian populations [<xref ref-type="bibr" rid="ref24">24</xref>], warranting more studies in other languages to ensure a better understanding of ChatGPT&#x2019;s accuracy in diverse cultural contexts. 
For example, ChatGPT performed considerably lower on a medical examination in Chinese (45.8% correct answers on the Chinese National Medical Licensing Examination) [<xref ref-type="bibr" rid="ref25">25</xref>], and even worse in the French examination with 22% correct answers [<xref ref-type="bibr" rid="ref26">26</xref>].</p><p>In July 2024, OpenAI launched GPT-4o mini, a smaller version of its latest GPT-4o (&#x201C;o&#x201D; for &#x201C;omni&#x201D;) AI language model. This new model replaced GPT-3.5 Turbo in ChatGPT, making this an ideal time to study the performance of both free models in resolving written medical examinations.</p><p>This study aims to evaluate the performance and consistency of 2 AI models, ChatGPT-3.5 Turbo and ChatGPT-4o mini, in solving the questions of a non-English language (European Portuguese) written medical examination, with a format of multiple-choice with one best answer&#x2014;the 2023 National Examination for Access to Specialized Training (Prova Nacional de Acesso &#x00E0; Forma&#x00E7;&#x00E3;o Especializada [PNA])&#x2014;and compare their performance to that of human candidates.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>The PNA examination is part of the requirements for entering specialized medical training in Portugal. Its purpose is to rank candidates for accessing specialized training vacancies, so no minimum passing grade is needed [<xref ref-type="bibr" rid="ref27">27</xref>].</p><p>The PNA questions used in this study were from the actual 2023 Portuguese PNA examination, which is publicly available on the web [<xref ref-type="bibr" rid="ref27">27</xref>]. This examination includes 150 questions with 5 multiple-choice answers each, with only a single best answer, similar to the USMLE. The questions are based on clinical vignettes and divided into 2 parts with 75 questions each. 
The examination emphasizes clinical reasoning and the application and integration of clinical knowledge and is scored on a scale from 0 to 150 points, with no penalties for blank or incorrect answers. It covers various medical disciplines, including medicine, surgery, pediatrics, gynecology and obstetrics, and psychiatry. The examination duration is 240 minutes, divided into 2 parts of 120 minutes each [<xref ref-type="bibr" rid="ref27">27</xref>].</p><p>ChatGPT-3.5 Turbo was provided with the first part of the examination (74 non-image-based multiple-choice questions [MCQs]) on July 18, 2024, and ChatGPT-4o mini with the second part of the examination (74 non-image-based MCQs) on July 19, 2024. The questions were entered into the models in European Portuguese and in a format similar to how they are presented to human candidates, and each model was requested to provide a single-letter answer, just like human candidates. For each question, the models generated an answer using their natural language processing capabilities. Following each model&#x2019;s response, a follow-up question, &#x201C;Are you sure?,&#x201D; was asked to test for consistency&#x2014;this technique was previously used by Brin et al [<xref ref-type="bibr" rid="ref28">28</xref>]. An example of the input format of the questions and the respective responses by ChatGPT in European Portuguese is depicted in <xref ref-type="table" rid="table1">Table 1</xref>, with corresponding translations to English performed by ChatGPT-4o mini. 
Each question was addressed in a new chat session to reduce the potential influence of memory retention bias of ChatGPT.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Example of the input format of the questions and the respective responses by ChatGPT.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Examination question in European Portuguese</td><td align="left" valign="bottom">Examination question translated to English (performed by ChatGPT-4o mini)</td></tr></thead><tbody><tr><td align="left" valign="top">Question</td><td align="left" valign="top">Um homem de 73 anos vem &#x00E0; consulta hospitalar para reavalia&#x00E7;&#x00E3;o de doen&#x00E7;a pulmonar obstrutiva cr&#x00F3;nica. Na consulta pr&#x00E9;via, h&#x00E1; seis meses, apresentava-se em estadio GOLD B. Refere agora, desde h&#x00E1; tr&#x00EA;s meses, agravamento da dispneia para esfor&#x00E7;os m&#x00E9;dios, sem altera&#x00E7;&#x00E3;o do padr&#x00E3;o habitual de tosse ou de expetora&#x00E7;&#x00E3;o. Nega febre, perda de apetite ou outras queixas de novo. A hist&#x00F3;ria m&#x00E9;dica revela ainda obesidade. A medica&#x00E7;&#x00E3;o habitual inclui brometo de tiotr&#x00F3;pio e salmeterol. &#x00C9; ex-fumador de 40 UMA desde h&#x00E1; 10 anos. Os sinais vitais s&#x00E3;o temperatura 36 &#x00B0;C, frequ&#x00EA;ncia respirat&#x00F3;ria 18/min, frequ&#x00EA;ncia card&#x00ED;aca 78/min e press&#x00E3;o arterial 115/89 mm Hg; SpO2 94% (ar ambiente). Ao exame f&#x00ED;sico apresenta cianose labial, com apar&#x00EA;ncia confort&#x00E1;vel e atrofia muscular na &#x00E1;rea temporal. A ausculta&#x00E7;&#x00E3;o pulmonar revela crepita&#x00E7;&#x00F5;es raras dispersas, sem prolongamento do tempo expirat&#x00F3;rio. 
O restante exame f&#x00ED;sico, incluindo a ausculta&#x00E7;&#x00E3;o card&#x00ED;aca, encontra-se dentro dos par&#x00E2;metros de normalidade.<break/>Qual das seguintes alternativas mais provavelmente reduz o risco de morte relacionada com esta doen&#x00E7;a?<list list-type="alpha-upper"><list-item><p>Azitromicina.</p></list-item><list-item><p>Montelucaste.</p></list-item><list-item><p>Oxigenoterapia.</p></list-item><list-item><p>Prednisolona.</p></list-item><list-item><p>Teofilina.</p></list-item></list><break/>Indique apenas a letra da resposta, sem justifica&#x00E7;&#x00E3;o.</td><td align="left" valign="top">A 73-year-old man comes to the hospital for reevaluation of chronic obstructive pulmonary disease (COPD). At the previous visit six months ago, he was in GOLD stage B. He now reports worsening shortness of breath with moderate exertion over the past three months, without changes in his usual cough or sputum production. He denies fever, loss of appetite, or any new complaints. His medical history also reveals obesity. His usual medications include tiotropium bromide and salmeterol. He is a former smoker with a 40 pack-year history, quit 10 years ago. Vital signs are temperature 36 &#x00B0;C, respiratory rate 18/min, heart rate 78/min, and blood pressure 115/89 mm Hg; SpO2 94% (room air). On physical examination, he shows lip cyanosis, appears comfortable, and has muscle atrophy in the temporal area. Lung auscultation reveals scattered rare crackles, with no prolonged expiratory phase. 
The rest of the physical examination, including cardiac auscultation, is within normal limits.<break/>Which of the following is most likely to reduce the risk of death related to this condition?<list list-type="alpha-upper"><list-item><p>Azithromycin.</p></list-item><list-item><p>Montelukast.</p></list-item><list-item><p>Oxygen therapy.</p></list-item><list-item><p>Prednisolone.</p></list-item><list-item><p>Theophylline.</p></list-item></list><break/>Please indicate only the letter of the answer, without justification.</td></tr><tr><td align="left" valign="top">ChatGPT-3.5 Turbo answer</td><td align="left" valign="top">A resposta correta &#x00E9;:<break/>(C) Oxigenoterapia.</td><td align="left" valign="top">The correct answer is:<break/>(C) Oxygen therapy.</td></tr><tr><td align="left" valign="top">Follow-up question</td><td align="left" valign="top">Tem a certeza?</td><td align="left" valign="top">Are you sure?</td></tr><tr><td align="left" valign="top">ChatGPT-3.5 Turbo answer</td><td align="left" valign="top">Pe&#x00E7;o desculpa pelo erro anterior. Vamos rever a situa&#x00E7;&#x00E3;o:<break/>Dadas as caracter&#x00ED;sticas do paciente com DPOC grave e hipoxemia (SpO2 de 94% em ar ambiente), a interven&#x00E7;&#x00E3;o que mais provavelmente reduzir&#x00E1; o risco de morte relacionada com esta doen&#x00E7;a &#x00E9; a oxigenoterapia. Portanto, a resposta correta &#x00E9;:<break/>(C) Oxigenoterapia.</td><td align="left" valign="top">I apologize for the previous error. Let&#x2019;s review the situation:<break/>Given the characteristics of the patient with severe COPD and hypoxemia (SpO2 of 94% on room air), the intervention that is most likely to reduce the risk of death related to this condition is oxygen therapy. 
Therefore, the correct answer is:<break/>(C) Oxygen therapy.</td></tr></tbody></table></table-wrap><p>Human candidates&#x2019; scores were obtained from the official records of the PNA 2023 examination, which are also publicly available on the web [<xref ref-type="bibr" rid="ref27">27</xref>].</p><p>To compare the mean scores of human candidates to those of the language models, all questions of the PNA 2023 examination had to be answered. Since the examination included 2 questions using images (one in the first part and another one in the second part; both with electrocardiogram strips), these questions were answered by GPT-4o, as it can handle images in addition to text.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>This study exclusively used data that had been previously published online and did not involve direct interaction with human participants. As a result, ethical guidelines pertaining to human participants are not applicable.</p></sec><sec id="s2-3"><title>Statistical Analysis</title><p>Analyses were performed using IBM SPSS Statistics (Version 21). The McNemar test [<xref ref-type="bibr" rid="ref29">29</xref>] with continuity correction [<xref ref-type="bibr" rid="ref30">30</xref>] was used to determine differences between the first and second responses of ChatGPT-3.5 Turbo and ChatGPT-4o mini. Single-parameter <italic>t</italic> test was used to compare the performance of ChatGPT-3.5 Turbo and ChatGPT-4o mini with that of human candidates. Frequencies and percentages were used for categorical variables and means and CIs for numerical variables. Statistical significance was considered at <italic>P</italic>&#x003C;.05.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overall Performance and Consistency</title><p>In the initial response with ChatGPT-3.5 Turbo, of the 74 questions, 40 (54%) answers were correct and 34 (46%) answers were incorrect. 
After the follow-up question, &#x201C;Are you sure?,&#x201D; the number of correct answers decreased to 28 (38%), while the number of incorrect answers increased to 46 (62%). This change occurred because ChatGPT-3.5 Turbo corrected 12 originally incorrect answers, but also changed 24 originally correct answers to incorrect. This pattern of change approached, but did not reach, significance (<italic>&#x03C7;</italic>&#x00B2;<sub>1</sub>=3.361, <italic>P</italic>=.067).</p><p>Initially, of the 74 questions, ChatGPT-4o mini produced 48 (65%) correct answers and 26 (35%) incorrect answers. After being asked, &#x201C;Are you sure?,&#x201D; the correct answers dropped to 42 (57%), while incorrect answers rose to 32 (43%). This change occurred because ChatGPT-4o mini fixed 12 previously wrong answers but also changed 18 previously correct answers to incorrect. This pattern of change was not statistically significant (<italic>&#x03C7;</italic>&#x00B2;<sub>1</sub>=0.833, <italic>P</italic>=.361).</p><p>The 2 questions using images (one in the first part and another one in the second part) were answered correctly by GPT-4o.</p></sec><sec id="s3-2"><title>LLM Chatbot Versus Human</title><p>When evaluating AI capabilities in relation to human abilities, LLM responses in part 1 of PNA (74 questions resolved by ChatGPT-3.5 Turbo plus 1 by GPT-4o) showed lower accuracy than human respondents. The human mean score was statistically significantly higher by 6.04 (95% CI 5.65-6.43) than the LLM score of 41 (<italic>P</italic>&#x003C;.001).</p><p>In part 2 of PNA (74 questions resolved by ChatGPT-4o mini added to 1 question by GPT-4o), the LLM score showed higher accuracy than human respondents. 
The human mean score was statistically significantly lower by 5.58 (95% CI 5.25-5.90) than the LLM score of 49 (<italic>P</italic>&#x003C;.001).</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study analyzes the performance of 2 ChatGPT models (ChatGPT-3.5 Turbo and ChatGPT-4o mini) on the Portuguese medical written examination: 2023 National Examination for Access to Specialized Training, revealing important differences in accuracy and consistency. Although both ChatGPT-3.5 Turbo and ChatGPT-4o mini answered the majority of the questions correctly, ChatGPT-4o mini achieved a higher accuracy rate of 65% (48/74) compared to ChatGPT-3.5 Turbo&#x2019;s 54% (40/74), demonstrating a superior capability in handling medical questions. Additionally, ChatGPT-4o mini showed greater consistency in confirming answers, highlighting its reliability. When evaluated against human respondents, ChatGPT-4o mini outperformed the average human accuracy, while ChatGPT-3.5 Turbo fell short.</p></sec><sec id="s4-2"><title>Strengths</title><p>This study stands out for its innovative approach in analyzing the performance of ChatGPT-3.5 Turbo and ChatGPT-4o mini in a medical examination context. It is the first to evaluate these models using an examination conducted in a less commonly studied language, Portuguese, thereby broadening the scope of language-specific AI assessments. By incorporating the actual scores of human candidates for comparison, the study provides a robust benchmark against real-world performance. 
Furthermore, the research examines the stability of the AI&#x2019;s answers by repeatedly asking &#x201C;Are you sure?,&#x201D; offering valuable insights into the consistency of the responses.</p></sec><sec id="s4-3"><title>Comparison to Prior Work</title><p>A recent study evaluated ChatGPT&#x2019;s performance on medical licensing examinations across multiple countries (United States, Italy, France, Spain, United Kingdom, and India) and determined a variable accuracy, ranging from 22% on the French examination to 73% on the Italian examination [<xref ref-type="bibr" rid="ref26">26</xref>]. In this study, ChatGPT answered correctly in more than 50% of the Portuguese medical examination questions, positioning it next to the countries with better performance. For example, in a Turkish study, ChatGPT reached 70.9% accuracy in the medical specialty examination [<xref ref-type="bibr" rid="ref31">31</xref>]. In the Iranian medical licensing examination, ChatGPT performed with 68.5% of the questions answered correctly [<xref ref-type="bibr" rid="ref32">32</xref>]. And in Poland, ChatGPT achieved a 67.1% correct response rate on the Polish medical specialization licensing examination [<xref ref-type="bibr" rid="ref33">33</xref>].</p><p>When analyzing the differences between the 2 ChatGPT versions, ChatGPT-4o mini outperformed ChatGPT-3.5 Turbo in this study: 65% (48/74) vs 54% (40/74) correct response rate. This suggests that advancements in the underlying architecture and training data of ChatGPT-4o mini (knowledge up to October 2023) have improved its capability to understand and respond to medical questions with more accuracy. Previous studies evaluating the performance of different ChatGPT models found that ChatGPT-4 consistently performed better compared to ChatGPT-3.5. 
For example, ChatGPT-4 outperformed ChatGPT-3.5 on the Polish Medical Final Examination [<xref ref-type="bibr" rid="ref34">34</xref>], the Spanish Medical Residency Entrance Examination (M&#x00E9;dico Interno Residente) [<xref ref-type="bibr" rid="ref35">35</xref>], the 2023 Japanese Nursing Examination [<xref ref-type="bibr" rid="ref36">36</xref>], the Peruvian National Licensing Medical Examination (Examen Nacional de Medicina) [<xref ref-type="bibr" rid="ref37">37</xref>], and in the USMLE soft skill assessments [<xref ref-type="bibr" rid="ref28">28</xref>], to name a few. Nonetheless, ChatGPT-4 is a paid model and thus not accessible to everyone, which is not the case for the most recent free-to-use ChatGPT-4o mini.</p><p>Another important aspect is consistency. The results of this study revealed that ChatGPT-3.5 Turbo was less stable when asked to confirm its original answers. These results are consistent with those of Brin et al [<xref ref-type="bibr" rid="ref28">28</xref>], who found that ChatGPT-3.5 altered its answers 82.5% of the time in the USMLE assessments [<xref ref-type="bibr" rid="ref28">28</xref>]. Unfortunately, in this study, it was not shown that by changing the original answers, ChatGPT-3.5 Turbo improves its accuracy. This contrasts with studies on human students, which have shown that changing their answers usually improves their test scores [<xref ref-type="bibr" rid="ref38">38</xref>]. One can wonder, since the &#x201C;awareness of what one knows and does not know depends in part on how much one knows&#x201D; [<xref ref-type="bibr" rid="ref39">39</xref>], does ChatGPT-3.5 Turbo change its answers because it does not know, or does it simply change answers to satisfy the user when prompted?</p><p>When evaluating the AI models against human respondents, it was found that in part 2 of the PNA examination (74 questions resolved by ChatGPT-4o mini plus 1 question by GPT-4o), the LLM outperformed the average accuracy of human participants. 
In contrast, in part 1 of the PNA examination (74 questions resolved by ChatGPT-3.5 Turbo plus 1 question by GPT-4o), the LLM showed lower accuracy than human respondents. This indicates that while earlier versions, like ChatGPT-3.5 Turbo, may have required a high degree of human oversight, more recent and advanced versions, like ChatGPT-4o mini, have the potential to match or exceed human performance in medical domains. Although no previous studies have analyzed the performance of ChatGPT-4o mini, and no direct comparisons can be made, some studies have already noted that LLMs outperformed human candidates in several medical examinations (eg, the German Medical State Examinations of 2022 [<xref ref-type="bibr" rid="ref40">40</xref>], part 1 of the Fellowship of the Royal College of Ophthalmologists MCQ examination [<xref ref-type="bibr" rid="ref41">41</xref>], and the University of Toronto Family Medicine Residency Progress Test [<xref ref-type="bibr" rid="ref42">42</xref>]).</p></sec><sec id="s4-4"><title>Limitations</title><p>This study has several limitations regarding the performance evaluation of ChatGPT-3.5 Turbo and ChatGPT-4o mini. The analysis was based solely on ChatGPT&#x2019;s indication of the correct answer, which, while aligning with expectations for human candidates, does not consider other aspects of examination performance. Additionally, the grading did not account for the complexity or length of the questions, providing an incomplete assessment of the models&#x2019; performance. Further studies should incorporate a more comprehensive evaluation framework that considers the reasoning process and evaluates performance across a broader range of question types and difficulties.</p></sec><sec id="s4-5"><title>Future Perspectives</title><p>This study highlights the importance of continuous improvement in ChatGPT models to further enhance their reliability and accuracy. 
The superior performance of ChatGPT-4o mini compared to its predecessor offers promising applications in medical education. Its higher accuracy and consistency suggest that it could serve as an effective tool for training medical students. However, a broader assessment of ChatGPT-4o mini across various tests and real-world scenarios is required, as good performance on a specific test may not indicate abilities for general and reliable medical education usage. Additionally, there are known drawbacks and ethical considerations when using AI applications, including the potential for fabricated, incorrect, or biased information [<xref ref-type="bibr" rid="ref43">43</xref>]. Other issues include limited training periods and the possibility of providing different answers to the same question depending on how the question is phrased [<xref ref-type="bibr" rid="ref43">43</xref>]. A recent systematic scoping review by Xu et al [<xref ref-type="bibr" rid="ref44">44</xref>] advises medical students to use ChatGPT cautiously, cross-checking information with reliable sources and disclosing AI-generated content in their work. Teachers should guide students on the effective and ethical use of ChatGPT, assess its reliability, and explore mixed assessment methods to evaluate student abilities while considering its impact on traditional assignments [<xref ref-type="bibr" rid="ref44">44</xref>].</p></sec><sec id="s4-6"><title>Conclusion</title><p>On the 2023 Portuguese National Examination for Access to Specialized Training, ChatGPT-4o mini achieved an accuracy rate of 65% (48/74), surpassing ChatGPT-3.5 Turbo. This demonstrates a superior capability in handling medical questions. ChatGPT-4o mini outperformed medical candidates, while ChatGPT-3.5 Turbo had a more moderate performance. 
This study highlights the advancements and potential of ChatGPT models in medical education, emphasizing the importance of careful implementation with teacher oversight and further research.</p></sec></sec></body><back><ack><p>This study was supported by national funds through FCT (Funda&#x00E7;&#x00E3;o para a Ci&#x00EA;ncia e a Tecnologia, I.P.) within CINTESIS R&#x0026;D Unit (reference UIDB/4255/2020) and within the scope of the project RISE, Associated Laboratory (reference LA/P/0053/2020). During the preparation of this manuscript, the author used ChatGPT-4o mini in order to improve the language of the manuscript and correct grammatical errors. After using this tool, the author reviewed and edited the content as needed and takes full responsibility for the content of the publication.</p></ack><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb3">MCQ</term><def><p>multiple-choice question</p></def></def-item><def-item><term id="abb4">PNA</term><def><p>Prova Nacional de Acesso &#x00E0; Forma&#x00E7;&#x00E3;o Especializada</p></def></def-item><def-item><term id="abb5">USMLE</term><def><p>United States Medical Licensing Examination</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Feuerriegel</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hartmann</surname><given-names>J</given-names> </name><name name-style="western"><surname>Janiesch</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zschech</surname><given-names>P</given-names> 
</name></person-group><article-title>Generative AI</article-title><source>Bus Inf Syst Eng</source><year>2024</year><month>02</month><volume>66</volume><issue>1</issue><fpage>111</fpage><lpage>126</lpage><pub-id pub-id-type="doi">10.1007/s12599-023-00834-7</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ramdurai</surname><given-names>B</given-names> </name><name name-style="western"><surname>Adhithya</surname><given-names>P</given-names> </name></person-group><article-title>The impact, advancements and applications of generative AI</article-title><source>Int J Comput Sci Eng</source><year>2023</year><volume>10</volume><issue>6</issue><fpage>1</fpage><lpage>8</lpage><pub-id pub-id-type="doi">10.14445/23488387/IJCSE-V10I6P101</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Cao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>A comprehensive survey of AI-generated content (AIGC): a history of generative AI from GAN to ChatGPT</article-title><source>arXiv</source><comment>Preprint posted online on Mar 7, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.04226</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="web"><article-title>Introducing ChatGPT</article-title><source>OpenAI</source><year>2022</year><month>11</month><day>30</day><access-date>2024-07-30</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/index/chatgpt">https://openai.com/index/chatgpt</ext-link></comment></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation 
citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Conte</surname><given-names>N</given-names> </name></person-group><article-title>Ranked: the most popular AI tools</article-title><source>Visual Capitalist</source><year>2024</year><month>01</month><day>24</day><access-date>2025-02-19</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.visualcapitalist.com/ranked-the-most-popular-ai-tools">https://www.visualcapitalist.com/ranked-the-most-popular-ai-tools</ext-link></comment></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Sanders</surname><given-names>HM</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>ChatGPT: promise and challenges for deployment in low- and middle-income countries</article-title><source>Lancet Reg Health West Pac</source><year>2023</year><month>12</month><volume>41</volume><fpage>100905</fpage><pub-id pub-id-type="doi">10.1016/j.lanwpc.2023.100905</pub-id><pub-id pub-id-type="medline">37731897</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Aydin</surname><given-names>&#x00D6;</given-names> </name><name name-style="western"><surname>Karaarslan</surname><given-names>E</given-names> </name></person-group><article-title>Is ChatGPT leading generative AI? 
What is beyond expectations?</article-title><source>Acad Platform J Eng Smart Sys</source><year>2023</year><volume>11</volume><issue>3</issue><fpage>118</fpage><lpage>134</lpage><pub-id pub-id-type="doi">10.21541/apjess.1293702</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>BuHamdan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Alwisy</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bouferguene</surname><given-names>A</given-names> </name></person-group><article-title>Generative systems in the architecture, engineering and construction industry: a systematic review and analysis</article-title><source>Int J Archit Comput</source><year>2021</year><month>09</month><volume>19</volume><issue>3</issue><fpage>226</fpage><lpage>249</lpage><pub-id pub-id-type="doi">10.1177/1478077120934126</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ali</surname><given-names>O</given-names> </name><name name-style="western"><surname>Abdelbaki</surname><given-names>W</given-names> </name><name name-style="western"><surname>Shrestha</surname><given-names>A</given-names> </name><name name-style="western"><surname>Elbasi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Alryalat</surname><given-names>MAA</given-names> </name><name name-style="western"><surname>Dwivedi</surname><given-names>YK</given-names> </name></person-group><article-title>A systematic literature review of artificial intelligence in the healthcare sector: benefits, challenges, methodologies, and functionalities</article-title><source>J Innov Knowl</source><year>2023</year><month>01</month><volume>8</volume><issue>1</issue><fpage>100333</fpage><pub-id 
pub-id-type="doi">10.1016/j.jik.2023.100333</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Younis</surname><given-names>HA</given-names> </name><name name-style="western"><surname>Eisa</surname><given-names>TAE</given-names> </name><name name-style="western"><surname>Nasser</surname><given-names>M</given-names> </name><etal/></person-group><article-title>A systematic review and meta-analysis of artificial intelligence tools in medicine and healthcare: applications, considerations, limitations, motivation and challenges</article-title><source>Diagnostics (Basel)</source><year>2024</year><month>01</month><day>4</day><volume>14</volume><issue>1</issue><fpage>109</fpage><pub-id pub-id-type="doi">10.3390/diagnostics14010109</pub-id><pub-id pub-id-type="medline">38201418</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ruksakulpiwat</surname><given-names>S</given-names> </name><name name-style="western"><surname>Thorngthip</surname><given-names>S</given-names> </name><name name-style="western"><surname>Niyomyart</surname><given-names>A</given-names> </name><etal/></person-group><article-title>A systematic review of the application of artificial intelligence in nursing care: where are we, and what&#x2019;s next?</article-title><source>J Multidiscip Healthc</source><year>2024</year><volume>17</volume><fpage>1603</fpage><lpage>1616</lpage><pub-id pub-id-type="doi">10.2147/JMDH.S459946</pub-id><pub-id pub-id-type="medline">38628616</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bitkina</surname><given-names>OV</given-names> </name><name 
name-style="western"><surname>Park</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>HK</given-names> </name></person-group><article-title>Application of artificial intelligence in medical technologies: a systematic review of main trends</article-title><source>Digit Health</source><year>2023</year><volume>9</volume><pub-id pub-id-type="doi">10.1177/20552076231189331</pub-id><pub-id pub-id-type="medline">37485326</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sedaghat</surname><given-names>S</given-names> </name></person-group><article-title>Early applications of ChatGPT in medical practice, education and research</article-title><source>Clin Med (Lond)</source><year>2023</year><month>05</month><volume>23</volume><issue>3</issue><fpage>278</fpage><lpage>279</lpage><pub-id pub-id-type="doi">10.7861/clinmed.2023-0078</pub-id><pub-id pub-id-type="medline">37085182</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gandomani</surname><given-names>HS</given-names> </name></person-group><article-title>ChatGPT in medical education: how we can use in medical education: challenges and opportunities</article-title><source>J Multidiscip Care</source><year>2023</year><volume>12</volume><issue>1</issue><fpage>1</fpage><lpage>2</lpage><pub-id pub-id-type="doi">10.34172/jmdc.1232</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sani</surname><given-names>I</given-names> </name></person-group><article-title>Enhancing medical education with ChatGPT: a promising tool for the future</article-title><source>Can J 
Med</source><year>2024</year><month>04</month><day>1</day><volume>6</volume><issue>1</issue><fpage>1</fpage><lpage>4</lpage><pub-id pub-id-type="doi">10.33844/cjm.2024.6032</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hosseini</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>CA</given-names> </name><name name-style="western"><surname>Liebovitz</surname><given-names>DM</given-names> </name><etal/></person-group><article-title>An exploratory survey about using ChatGPT in education, healthcare, and research</article-title><source>PLoS One</source><year>2023</year><volume>18</volume><issue>10</issue><fpage>e0292216</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0292216</pub-id><pub-id pub-id-type="medline">37796786</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name></person-group><article-title>Harnessing the power of ChatGPT in medical education</article-title><source>Med Teach</source><year>2023</year><month>09</month><volume>45</volume><issue>9</issue><fpage>1063</fpage><pub-id pub-id-type="doi">10.1080/0142159X.2023.2198094</pub-id><pub-id pub-id-type="medline">37036161</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jeblick</surname><given-names>K</given-names> </name><name name-style="western"><surname>Schachtner</surname><given-names>B</given-names> </name><name name-style="western"><surname>Dexl</surname><given-names>J</given-names> </name><etal/></person-group><article-title>ChatGPT 
makes medicine easy to swallow: an exploratory case study on simplified radiology reports</article-title><source>Eur Radiol</source><year>2024</year><month>05</month><volume>34</volume><issue>5</issue><fpage>2817</fpage><lpage>2825</lpage><pub-id pub-id-type="doi">10.1007/s00330-023-10213-1</pub-id><pub-id pub-id-type="medline">37794249</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cheung</surname><given-names>BHH</given-names> </name><name name-style="western"><surname>Lau</surname><given-names>GKK</given-names> </name><name name-style="western"><surname>Wong</surname><given-names>GTC</given-names> </name><etal/></person-group><article-title>ChatGPT versus human in generating medical graduate exam multiple choice questions&#x2014;a multinational prospective study (Hong Kong S.A.R., Singapore, Ireland, and the United Kingdom)</article-title><source>PLoS One</source><year>2023</year><volume>18</volume><issue>8</issue><fpage>e0290691</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0290691</pub-id><pub-id pub-id-type="medline">37643186</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kung</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Cheatham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Medenilla</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title><source>PLOS Digit Health</source><year>2023</year><month>02</month><volume>2</volume><issue>2</issue><fpage>e0000198</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id><pub-id 
pub-id-type="medline">36812645</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gilson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Safranek</surname><given-names>CW</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>T</given-names> </name><etal/></person-group><article-title>How does ChatGPT perform on the United States Medical Licensing Examination (USMLE)? The implications of large language models for medical education and knowledge assessment</article-title><source>JMIR Med Educ</source><year>2023</year><month>02</month><day>8</day><volume>9</volume><fpage>e45312</fpage><pub-id pub-id-type="doi">10.2196/45312</pub-id><pub-id pub-id-type="medline">36753318</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Levin</surname><given-names>G</given-names> </name><name name-style="western"><surname>Horesh</surname><given-names>N</given-names> </name><name name-style="western"><surname>Brezinov</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Meyer</surname><given-names>R</given-names> </name></person-group><article-title>Performance of ChatGPT in medical examinations: a systematic review and a meta-analysis</article-title><source>BJOG</source><year>2024</year><month>02</month><volume>131</volume><issue>3</issue><fpage>378</fpage><lpage>380</lpage><pub-id pub-id-type="doi">10.1111/1471-0528.17641</pub-id><pub-id pub-id-type="medline">37604703</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sumbal</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Sumbal</surname><given-names>R</given-names> </name><name name-style="western"><surname>Amir</surname><given-names>A</given-names> </name></person-group><article-title>Can ChatGPT-3.5 pass a medical exam? A systematic review of ChatGPT&#x2019;s performance in academic testing</article-title><source>J Med Educ Curric Dev</source><year>2024</year><volume>11</volume><pub-id pub-id-type="doi">10.1177/23821205241238641</pub-id><pub-id pub-id-type="medline">38487300</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cherif</surname><given-names>H</given-names> </name><name name-style="western"><surname>Moussa</surname><given-names>C</given-names> </name><name name-style="western"><surname>Missaoui</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Salouage</surname><given-names>I</given-names> </name><name name-style="western"><surname>Mokaddem</surname><given-names>S</given-names> </name><name name-style="western"><surname>Dhahri</surname><given-names>B</given-names> </name></person-group><article-title>Appraisal of ChatGPT&#x2019;s aptitude for medical education: comparative analysis with third-year medical students in a pulmonology examination</article-title><source>JMIR Med Educ</source><year>2024</year><month>07</month><day>23</day><volume>10</volume><fpage>e52818</fpage><pub-id pub-id-type="doi">10.2196/52818</pub-id><pub-id pub-id-type="medline">39042876</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Gong</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>G</given-names> 
</name><etal/></person-group><article-title>ChatGPT performs on the Chinese National Medical Licensing Examination</article-title><source>J Med Syst</source><year>2023</year><month>08</month><day>15</day><volume>47</volume><issue>1</issue><fpage>86</fpage><pub-id pub-id-type="doi">10.1007/s10916-023-01961-0</pub-id><pub-id pub-id-type="medline">37581690</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alfertshofer</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hoch</surname><given-names>CC</given-names> </name><name name-style="western"><surname>Funk</surname><given-names>PF</given-names> </name><etal/></person-group><article-title>Sailing the seven seas: a multinational comparison of ChatGPT&#x2019;s performance on medical licensing examinations</article-title><source>Ann Biomed Eng</source><year>2024</year><month>06</month><volume>52</volume><issue>6</issue><fpage>1542</fpage><lpage>1545</lpage><pub-id pub-id-type="doi">10.1007/s10439-023-03338-3</pub-id><pub-id pub-id-type="medline">37553555</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="web"><article-title>Prova Nacional de Acesso &#x00E0; Forma&#x00E7;&#x00E3;o Especializada 2023 Perguntas Frequentes</article-title><source>ACSS</source><year>2023</year><access-date>2024-08-01</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.acss.min-saude.pt/wp-content/uploads/2018/09/FAQ_PNA2023.pdf">https://www.acss.min-saude.pt/wp-content/uploads/2018/09/FAQ_PNA2023.pdf</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Brin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sorin</surname><given-names>V</given-names> </name><name 
name-style="western"><surname>Vaid</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Comparing ChatGPT and GPT-4 performance in USMLE soft skill assessments</article-title><source>Sci Rep</source><year>2023</year><month>10</month><day>1</day><volume>13</volume><issue>1</issue><fpage>16492</fpage><pub-id pub-id-type="doi">10.1038/s41598-023-43436-9</pub-id><pub-id pub-id-type="medline">37779171</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McNemar</surname><given-names>Q</given-names> </name></person-group><article-title>Note on the sampling error of the difference between correlated proportions or percentages</article-title><source>Psychometrika</source><year>1947</year><month>06</month><volume>12</volume><issue>2</issue><fpage>153</fpage><lpage>157</lpage><pub-id pub-id-type="doi">10.1007/BF02295996</pub-id><pub-id pub-id-type="medline">20254758</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Edwards</surname><given-names>AL</given-names> </name></person-group><article-title>Note on the correction for continuity in testing the significance of the difference between correlated proportions</article-title><source>Psychometrika</source><year>1948</year><month>09</month><volume>13</volume><issue>3</issue><fpage>185</fpage><lpage>187</lpage><pub-id pub-id-type="doi">10.1007/BF02289261</pub-id><pub-id pub-id-type="medline">18885738</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oztermeli</surname><given-names>AD</given-names> </name><name name-style="western"><surname>Oztermeli</surname><given-names>A</given-names> 
</name></person-group><article-title>ChatGPT performance in the medical specialty exam: an observational study</article-title><source>Medicine (Baltimore)</source><year>2023</year><month>08</month><day>11</day><volume>102</volume><issue>32</issue><fpage>e34673</fpage><pub-id pub-id-type="doi">10.1097/MD.0000000000034673</pub-id><pub-id pub-id-type="medline">37565917</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ebrahimian</surname><given-names>M</given-names> </name><name name-style="western"><surname>Behnam</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ghayebi</surname><given-names>N</given-names> </name><name name-style="western"><surname>Sobhrakhshankhah</surname><given-names>E</given-names> </name></person-group><article-title>ChatGPT in Iranian medical licensing examination: evaluating the diagnostic accuracy and decision-making capabilities of an AI-based model</article-title><source>BMJ Health Care Inform</source><year>2023</year><month>12</month><day>11</day><volume>30</volume><issue>1</issue><fpage>e100815</fpage><pub-id pub-id-type="doi">10.1136/bmjhci-2023-100815</pub-id><pub-id pub-id-type="medline">38081765</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>W&#x00F3;jcik</surname><given-names>S</given-names> </name><name name-style="western"><surname>Rulkiewicz</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pruszczyk</surname><given-names>P</given-names> </name><name name-style="western"><surname>Lisik</surname><given-names>W</given-names> </name><name name-style="western"><surname>Pobo&#x017C;y</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Domienik-Kar&#x0142;owicz</surname><given-names>J</given-names> </name></person-group><article-title>Reshaping medical education: performance of ChatGPT on a PES medical examination</article-title><source>Cardiol J</source><year>2024</year><volume>31</volume><issue>3</issue><fpage>442</fpage><lpage>450</lpage><pub-id pub-id-type="doi">10.5603/cj.97517</pub-id><pub-id pub-id-type="medline">37830257</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Roso&#x0142;</surname><given-names>M</given-names> </name><name name-style="western"><surname>G&#x0105;sior</surname><given-names>JS</given-names> </name><name name-style="western"><surname>&#x0141;aba</surname><given-names>J</given-names> </name><name name-style="western"><surname>Korzeniewski</surname><given-names>K</given-names> </name><name name-style="western"><surname>M&#x0142;y&#x0144;czak</surname><given-names>M</given-names> </name></person-group><article-title>Evaluation of the performance of GPT-3.5 and GPT-4 on the Polish Medical Final Examination</article-title><source>Sci Rep</source><year>2023</year><month>11</month><day>22</day><volume>13</volume><issue>1</issue><fpage>20512</fpage><pub-id pub-id-type="doi">10.1038/s41598-023-46995-z</pub-id><pub-id pub-id-type="medline">37993519</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guillen-Grima</surname><given-names>F</given-names> </name><name name-style="western"><surname>Guillen-Aguinaga</surname><given-names>S</given-names> </name><name name-style="western"><surname>Guillen-Aguinaga</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Evaluating the efficacy of ChatGPT in navigating the Spanish Medical Residency Entrance Examination (MIR): 
promising horizons for AI in clinical medicine</article-title><source>Clin Pract</source><year>2023</year><month>11</month><day>20</day><volume>13</volume><issue>6</issue><fpage>1460</fpage><lpage>1487</lpage><pub-id pub-id-type="doi">10.3390/clinpract13060130</pub-id><pub-id pub-id-type="medline">37987431</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kaneda</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Takahashi</surname><given-names>R</given-names> </name><name name-style="western"><surname>Kaneda</surname><given-names>U</given-names> </name><etal/></person-group><article-title>Assessing the performance of GPT-3.5 and GPT-4 on the 2023 Japanese nursing examination</article-title><source>Cureus</source><year>2023</year><month>08</month><volume>15</volume><issue>8</issue><fpage>e42924</fpage><pub-id pub-id-type="doi">10.7759/cureus.42924</pub-id><pub-id pub-id-type="medline">37667724</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Flores-Cohaila</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Garc&#x00ED;a-Vicente</surname><given-names>A</given-names> </name><name name-style="western"><surname>Vizcarra-Jim&#x00E9;nez</surname><given-names>SF</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on the Peruvian National Licensing Medical Examination: cross-sectional study</article-title><source>JMIR Med Educ</source><year>2023</year><month>09</month><day>28</day><volume>9</volume><fpage>e48039</fpage><pub-id pub-id-type="doi">10.2196/48039</pub-id><pub-id pub-id-type="medline">37768724</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Kruger</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wirtz</surname><given-names>D</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>DT</given-names> </name></person-group><article-title>Counterfactual thinking and the first instinct fallacy</article-title><source>J Pers Soc Psychol</source><year>2005</year><month>05</month><volume>88</volume><issue>5</issue><fpage>725</fpage><lpage>735</lpage><pub-id pub-id-type="doi">10.1037/0022-3514.88.5.725</pub-id><pub-id pub-id-type="medline">15898871</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Coutinho</surname><given-names>MVC</given-names> </name><name name-style="western"><surname>Thomas</surname><given-names>J</given-names> </name><name name-style="western"><surname>Fredricks-Lowman</surname><given-names>I</given-names> </name><name name-style="western"><surname>Alkaabi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Couchman</surname><given-names>JJ</given-names> </name></person-group><article-title>Unskilled and unaware: second-order judgments increase with miscalibration for low performers</article-title><source>Front Psychol</source><year>2024</year><volume>15</volume><fpage>1252520</fpage><pub-id pub-id-type="doi">10.3389/fpsyg.2024.1252520</pub-id><pub-id pub-id-type="medline">38952836</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Roos</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kasapovic</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jansen</surname><given-names>T</given-names> </name><name 
name-style="western"><surname>Kaczmarczyk</surname><given-names>R</given-names> </name></person-group><article-title>Artificial intelligence in medical education: comparative analysis of ChatGPT, Bing, and medical students in Germany</article-title><source>JMIR Med Educ</source><year>2023</year><month>09</month><day>4</day><volume>9</volume><fpage>e46482</fpage><pub-id pub-id-type="doi">10.2196/46482</pub-id><pub-id pub-id-type="medline">37665620</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fowler</surname><given-names>T</given-names> </name><name name-style="western"><surname>Pullen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Birkett</surname><given-names>L</given-names> </name></person-group><article-title>Performance of ChatGPT and Bard on the official part 1 FRCOphth practice questions</article-title><source>Br J Ophthalmol</source><year>2024</year><month>10</month><volume>108</volume><issue>10</issue><fpage>1379</fpage><lpage>1383</lpage><pub-id pub-id-type="doi">10.1136/bjo-2023-324091</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>RS</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>KJQ</given-names> </name><name name-style="western"><surname>Meaney</surname><given-names>C</given-names> </name><name name-style="western"><surname>Kemppainen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Punnett</surname><given-names>A</given-names> </name><name name-style="western"><surname>Leung</surname><given-names>FH</given-names> </name></person-group><article-title>Assessment of resident and AI chatbot performance on the University of Toronto Family Medicine Residency Progress Test: 
comparative study</article-title><source>JMIR Med Educ</source><year>2023</year><month>09</month><day>19</day><volume>9</volume><fpage>e50514</fpage><pub-id pub-id-type="doi">10.2196/50514</pub-id><pub-id pub-id-type="medline">37725411</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wong</surname><given-names>RSY</given-names> </name><name name-style="western"><surname>Ming</surname><given-names>LC</given-names> </name><name name-style="western"><surname>Raja Ali</surname><given-names>RA</given-names> </name></person-group><article-title>The intersection of ChatGPT, clinical medicine, and medical education</article-title><source>JMIR Med Educ</source><year>2023</year><month>11</month><day>21</day><volume>9</volume><fpage>e47274</fpage><pub-id pub-id-type="doi">10.2196/47274</pub-id><pub-id pub-id-type="medline">37988149</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Miao</surname><given-names>J</given-names> </name></person-group><article-title>Opportunities, challenges, and future directions of large language models, including ChatGPT in medical education: a systematic scoping review</article-title><source>J Educ Eval Health Prof</source><year>2024</year><volume>21</volume><issue>6</issue><fpage>6</fpage><pub-id pub-id-type="doi">10.3352/jeehp.2024.21.6</pub-id><pub-id pub-id-type="medline">38486402</pub-id></nlm-citation></ref></ref-list></back></article>