@Article{info:doi/10.2196/65108,
  author   = "Prazeres, Filipe",
  title    = "ChatGPT's Performance on Portuguese Medical Examination Questions: Comparative Analysis of ChatGPT-3.5 Turbo and ChatGPT-4o Mini",
  journal  = "JMIR Med Educ",
  year     = "2025",
  month    = "Mar",
  day      = "5",
  volume   = "11",
  pages    = "e65108",
  keywords = "ChatGPT-3.5 Turbo; ChatGPT-4o mini; medical examination; European Portuguese; AI performance evaluation; Portuguese; evaluation; medical examination questions; examination question; chatbot; ChatGPT; model; artificial intelligence; AI; GPT; LLM; NLP; natural language processing; machine learning; large language model",
  abstract = "Background: Advancements in ChatGPT are transforming medical education by providing new tools for assessment and learning, potentially enhancing evaluations for doctors and improving instructional effectiveness. Objective: This study evaluates the performance and consistency of ChatGPT-3.5 Turbo and ChatGPT-4o mini in solving European Portuguese medical examination questions (2023 National Examination for Access to Specialized Training; Prova Nacional de Acesso {\`a} Forma{\c{c}}{\~a}o Especializada [PNA]) and compares their performance to human candidates. Methods: ChatGPT-3.5 Turbo was tested on the first part of the examination (74 questions) on July 18, 2024, and ChatGPT-4o mini on the second part (74 questions) on July 19, 2024. Each model generated an answer using its natural language processing capabilities. To test consistency, each model was asked, ``Are you sure?'' after providing an answer. Differences between the first and second responses of each model were analyzed using the McNemar test with continuity correction. A single-parameter t test compared the models' performance to human candidates. Frequencies and percentages were used for categorical variables, and means and CIs for numerical variables. Statistical significance was set at P<.05. Results: ChatGPT-4o mini achieved an accuracy rate of 65{\%} (48/74) on the 2023 PNA examination, surpassing ChatGPT-3.5 Turbo. ChatGPT-4o mini outperformed medical candidates, while ChatGPT-3.5 Turbo had a more moderate performance. Conclusions: This study highlights the advancements and potential of ChatGPT models in medical education, emphasizing the need for careful implementation with teacher oversight and further research.",
  issn     = "2369-3762",
  doi      = "10.2196/65108",
  url      = "https://mededu.jmir.org/2025/1/e65108"
}