<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id><journal-id journal-id-type="publisher-id">mededu</journal-id><journal-id journal-id-type="index">20</journal-id><journal-title>JMIR Medical Education</journal-title><abbrev-journal-title>JMIR Med Educ</abbrev-journal-title><issn pub-type="epub">2369-3762</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v11i1e78320</article-id><article-id pub-id-type="doi">10.2196/78320</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Impact of Prompt Engineering on the Performance of ChatGPT Variants Across Different Question Types in Medical Student Examinations: Cross-Sectional Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Hsieh</surname><given-names>Ming-Yu</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Tzu-Ling</given-names></name><degrees>MSc, RNC</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Su</surname><given-names>Pen-Hua</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="fn" 
rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Chou</surname><given-names>Ming-Chih</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>Division of Pediatric Surgery, Department of Surgery, Chung Shan Medical University Hospital</institution><addr-line>Taichung City</addr-line><country>Taiwan</country></aff><aff id="aff2"><institution>Institute of Medicine, School of Medicine, Chung Shan Medical University</institution><addr-line>No. 110, Sec. 2, Jiang Kuo South Road, South district</addr-line><addr-line>Taichung City</addr-line><country>Taiwan</country></aff><aff id="aff3"><institution>Department of Nursing, School of Medicine, Chung Shan Medical University</institution><addr-line>Taichung City</addr-line><country>Taiwan</country></aff><aff id="aff4"><institution>Department of Pediatrics, Chung Shan Medical University Hospital</institution><addr-line>Taichung City</addr-line><country>Taiwan</country></aff><aff id="aff5"><institution>Division of Thoracic Surgery, Department of Surgery, Chung Shan Medical University Hospital</institution><addr-line>Taichung City</addr-line><country>Taiwan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sian</surname><given-names>T</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Raut</surname><given-names>Ganesh</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Tsai</surname><given-names>Hsinlin</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Ming-Chih Chou, MD, PhD, Institute of Medicine, School of Medicine, Chung Shan Medical University, No. 110, Sec. 
2, Jiang Kuo South Road, South district, Taichung City, Taiwan, 886 4-24739595 ext 34601; <email>zhou.mingzhi.csmu@gmail.com</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>1</day><month>10</month><year>2025</year></pub-date><volume>11</volume><elocation-id>e78320</elocation-id><history><date date-type="received"><day>30</day><month>05</month><year>2025</year></date><date date-type="rev-recd"><day>06</day><month>08</month><year>2025</year></date><date date-type="accepted"><day>06</day><month>08</month><year>2025</year></date></history><copyright-statement>&#x00A9; Ming-Yu Hsieh, Tzu-Ling Wang, Pen-Hua Su, Ming-Chih Chou. Originally published in JMIR Medical Education (<ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org">https://mededu.jmir.org</ext-link>), 1.10.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org/">https://mededu.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mededu.jmir.org/2025/1/e78320"/><abstract><sec><title>Background</title><p>Large language models such as ChatGPT (OpenAI) have shown promise in medical education assessments, but the comparative effects of prompt engineering across optimized variants and relative performance against medical students remain unclear.</p></sec><sec><title>Objective</title><p>This study aims to systematically evaluate the impact of prompt engineering on five ChatGPT variants (GPT-3.5, GPT-4.0, GPT-4o, GPT-4o1-mini, and GPT-4o1) and benchmark their performance against fourth-year medical students in midterm and final examinations.</p></sec><sec sec-type="methods"><title>Methods</title><p>A 100-item examination dataset covering multiple choice questions, short answer questions, clinical case analysis, and image-based questions was administered to each model under no-prompt and prompt-engineering conditions over 5 independent runs. Student cohort scores (N=143) were collected for comparison. Responses were scored using standardized rubrics, converted to percentages, and analyzed in SPSS Statistics (v29.0) with paired <italic>t</italic> tests and Cohen <italic>d</italic> (<italic>P</italic>&#x003C;.05).</p></sec><sec sec-type="results"><title>Results</title><p>Baseline midterm scores ranged from 59.2% (GPT-3.5) to 94.1% (GPT-4o1), and final scores ranged from 55% to 92.4%. Fourth-year students averaged 89.4% (midterm) and 80.2% (final). Prompt engineering significantly improved GPT-3.5 (10.6%, <italic>P</italic>&#x003C;.001) and GPT-4.0 (3.2%, <italic>P</italic>=.002) but yielded negligible gains for optimized variants (<italic>P</italic>=.07&#x2010;.94). 
Optimized models matched or exceeded student performance on both exams.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Prompt engineering enhances early-generation model performance, whereas advanced variants inherently achieve near-ceiling accuracy, surpassing medical students. As large language models mature, emphasis should shift from prompt design to model selection, multimodal integration, and critical use of artificial intelligence as a learning companion.</p></sec></abstract><kwd-group><kwd>ChatGPT</kwd><kwd>prompt engineering</kwd><kwd>medical education</kwd><kwd>large language models</kwd><kwd>assessment performance</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>The integration of large language models (LLMs), such as OpenAI&#x2019;s ChatGPT series, into medical education has generated considerable interest due to their potential for automating and enhancing assessment processes [<xref ref-type="bibr" rid="ref1">1</xref>]. Initial investigations focused on foundational models&#x2014;ChatGPT 3.5 and ChatGPT 4.0&#x2014;to benchmark baseline performance in answering clinical and basic science questions representative of medical student examinations [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>]. These early studies demonstrated that, compared to ChatGPT 3.5, ChatGPT 4.0 exhibited significant improvements in overall accuracy, reasoning ability, and contextual understanding, particularly in complex scenario-based items [<xref ref-type="bibr" rid="ref3">3</xref>]. 
Prompt engineering&#x2014;providing structured guidance within the input prompt&#x2014;was shown to further augment performance for both models, yielding notable score increases and reducing variance across repeated trials [<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>Since the release of ChatGPT 4.0, OpenAI has introduced enhanced variants optimized for performance and generalization: GPT-4o (optimized for multimodal tasks), GPT-4o1-mini (a compact, latency-reduced iteration), and GPT-4o1 (the full-capacity optimized version) [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. However, the comparative effects of prompt engineering on these advanced models remain unexplored. Given their architectural refinements and improved instruction-following capabilities, it is plausible that newer variants may naturally internalize prompt structures, diminishing the incremental benefit of explicit guidance.</p><p>This study expands upon prior work by systematically evaluating the impact of prompt engineering across 5 ChatGPT variants&#x2014;3.5, 4.0, 4o, 4o1-mini, and 4o1&#x2014;using a robust examination framework comprising multiple question types (multiple choice questions, MCQs; short answer questions, SAQs; clinical case analysis, CCA; and image-based interpretation, IBI). By comparing performance with and without structured prompts, we aim to quantify the degree to which prompt dependency has evolved alongside model iterations. 
The findings will inform best practices for leveraging LLMs in high-stakes medical education settings and contribute to understanding the maturation of prompt engineering as a methodology.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design and Model Selection</title><p>This cross-sectional evaluation compared the performance of 5 OpenAI GPT variants: ChatGPT 3.5 (GPT-3.5-turbo), ChatGPT 4.0 (GPT-4), GPT-4o (optimized for multimodal tasks), GPT-4o1-mini (compact and latency-reduced), and GPT-4o1 (full-capacity optimized). Each variant was assessed under 2 prompting conditions: without a structured prompt (N) and with prompt engineering (P) (<xref ref-type="fig" rid="figure1">Figure 1</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Workflow of the study. Midterm and final gastrointestinal-module exams (100 items each) were administered to 143 fourth-year medical students and to 5 ChatGPT variants (GPT-3.5, GPT-4, GPT-4o, GPT-4o1-mini, and GPT-4o1) under 2 prompting conditions (no prompt vs prompt engineering). Each model or condition combination was run 5 times, responses were scored against the official key by 2 blinded reviewers, and performance metrics (mean, SD, <italic>P</italic> values, and effect sizes) as well as exam item difficulty and discrimination were analyzed.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v11i1e78320_fig01.png"/></fig></sec><sec id="s2-2"><title>Selecting the Student Cohort</title><p>Our curriculum uses modular teaching from the second semester of year 3 through the first semester of year 4. We selected fourth-year medical students to ensure participants had completed the modular clinical instruction, avoiding the variability introduced by students newly exposed to clinical modules. 
This choice ensures that the student cohort had comparable training before exam administration.</p></sec><sec id="s2-3"><title>Examination Dataset and Question Types</title><p>We curated a 100-item question set drawn from the official medical student midterm and final examinations administered at Chung Shan Medical University in the 2024&#x2010;2025 academic year. The items encompassed 4 question types:</p><list list-type="bullet"><list-item><p>MCQs: single-best-answer format (n=40)</p></list-item><list-item><p>SAQs: 1&#x2010;2 sentence responses (n=20)</p></list-item><list-item><p>CCA: open-ended diagnostic and management scenarios (n=20)</p></list-item><list-item><p>IBI: radiographic or histologic images requiring identification or explanation (n=20)</p></list-item></list></sec><sec id="s2-4"><title>Prompt Engineering</title><p>For the P condition, each item was prefaced with a standardized instruction prompt:</p><p>&#x201C;You are an expert medical educator. Answer the following question concisely and justify your reasoning step by step.&#x201D;</p><p>In the N condition, only the question stem was presented.</p></sec><sec id="s2-5"><title>Evaluation Procedure</title><p>For each model and condition, we conducted 5 independent runs (rounds) to account for stochastic variations. In each run, the model received the full 100-item set sequentially via the OpenAI API (v1) with default temperature settings (temperature=0.7, max_tokens=512). Outputs were recorded and collated.</p></sec><sec id="s2-6"><title>Scoring and Outcome Measures</title><p>Responses were scored against the official answer key by 2 independent reviewers blinded to model and condition. MCQs were scored dichotomously (1 point for correct answer, otherwise 0). SAQs and IBI were scored on a 0&#x2010;2 scale (0=incorrect or missing, 1=partially correct, and 2=fully correct). CCA responses were scored on a 0&#x2010;3 rubric evaluating diagnostic accuracy, management plan, and justification. 
Total raw scores were converted to percentages of maximum possible scores.</p></sec><sec id="s2-7"><title>Statistical Analysis</title><p>Statistical analyses were conducted in SPSS Statistics (version 29, IBM Corp). For each model and condition, descriptive statistics (mean scores and SD) were derived using the frequencies and descriptives procedures. Paired 2-tailed <italic>t</italic> tests (paired-samples <italic>t</italic> test) compared scores between the no-prompt (N) and prompt-engineering (P) conditions within each variant. Effect sizes (Cohen <italic>d</italic>) were calculated based on mean differences and pooled SDs. Statistical significance was set at <italic>P</italic>&#x003C;.05, and all tests were 2-sided.</p></sec><sec id="s2-8"><title>Ethical Considerations</title><p>The study protocol was reviewed and approved by the Institutional Review Board (IRB) of Chung Shan Medical University Hospital (approval number CSMU-2024-075), in accordance with institutional policies and the Declaration of Helsinki. The IRB granted a waiver of written informed consent because the research analyzed routinely collected deidentified examination records, involved no direct interaction or intervention with students, and posed minimal risk. Before transfer to the study team, all records were deidentified by the medical school; no direct identifiers (eg, names, student IDs, email, and IP addresses) were accessed. Analyses were performed on files labeled with random study codes, and only aggregate results are reported. Data were stored on password-protected institutional servers with access restricted to the research team and will be retained per institutional policy; no individual-level raw data will be publicly shared. 
Participants received no compensation, and inclusion in the dataset had no impact on grades, course standing, or academic evaluation.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overall Performance on Midterm and Final Examinations</title><p>As summarized in <xref ref-type="table" rid="table1">Table 1</xref>, baseline performance of ChatGPT 3.5 on the midterm examination was 59.2% (SD 2.1), whereas ChatGPT 4.0 achieved 81.4% (SD 1.8). The optimized variants&#x2014;GPT-4o, GPT-4o1-mini, and GPT-4o1&#x2014;further improved mean midterm scores to 91.3% (SD 0.8), 86.1% (SD 1), and 94.1% (SD 0.5), respectively (<xref ref-type="table" rid="table1">Table 1</xref>). A similar trend was observed for the final examination (<xref ref-type="table" rid="table1">Table 1</xref>), where GPT-3.5 scored 55% (SD 2.4) and GPT-4.0 scored 84.2% (SD 1.7), with GPT-4o, GPT-4o1-mini, and GPT-4o1 achieving 90.6% (SD 0.9), 82.1% (SD 0.6), and 92.4% (SD 0.6), respectively.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Basic information of the exams: overall performance of ChatGPT variants on midterm and final examinations. 
Mean percentage scores (SD) for 5 GPT models (GPT-3.5, GPT-4.0, GPT-4o, GPT-4o1-mini, and GPT-4o1) under no-prompt and prompt-engineering conditions are listed, illustrating baseline accuracy and comparative gains across both examinations.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Exams</td><td align="left" valign="bottom">Midterm exams</td><td align="left" valign="bottom">Final exams</td></tr></thead><tbody><tr><td align="left" valign="top">Total questions, N</td><td align="left" valign="top">100</td><td align="left" valign="top">100</td></tr><tr><td align="left" valign="top">&#x2003;Overall discrimination, mean (SD)</td><td align="left" valign="top">0.25 (0.18)</td><td align="left" valign="top">0.32 (0.21)</td></tr><tr><td align="left" valign="top">&#x2003;Overall difficulty level, mean (SD)</td><td align="left" valign="top">0.82 (0.14)</td><td align="left" valign="top">0.72 (0.18)</td></tr><tr><td align="left" valign="top">Memorization questions, n</td><td align="left" valign="top">66</td><td align="left" valign="top">63</td></tr><tr><td align="left" valign="top">&#x2003;Discrimination, mean (SD)</td><td align="left" valign="top">0.27 (0.16)</td><td align="left" valign="top">0.34 (0.20)</td></tr><tr><td align="left" valign="top">&#x2003;Difficulty level, mean (SD)</td><td align="left" valign="top">0.84 (0.12)</td><td align="left" valign="top">0.74 (0.15)</td></tr><tr><td align="left" valign="top">Application questions, n</td><td align="left" valign="top">34</td><td align="left" valign="top">37</td></tr><tr><td align="left" valign="top">&#x2003;Discrimination, mean (SD)</td><td align="left" valign="top">0.21 (0.20)</td><td align="left" valign="top">0.29 (0.22)</td></tr><tr><td align="left" valign="top">&#x2003;Difficulty level, mean (SD)</td><td align="left" valign="top">0.78 (0.16)</td><td align="left" valign="top">0.69 (0.21)</td></tr></tbody></table></table-wrap></sec><sec 
id="s3-2"><title>Comparison With Medical Student Performance</title><p>A cohort of 143 fourth-year medical students took the identical midterm and final examinations, achieving a mean midterm score of 89.4% (SD 7.13) and a mean final score of 80.2% (SD 8.73) (<xref ref-type="table" rid="table2">Table 2</xref>). GPT-3.5 underperformed relative to students (59.2% vs 89.4%, <italic>P</italic>&#x003C;.001; 55% vs 80.2%, <italic>P</italic>&#x003C;.001), whereas advanced variants such as GPT-4o1 matched or exceeded student performance on both the midterm (94.1% vs 89.4%, <italic>P</italic>&#x003C;.001) and final exams (92.4% vs 80.2%, <italic>P</italic>&#x003C;.001) (<xref ref-type="table" rid="table3">Table 3</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>GPTs&#x2019; performance in different question types.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">ChatGPT versions</td><td align="left" valign="bottom">GPT-3.5N</td><td align="left" valign="bottom">GPT-3.5P</td><td align="left" valign="bottom">GPT-4N</td><td align="left" valign="bottom">GPT-4P</td><td align="left" valign="bottom">GPT-4oN</td><td align="left" valign="bottom">GPT-4oP</td><td align="left" valign="bottom">GPT-o1miniN</td><td align="left" valign="bottom">GPT-o1miniP</td><td align="left" valign="bottom">GPT-o1N</td><td align="left" valign="bottom">GPT-o1P</td><td align="left" valign="bottom">Students</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="12">Midterm exams</td></tr><tr><td align="left" valign="top" colspan="12"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Memorization questions</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Correct rate (%)</td><td align="left" 
valign="top">63.55</td><td align="left" valign="top">73.23</td><td align="left" valign="top">87.74</td><td align="left" valign="top">90.97</td><td align="left" valign="top">91.94</td><td align="left" valign="top">91.29</td><td align="left" valign="top">91.29</td><td align="left" valign="top">91.61</td><td align="left" valign="top">97.42</td><td align="left" valign="top">95.81</td><td align="left" valign="top">89.79</td></tr><tr><td align="left" valign="top" colspan="12"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Application questions</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Correct rate (%)</td><td align="left" valign="top">56.57</td><td align="left" valign="top">69.71</td><td align="left" valign="top">77.14</td><td align="left" valign="top">80.57</td><td align="left" valign="top">91.43</td><td align="left" valign="top">90.29</td><td align="left" valign="top">78.86</td><td align="left" valign="top">80</td><td align="left" valign="top">88</td><td align="left" valign="top">91.43</td><td align="left" valign="top">92.78</td></tr><tr><td align="left" valign="top" colspan="12">Final exams</td></tr><tr><td align="left" valign="top" colspan="12"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Memorization questions</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Correct rate (%)</td><td align="left" valign="top">56.62</td><td align="left" valign="top">64.31</td><td align="left" valign="top">86.46</td><td align="left" valign="top">91.08</td><td align="left" valign="top">89.54</td><td align="left" valign="top">91.08</td><td align="left" valign="top">85.23</td><td 
align="left" valign="top">88</td><td align="left" valign="top">94.15</td><td align="left" valign="top">95.08</td><td align="left" valign="top">89.79</td></tr><tr><td align="left" valign="top" colspan="12"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Application questions</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Correct rate (%)</td><td align="left" valign="top">67.57</td><td align="left" valign="top">64.86</td><td align="left" valign="top">89.19</td><td align="left" valign="top">94.59</td><td align="left" valign="top">89.19</td><td align="left" valign="top">91.89</td><td align="left" valign="top">75.68</td><td align="left" valign="top">78.38</td><td align="left" valign="top">91.89</td><td align="left" valign="top">89.19</td><td align="left" valign="top">92.78</td></tr></tbody></table></table-wrap><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Comparison of ChatGPT variants and student cohort performance. 
Mean percentage scores (SD) are listed for each GPT model and a cohort of 143 fourth-year medical students on midterm and final examinations, with statistical significance (<italic>P</italic> values) for model versus student differences.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">ChatGPT versions</td><td align="left" valign="bottom">GPT-3.5N</td><td align="left" valign="bottom">GPT-3.5P</td><td align="left" valign="bottom">GPT-4N</td><td align="left" valign="bottom">GPT-4P</td><td align="left" valign="bottom">GPT-4oN</td><td align="left" valign="bottom">GPT-4oP</td><td align="left" valign="bottom">GPT-4o1-mini-N</td><td align="left" valign="bottom">GPT-4o1-mini-P</td><td align="left" valign="bottom">GPT-4o1N</td><td align="left" valign="bottom">GPT-4o1P</td><td align="left" valign="bottom">Students</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="12">Midterm exams</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Original score, mean (SD)</td><td align="left" valign="top">61.03<break/>(0.84)</td><td align="left" valign="top">71.96<break/>(1.64)</td><td align="left" valign="top">83.92<break/>(1.14)</td><td align="left" valign="top">87.22<break/>(1.14)</td><td align="left" valign="top">91.75<break/>(0.71)</td><td align="left" valign="top">90.93<break/>(0.84)</td><td align="left" valign="top">86.80<break/>(0.84)</td><td align="left" valign="top">87.42<break/>(1.10)</td><td align="left" valign="top">94.02<break/>(0.45)</td><td align="left" valign="top">94.23<break/>(0.55)</td><td align="left" valign="top">89.4<break/>(7.13)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Standardized score</td><td align="left" valign="top">&#x2212;2.81</td><td align="left" valign="top">&#x2212;1.72</td><td align="left" 
valign="top">&#x2212;0.52</td><td align="left" valign="top">&#x2212;0.19</td><td align="left" valign="top">0.26</td><td align="left" valign="top">0.18</td><td align="left" valign="top">&#x2212;0.23</td><td align="left" valign="top">&#x2212;0.17</td><td align="left" valign="top">0.49</td><td align="left" valign="top">0.51</td><td align="left" valign="top">0.03</td></tr><tr><td align="left" valign="top" colspan="12">Final exams</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Original score, mean (SD)</td><td align="left" valign="top">60.59<break/>(0.45)</td><td align="left" valign="top">64.31<break/>(0.55)</td><td align="left" valign="top">87.84<break/>(0.55)</td><td align="left" valign="top">92.35<break/>(0.45)</td><td align="left" valign="top">90.20<break/>(0.71)</td><td align="left" valign="top">91.18<break/>(0.71)</td><td align="left" valign="top">81.57<break/>(0.45)</td><td align="left" valign="top">84.71<break/>(0.55)</td><td align="left" valign="top">92.75<break/>(0.55)</td><td align="left" valign="top">91.57<break/>(0.55)</td><td align="left" valign="top">80.2<break/>(8.73)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Standardized score</td><td align="left" valign="top">&#x2212;2.77</td><td align="left" valign="top">&#x2212;1.85</td><td align="left" valign="top">0.76</td><td align="left" valign="top">1.26</td><td align="left" valign="top">1.02</td><td align="left" valign="top">1.13</td><td align="left" valign="top">0.06</td><td align="left" valign="top">0.41</td><td align="left" valign="top">1.31</td><td align="left" valign="top">1.17</td><td align="left" valign="top">&#x2212;0.01</td></tr></tbody></table></table-wrap></sec><sec id="s3-3"><title>Performance by Question Type</title><p><xref ref-type="table" rid="table2">Table 2</xref> presents model and student accuracy across 4 question types. 
All variants and students achieved the highest accuracy on MCQs, with GPT-4o1 reaching 98.5% (SD 1.2), students 92.3% (SD 5), and GPT-3.5 the lowest at 70.4% (SD 3). SAQs followed a similar pattern, ranging from 62.3% (SD 2.7) for GPT-3.5 to 92.1% (SD 1.5) for GPT-4o1, with students at 85.6% (SD 6.8). CCA yielded the greatest variability: GPT-3.5 scored 48.7% (SD 3.5) versus 88.4% (SD 2) for GPT-4o1 and 75.2% (SD 8.1) for students. IBI performance ranged from 55.2% (SD 3.1) for GPT-3.5 to 90.2% (SD 1.8) for GPT-4o1, with student IBI at 78.5% (SD 7.5).</p></sec><sec id="s3-4"><title>Error Analysis by Question Type</title><p>To further elucidate model performance nuances, we analyzed error rates across question types. CCA questions exhibited an error rate approximately 3 times higher than memory recall items when answered by early ChatGPT models (GPT-3.5 and GPT-4.0). Representative examples include:</p><list list-type="bullet"><list-item><p>Memory recall: describing regulators of gastric acid secretion&#x2014;GPT-3.5 misidentified pancreatic enzymes as inhibitory due to misinterpreting &#x201C;major.&#x201D;</p></list-item><list-item><p>CCA: a 65-year-old man with acute abdominal pain&#x2014;GPT-3.5 attributed findings to pancreatitis; GPT-4o1-mini (no prompt) misdiagnosed cholecystitis.</p></list-item><list-item><p>Short answer: advantages of laparoscopic appendectomy&#x2014;GPT-4.0 only cited &#x201C;smaller incision,&#x201D; omitting recovery and complication benefits.</p></list-item><list-item><p>Image interpretation: abdominal X-ray&#x2014;GPT-3.5 confused free air with pneumatosis intestinalis; GPT-4o (with prompt) correctly identified small-bowel obstruction. Subsequent optimized variants (GPT-4o, GPT-4o1-mini, and GPT-4o1) trained on broader multilingual corpora reduced CCA error rates, with GPT-4o1 achieving 88.4% accuracy (<xref ref-type="table" rid="table2">Table 2</xref>), approaching student performance (75.2%). 
This analysis highlights specific failure modes and improvements in reasoning and language comprehension.</p></list-item></list></sec><sec id="s3-5"><title>Effect of Prompt Engineering</title><p>As shown in <xref ref-type="table" rid="table4">Table 4</xref>, prompt engineering significantly enhanced performance for early models. For the midterm, GPT-3.5 improved from 59.2% to 69.8% (Cohen <italic>d</italic>=1.5; <italic>P</italic>&#x003C;.001), and GPT-4.0 from 81.4% to 84.6% (Cohen <italic>d</italic>=0.7; <italic>P</italic>=.002). In contrast, advanced variants exhibited no significant benefit: GPT-4o (91.3% vs 91.6%; <italic>P</italic>=.07), GPT-4o1-mini (86.1% vs 87.4%; <italic>P</italic>=.69), and GPT-4o1 (94.1% vs 94.2%; <italic>P</italic>=.55). Similar patterns were observed in the final exam (<xref ref-type="table" rid="table4">Table 4</xref>), where prompt-engineering scores for GPT-3.5 and GPT-4.0 increased significantly (<italic>P</italic>&#x003C;.01), but not for GPT-4o (<italic>P</italic>=.94), GPT-4o1-mini (<italic>P</italic>=.58), or GPT-4o1 (<italic>P</italic>=.24).</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Investigation of the scores in different prompt modes: effect of prompt engineering on ChatGPT performance. 
Paired comparison of mean (SD) scores and <italic>P</italic> values is listed for each model variant under no-prompt versus prompt-engineering conditions, highlighting the variable benefit of structured prompts across model generations.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Rounds</td><td align="left" valign="bottom">1</td><td align="left" valign="bottom">2</td><td align="left" valign="bottom">3</td><td align="left" valign="bottom">4</td><td align="left" valign="bottom">5</td><td align="left" valign="bottom" colspan="2">Mean (SD)</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="9">Midterm exams</td></tr><tr><td align="left" valign="top">&#x2003;GPT-3.5N</td><td align="left" valign="top">60</td><td align="left" valign="top">59</td><td align="left" valign="top">58</td><td align="left" valign="top">60</td><td align="left" valign="top">59</td><td align="left" valign="top" colspan="2">59.2 (0.84)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">&#x2003;GPT-3.5P</td><td align="left" valign="top">68</td><td align="left" valign="top">72</td><td align="left" valign="top">71</td><td align="left" valign="top">69</td><td align="left" valign="top">69</td><td align="left" valign="top" colspan="2">69.8 (1.64)</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td></tr><tr><td align="left" valign="top">&#x2003;GPT-4N</td><td align="left" valign="top">80</td><td align="left" valign="top">81</td><td align="left" valign="top">81</td><td align="left" valign="top">82</td><td align="left" valign="top">83</td><td align="left" valign="top" colspan="2">81.4 (1.14)</td><td align="left" valign="top">.002</td></tr><tr><td align="left" valign="top">&#x2003;GPT-4P</td><td align="left" valign="top">83</td><td align="left" valign="top">84</td><td align="left" 
valign="top">85</td><td align="left" valign="top">85</td><td align="left" valign="top">86</td><td align="left" valign="top" colspan="2">84.6 (1.14)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">&#x2003;GPT-4oN</td><td align="left" valign="top">88</td><td align="left" valign="top">89</td><td align="left" valign="top">88</td><td align="left" valign="top">90</td><td align="left" valign="top">88</td><td align="left" valign="top" colspan="2">88.6 (0.89)</td><td align="left" valign="top">.07</td></tr><tr><td align="left" valign="top">&#x2003;GPT-4oP</td><td align="left" valign="top">89</td><td align="left" valign="top">90</td><td align="left" valign="top">90</td><td align="left" valign="top">89</td><td align="left" valign="top">90</td><td align="left" valign="top" colspan="2">89.6 (0.55)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">&#x2003;GPT-4o1-miniN</td><td align="left" valign="top">91</td><td align="left" valign="top">90</td><td align="left" valign="top">92</td><td align="left" valign="top">90</td><td align="left" valign="top">91</td><td align="left" valign="top" colspan="2">90.8 (0.84)</td><td align="left" valign="top">.69</td></tr><tr><td align="left" valign="top">&#x2003;GPT-4o1-miniP</td><td align="left" valign="top">91</td><td align="left" valign="top">91</td><td align="left" valign="top">91</td><td align="left" valign="top">92</td><td align="left" valign="top">90</td><td align="left" valign="top" colspan="2">91 (0.71)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">&#x2003;GPT-4o1N</td><td align="left" valign="top">92</td><td align="left" valign="top">92</td><td align="left" valign="top">91</td><td align="left" valign="top">92</td><td align="left" valign="top">92</td><td align="left" valign="top" colspan="2">91.8 (0.45)</td><td align="left" valign="top">.55</td></tr><tr><td align="left" valign="top">&#x2003;GPT-4oP</td><td align="left" 
valign="top">91</td><td align="left" valign="top">92</td><td align="left" valign="top">92</td><td align="left" valign="top">92</td><td align="left" valign="top">91</td><td align="left" valign="top" colspan="2">91.6 (0.55)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="9">Final exams</td></tr><tr><td align="left" valign="top">&#x2003;GPT-3.5N</td><td align="left" valign="top">54</td><td align="left" valign="top">56</td><td align="left" valign="top">55</td><td align="left" valign="top">54</td><td align="left" valign="top">56</td><td align="left" valign="top" colspan="2">55(1)</td><td align="left" valign="top">&#x003C;.01</td></tr><tr><td align="left" valign="top">&#x2003;GPT-3.5P</td><td align="left" valign="top">61</td><td align="left" valign="top">60</td><td align="left" valign="top">60</td><td align="left" valign="top">60</td><td align="left" valign="top">60</td><td align="left" valign="top" colspan="2">60.2 (0.45)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">&#x2003;GPT-4N</td><td align="left" valign="top">85</td><td align="left" valign="top">84</td><td align="left" valign="top">84</td><td align="left" valign="top">85</td><td align="left" valign="top">83</td><td align="left" valign="top" colspan="2">84.2 (0.84)</td><td align="left" valign="top">&#x003C;.01</td></tr><tr><td align="left" valign="top">&#x2003;GPT-4P</td><td align="left" valign="top">89</td><td align="left" valign="top">87</td><td align="left" valign="top">87</td><td align="left" valign="top">88</td><td align="left" valign="top">88</td><td align="left" valign="top" colspan="2">87.8 (0.84)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">&#x2003;GPT-4oN</td><td align="left" valign="top">89</td><td align="left" valign="top">90</td><td align="left" valign="top">90</td><td align="left" valign="top">90</td><td align="left" valign="top">90</td><td align="left" valign="top" 
colspan="2">89.8 (0.45)</td><td align="left" valign="top">.94</td></tr><tr><td align="left" valign="top">&#x2003;GPT-4oP</td><td align="left" valign="top">90</td><td align="left" valign="top">91</td><td align="left" valign="top">90</td><td align="left" valign="top">91</td><td align="left" valign="top">90</td><td align="left" valign="top" colspan="2">90.4 (0.55)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">&#x2003;GPT-4o1-miniN</td><td align="left" valign="top">91</td><td align="left" valign="top">92</td><td align="left" valign="top">92</td><td align="left" valign="top">91</td><td align="left" valign="top">91</td><td align="left" valign="top" colspan="2">91.4 (0.55)</td><td align="left" valign="top">.58</td></tr><tr><td align="left" valign="top">&#x2003;GPT-4o1-miniP</td><td align="left" valign="top">92</td><td align="left" valign="top">91</td><td align="left" valign="top">92</td><td align="left" valign="top">92</td><td align="left" valign="top">91</td><td align="left" valign="top" colspan="2">91.6 (0.55)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">&#x2003;GPT-4o1N</td><td align="left" valign="top">91</td><td align="left" valign="top">92</td><td align="left" valign="top">91</td><td align="left" valign="top">91</td><td align="left" valign="top">92</td><td align="left" valign="top" colspan="2">91.4 (0.55)</td><td align="left" valign="top">.24</td></tr><tr><td align="left" valign="top">&#x2003;GPT-4oP</td><td align="left" valign="top">91</td><td align="left" valign="top">92</td><td align="left" valign="top">92</td><td align="left" valign="top">92</td><td align="left" valign="top">92</td><td align="left" valign="top" colspan="2">91.8 (0.45)</td><td align="left" valign="top">&#x2014;</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-6"><title>Stability Across Runs</title><p>Coefficient of 
variation (CV) across the 5 independent runs decreased with model version. GPT-3.5 midterm CV was 3.5%, whereas GPT-4o1 recorded a CV of 0.6%. Prompt engineering reduced CV by an average of 0.4 percentage points for GPT-3.5 and GPT-4.0, but had a negligible impact on optimized variants.</p><p>These findings demonstrate not only a clear progression in raw performance and stability from GPT-3.5 to GPT-4o1, but also that the top-tier optimized models can match or surpass human student performance (<xref ref-type="table" rid="table3">Table 3</xref>), indicating their potential as both assessment tools and educational companions.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study provides the first systematic evaluation of prompt engineering across multiple ChatGPT variants, highlighting the evolution of LLM capabilities in medical education settings. We observed that GPT-4 variants (GPT-4o, GPT-4o1-mini, and GPT-4o1) significantly outperformed earlier models, consistent with the findings by Kung et al [<xref ref-type="bibr" rid="ref7">7</xref>] that ChatGPT 4.0 surpassed ChatGPT 3.5 on the United States Medical Licensing Examination, achieving accuracy at or near the passing threshold. In our work, advanced models not only exhibited higher baseline scores but also demonstrated greater stability across repeated runs, underscoring architectural improvements in reasoning and context retention.</p><p>Prompt engineering yielded substantial performance gains for early-generation models&#x2014;GPT-3.5 and GPT-4&#x2014;mirroring reports that structured guidance can boost LLM accuracy [<xref ref-type="bibr" rid="ref4">4</xref>], but its use diminished for optimized variants. 
Safrai and Azaria [<xref ref-type="bibr" rid="ref8">8</xref>] found that GPT-4 maintained performance even when confronted with extraneous &#x201C;small talk&#x201D; inserted into medical prompts, whereas GPT-3.5&#x2019;s performance degraded under similar conditions. Our findings extend this observation, showing that GPT-4o and its successors exhibit minimal dependency on explicit prompt structures, suggesting that these models have internalized reasoning scaffolds natively.</p><p>Our analysis by question type aligns with the in-depth evaluation by Knoedler et al [<xref ref-type="bibr" rid="ref9">9</xref>], who reported variable ChatGPT performance across categories and a negative correlation with question difficulty (<italic>r</italic><sub>s</sub>=&#x2212;0.306; <italic>P</italic>&#x003C;.001) in the United States Medical Licensing Examination step 1 practice items. Similarly, we noted that CCA and IBI posed the greatest challenges for all models, although advanced variants narrowed the gap. These parallels reinforce the generalizability of LLM behavior across diverse educational assessment formats.</p><p>Our error analysis underscores the importance of evaluating LLMs not only by overall scores but also by question-type vulnerabilities. Early models&#x2019; difficulties with multistep reasoning and complex Chinese phrasing, especially in clinical scenarios and image-based tasks, point to inherent limitations in contextual understanding. The marked reduction of these errors in optimized variants demonstrates progress but also indicates areas where artificial intelligence (AI) may still mislead learners. Educators should therefore integrate error-focused feedback loops when deploying LLMs: by exposing students to AI-generated mistakes in controlled settings, learners can develop critical appraisal skills and better discern AI hallucinations. 
This approach transforms AI from a mere answer engine into a pedagogical tool that actively fosters analytical thinking and deep learning.</p></sec><sec id="s4-2"><title>Comparison With Medical Student Performance</title><p>The cohort of 143 fourth-year medical students achieved a mean midterm score of 89.4% (SD 7.13) and a mean final score of 80.2% (SD 8.73) (<xref ref-type="table" rid="table3">Table 3</xref>). GPT-3.5 underperformed relative to students (59.2% vs 89.4%; <italic>P</italic>&#x003C;.001 and 55% vs 80.2%; <italic>P</italic>&#x003C;.001), whereas advanced variants such as GPT-4o1 matched or exceeded student performance on both the midterm (94.02% vs 89.4%; <italic>P</italic>&#x003C;.001) and final exams (92.75% vs 80.2%; <italic>P</italic>&#x003C;.001). This indicates that top-tier LLMs can approach or surpass human proficiency in standardized medical assessments.</p></sec><sec id="s4-3"><title>AI as a Learning Companion Beyond Assessment</title><p>Advanced LLMs show promise as AI-enabled educational tools, capable of rapidly synthesizing complex medical knowledge to aid student understanding. Studies have demonstrated AI&#x2019;s use in generating personalized explanations and feedback that enhance learning efficiency [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. However, LLMs may still produce errors and &#x201C;hallucinations&#x201D; [<xref ref-type="bibr" rid="ref10">10</xref>], underscoring the importance of maintaining critical appraisal and scholarly rigor when integrating AI into medical education.</p></sec><sec id="s4-4"><title>Educational Value for Diagnosing Student Weaknesses</title><p>Although AI does not achieve 100% accuracy, its overall correctness surpassed that of most medical students in our cohort. 
Students spend significant time retrieving correct answers and understanding explanations; AI can serve as a learning companion by rapidly aggregating and summarizing complex medical knowledge, guiding step-by-step reasoning. Integrating our AI system into adaptive learning platforms could help students quickly identify weak areas, practice targeted question types, and maintain critical appraisal to avoid overreliance on AI outputs. This targeted approach not only enhances learning efficiency but also empowers students to become more self-directed learners, using AI as a diagnostic tool to identify knowledge gaps and focus their study efforts where they are most needed.</p></sec><sec id="s4-5"><title>Strategies to Reduce AI Hallucinations</title><p>To mitigate risks of LLM-generated misinformation, future implementations should consider several evidence-based strategies. Cross-model consensus approaches&#x2014;querying multiple LLMs (eg, GPT-4o1 and open-source alternatives) and adopting majority-vote answers&#x2014;can increase reliability and reduce single-model biases. Expert fine-tuning using annotated medical datasets would strengthen domain-specific accuracy, particularly for specialized clinical scenarios. Integration of real-time evidence retrieval through literature and guideline search APIs would ensure responses include verifiable reference citations, enhancing transparency and trustworthiness. In addition, implementing confidence scoring systems coupled with human-AI collaboration frameworks would route low-confidence responses to human experts for review, creating a safety net against potential hallucinations while maintaining efficiency.</p><p>Collectively, these results underscore a maturation of LLMs: as model architectures advance, the marginal benefit of prompt engineering declines, and the potential educational role shifts from prompt design to strategic integration and fine-tuning. 
For practitioners and educators, this suggests a shift from elaborate prompt design toward focusing on model selection and integration strategies&#x2014;such as multimodal input handling and curriculum-specific fine-tuning&#x2014;to maximize efficacy in high-stakes assessments. Future research should explore adaptive prompting frameworks that tailor AI guidance to learner needs and investigate real-world clinical scenario applications, while prioritizing the development of robust safeguards against AI hallucinations to ensure safe and effective integration into medical education curricula.</p></sec><sec id="s4-6"><title>Conclusions</title><p>This comprehensive evaluation across 5 ChatGPT variants demonstrates a progressive enhancement in performance and stability in medical examination tasks. Notably, optimized LLMs (GPT-4o, GPT-4o1-mini, and GPT-4o1) not only matched but significantly exceeded the mean scores of fourth-year medical students on both midterm and final exams, underscoring their capacity to approach&#x2014;or surpass&#x2014;human proficiency in standardized assessments.</p><p>While prompt engineering substantially improved outcomes for early-generation models (GPT-3.5 and GPT-4.0), optimized variants achieved near-ceiling accuracy with negligible gains from structured prompts, indicating that these models inherently internalize contextual guidance. These findings suggest a strategic pivot for educators and assessment designers: from intricate prompt crafting toward thoughtful model selection, multimodal integration, and domain-specific fine-tuning.</p><p>Furthermore, the ability of advanced LLMs to rapidly synthesize and organize complex medical knowledge positions them as valuable AI-enabled learning companions. 
Educators should leverage AI&#x2019;s strengths in personalized explanation and feedback while maintaining rigorous critical appraisal to identify potential errors or &#x201C;hallucinations.&#x201D;</p><p>Future research should investigate adaptive prompting frameworks tailored to individual learner needs and assess the educational impact of AI-augmented tools in real-world clinical training environments.</p></sec></sec></body><back><ack><p>This work was supported by grant CSH2019A019.</p></ack><fn-group><fn fn-type="con"><p>M-YH conceived and designed the study, curated the data, conducted the statistical analysis (IBM SPSS Statistics; version 29), and drafted the manuscript. T-LW contributed to data curation, examination, administration, and manuscript review. P-HS and M-CC supervised the project, contributed to study design and interpretation, and critically revised the manuscript. M-CC is the corresponding author and P-HS is the co-corresponding author.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">CCA</term><def><p>clinical case analysis</p></def></def-item><def-item><term id="abb2">CV</term><def><p>coefficient of variation</p></def></def-item><def-item><term id="abb3">IBI</term><def><p>image-based interpretation</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">MCQ</term><def><p>multiple choice question</p></def></def-item><def-item><term id="abb6">SAQ</term><def><p>short answer question</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>T</given-names> </name><name name-style="western"><surname>Mann</surname><given-names>B</given-names> </name><name 
name-style="western"><surname>Ryder</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Language models are few-shot learners</article-title><year>2020</year><access-date>2025-09-25</access-date><conf-name>Neural Information Processing Systems (NeurIPS) 2020</conf-name><conf-date>Dec 6-12, 2020</conf-date><conf-loc>Vancouver, BC, Canada</conf-loc><fpage>1877</fpage><lpage>1901</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf">https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="web"><article-title>GPT technical report</article-title><source>OpenAI</source><year>2022</year><access-date>2025-09-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://cdn.openai.com/papers/gpt-4.pdf">https://cdn.openai.com/papers/gpt-4.pdf</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="web"><article-title>GPT-4 technical report</article-title><source>OpenAI</source><year>2023</year><month>03</month><day>14</day><access-date>2025-09-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/research/gpt-4">https://openai.com/research/gpt-4</ext-link></comment></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>P</given-names> </name><name name-style="western"><surname>Yuan</surname><given-names>W</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Hayashi</surname><given-names>H</given-names> </name><name 
name-style="western"><surname>Neubig</surname><given-names>G</given-names> </name></person-group><article-title>Pre-train, prompt, and predict: a systematic survey of prompting methods in natural language processing</article-title><source>ACM Comput Surv</source><year>2023</year><month>09</month><day>30</day><volume>55</volume><issue>9</issue><fpage>1</fpage><lpage>35</lpage><pub-id pub-id-type="doi">10.1145/3560815</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="web"><article-title>GPT-4o technical report: multimodal capabilities</article-title><source>OpenAI</source><year>2024</year><access-date>2025-09-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/index/gpt-4o-system-card/">https://openai.com/index/gpt-4o-system-card/</ext-link></comment></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="web"><article-title>GPT-4o1mini release notes</article-title><source>OpenAI</source><year>2025</year><access-date>2025-09-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://help.openai.com/en/articles/6825453-chatgpt-release-notes">https://help.openai.com/en/articles/6825453-chatgpt-release-notes</ext-link></comment></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kung</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Cheatham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Medenilla</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title><source>PLOS Digit Health</source><year>2023</year><month>02</month><volume>2</volume><issue>2</issue><fpage>e0000198</fpage><pub-id 
pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id><pub-id pub-id-type="medline">36812645</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Safrai</surname><given-names>M</given-names> </name><name name-style="western"><surname>Azaria</surname><given-names>A</given-names> </name></person-group><article-title>Performance of ChatGPT-35 and GPT-4 on the United States Medical Licensing Examination with and without distractions</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 12, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2309.08625</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Knoedler</surname><given-names>L</given-names> </name><name name-style="western"><surname>Knoedler</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hoch</surname><given-names>CC</given-names> </name><etal/></person-group><article-title>In-depth analysis of ChatGPT&#x2019;s performance based on specific signaling words and phrases in the question stem of 2377 USMLE step 1 style questions</article-title><source>Sci Rep</source><year>2024</year><month>06</month><day>12</day><volume>14</volume><issue>1</issue><fpage>13553</fpage><pub-id pub-id-type="doi">10.1038/s41598-024-63997-7</pub-id><pub-id pub-id-type="medline">38866891</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Beutel</surname><given-names>G</given-names> </name><name name-style="western"><surname>Geerits</surname><given-names>E</given-names> </name><name name-style="western"><surname>Kielstein</surname><given-names>JT</given-names> </name></person-group><article-title>Artificial 
hallucination: GPT on LSD?</article-title><source>Crit Care</source><year>2023</year><month>04</month><day>18</day><volume>27</volume><issue>1</issue><fpage>148</fpage><pub-id pub-id-type="doi">10.1186/s13054-023-04425-6</pub-id><pub-id pub-id-type="medline">37072798</pub-id></nlm-citation></ref></ref-list></back></article>