<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id><journal-id journal-id-type="publisher-id">mededu</journal-id><journal-id journal-id-type="index">20</journal-id><journal-title>JMIR Medical Education</journal-title><abbrev-journal-title>JMIR Med Educ</abbrev-journal-title><issn pub-type="epub">2369-3762</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v11i1e81807</article-id><article-id pub-id-type="doi">10.2196/81807</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Large Language Models for the National Radiological Technologist Licensure Examination in Japan: Cross-Sectional Comparative Benchmarking and Evaluation of Model-Generated Items Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Ito</surname><given-names>Toshimune</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ishibashi</surname><given-names>Toru</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hayashi</surname><given-names>Tatsuya</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" 
rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kojima</surname><given-names>Shinya</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Sogabe</surname><given-names>Kazumi</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff5">5</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Radiological Technology, Faculty of Medical Technology, Teikyo University</institution><addr-line>2-11-1 Kaga, Itabashi-ku</addr-line><addr-line>Tokyo</addr-line><country>Japan</country></aff><aff id="aff2"><institution>Department of Medical Radiology, Graduate School of Medical Technology, Teikyo University</institution><addr-line>Tokyo</addr-line><country>Japan</country></aff><aff id="aff3"><institution>Department of Medical Radiological Technology, Faculty of Health Sciences, Kyorin University</institution><addr-line>Tokyo</addr-line><country>Japan</country></aff><aff id="aff4"><institution>Department of Radiology, Tokyo Women&#x2019;s Medical University Adachi Medical Center</institution><addr-line>Tokyo</addr-line><country>Japan</country></aff><aff id="aff5"><institution>Department of Radiological Sciences, School of Health Sciences, Ibaraki Prefectural University of Health Sciences</institution><addr-line>Ibaraki</addr-line><country>Japan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Stone</surname><given-names>Alicia</given-names></name></contrib><contrib contrib-type="editor"><name name-style="western"><surname>Leung</surname><given-names>Tiffany</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Sarangi</surname><given-names>Pradosh 
Kumar</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Court-Kowalski</surname><given-names>Stefan</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Fukui</surname><given-names>Yusuke</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Toshimune Ito, PhD, Department of Radiological Technology, Faculty of Medical Technology, Teikyo University, 2-11-1 Kaga, Itabashi-ku, Tokyo, 173-8605, Japan, +81-3-3964-7053; <email>toito@med.teikyo-u.ac.jp</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>13</day><month>11</month><year>2025</year></pub-date><volume>11</volume><elocation-id>e81807</elocation-id><history><date date-type="received"><day>08</day><month>08</month><year>2025</year></date><date date-type="accepted"><day>28</day><month>10</month><year>2025</year></date></history><copyright-statement>&#x00A9; Toshimune Ito, Toru Ishibashi, Tatsuya Hayashi, Shinya Kojima, Kazumi Sogabe. Originally published in JMIR Medical Education (<ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org">https://mededu.jmir.org</ext-link>), 13.11.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org/">https://mededu.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mededu.jmir.org/2025/1/e81807"/><abstract><sec><title>Background</title><p>Mock examinations are widely used in health professional education to assess learning and prepare candidates for national licensure. However, instructor-written multiple-choice items can vary in difficulty, coverage, and clarity. Recently, large language models (LLMs) have achieved high accuracy in medical examinations, highlighting their potential for assisting item-bank development; however, their educational quality remains insufficiently characterized.</p></sec><sec><title>Objective</title><p>This study aimed to (1) identify the most accurate LLM for the Japanese National Examination for Radiological Technologists and (2) use the top model to generate blueprint-aligned multiple-choice questions and evaluate their educational quality.</p></sec><sec sec-type="methods"><title>Methods</title><p>Four LLMs&#x2014;OpenAI o3, o4-mini, o4-mini-high (OpenAI), and Gemini 2.5 Flash (Google)&#x2014;were evaluated on all 200 items of the 77th Japanese National Examination for Radiological Technologists in 2025. Accuracy was analyzed for overall items and for 173 nonimage items. The best-performing model (o3) then generated 192 original items across 14 subjects by matching the official blueprint (image-based items were excluded). Subject-matter experts (&#x2265;5 y as coordinators and routine mock examination authors) independently rated each generated item on five criteria using a 5-point scale (1=unacceptable, 5=adoptable): item difficulty, factual accuracy, accuracy of content coverage, appropriateness of wording, and instructional usefulness. 
Cochran Q with Bonferroni-adjusted McNemar tests compared model accuracies, and one-sided Wilcoxon signed-rank tests assessed whether the median ratings exceeded 4.</p></sec><sec sec-type="results"><title>Results</title><p>OpenAI o3 achieved the highest accuracy overall (90.0%; 95% CI 85.1%&#x2010;93.4%) and on nonimage items (92.5%; 95% CI 87.6%&#x2010;95.6%), significantly outperforming o4-mini on the full set (<italic>P</italic>=.02). Across models, accuracy differences on the nonimage subset were not significant (Cochran Q, <italic>P</italic>=.10). Using o3, the 192 generated items received high expert ratings for item difficulty (mean, 4.29; 95% CI 4.11&#x2010;4.46), factual accuracy (4.18; 95% CI 3.98&#x2010;4.38), and content coverage (4.73; 95% CI 4.60&#x2010;4.86). Ratings were comparatively lower for appropriateness of wording (3.92; 95% CI 3.73&#x2010;4.11) and instructional usefulness (3.60; 95% CI 3.41&#x2010;3.80). For these two criteria, the tests did not support a median rating &#x003E;4 (one-sided Wilcoxon, <italic>P</italic>=.45 and <italic>P</italic>&#x2265;.99, respectively). Representative low-rated examples (ratings 1&#x2010;2) and the rationale for those scores&#x2014;such as ambiguous phrasing or generic explanations without linkage to stem cues&#x2014;are provided in the supplementary materials.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>OpenAI o3 can generate radiological licensure items that align with national standards in terms of difficulty, factual correctness, and blueprint coverage. However, wording clarity and the pedagogical specificity of explanations were weaker and did not meet an adoptable threshold without further editorial refinement. These findings support a practical workflow in which LLMs draft syllabus-aligned items at scale, while faculty perform targeted edits to ensure clarity and formative feedback. 
Future studies should evaluate image-inclusive generation, use application programming interface (API)-pinned model snapshots to increase reproducibility, and develop guidance to improve explanation quality for learner remediation.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>licensing exam</kwd><kwd>radiology, educational evaluation</kwd><kwd>medical education</kwd><kwd>item generation</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Mock examinations are a key pedagogical tool in training programs for health professionals. They are designed to consolidate the knowledge required for national licensure and to gauge students&#x2019; achievement [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. In particular, multiple-choice formats are valuable because they enable the systematic, efficient appraisal of the broad foundational knowledge expected in clinical practice, making them integral to the quality of the curriculum. However, most items are written by individual instructors who draw on past examinations or personal clinical experience, and their difficulty and content validity are rarely subjected to systematic review [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. This practice can result in biases in content coverage, inconsistencies in wording, and variable educational usefulness, which undermine the stability of learning outcome assessments.</p><p>Several studies have reported the high accuracy of large language models (LLMs) in health professional licensure examinations, owing to their rapid advancements [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref9">9</xref>]. 
In text-based multiple-choice questions, models have begun to match or surpass human test-takers while generating rationales and keyword-level explanations that can serve as formative feedback [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]. These suggest the potential utility of LLM-assisted item writing during the construction of high-quality question banks. However, most research has centered on the accuracy of LLMs in answering existing licensure items [<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref16">16</xref>], while empirical evidence regarding the educational quality of questions authored by LLMs remains scarce [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. A comprehensive appraisal that includes (1) appropriate difficulty, (2) completeness and accuracy of content coverage, (3) clarity of option wording, and (4) usefulness of accompanying explanations is necessary to address this knowledge gap and clarify the practical value of artificial intelligence (AI)-supported mock examinations, as well as its limitations.</p><p>This study evaluated the quality of AI-generated multiple-choice questions based on the Japanese National Examination for Radiological Technologists. Several LLMs were used to answer the exam, then the highest-performing model was used to generate a set of mock items. These AI-generated questions were then evaluated across several aspects (ie, item-level difficulty, item-level factual accuracy, accuracy of content coverage, appropriateness of wording, and instructional usefulness) through blinded expert review and statistical analysis. 
By doing so, this study aims to provide empirical data on the educational soundness of AI-generated items, as well as highlight any emerging challenges.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Models and Study Period</title><p>Four LLMs released in February 2025 were evaluated: OpenAI o3, OpenAI o4-mini, OpenAI o4-mini-high (all OpenAI), and Gemini 2.5 Flash (Google). The evaluations were conducted from March 14 to May 8, 2025, using the publicly accessible browser interfaces, with the desired engine explicitly selected in each platform&#x2019;s menu. The browser access was chosen to mirror typical educational use and to simplify image I/O (upload, preview, and per-item attachment). The item-generation study was conducted from May 15 to June 28, 2025, using OpenAI o3, the model with the highest answer accuracy. To ensure consistency, we used an identical Japanese prompt template across models. To avoid carryover effects, we started a new session for each 50-item batch with the OpenAI models and used per-item input with Gemini; image files (PNG) were attached when required by an item. As browsing and memory features were disabled, outputs relied solely on pretrained parameters and the provided materials.</p></sec><sec id="s2-2"><title>Answer Accuracy</title><p>Answer accuracy was assessed based on all 200 items of the 77th National Examination for Radiological Technologists, administered on February 20, 2025. All items were multiple-choice, and question stems containing images were presented unchanged. Each model was given the question stem and options in Japanese, then instructed to select the correct answers in single-best or multiple-select format. <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> lists the subjects and the number of items per subject. Due to the differences in each model, the input procedures were adapted accordingly. 
For OpenAI models, stems and options were pasted from four text files (items 1&#x2010;50, 51&#x2010;100, 101&#x2010;150, and 151&#x2010;200) into separate sessions. PNG files were attached for each image item, with the filenames labeled to match the corresponding item numbers. However, since Gemini permits only one file upload, the stems and options were pasted directly into the prompt while attaching an image file as needed. All inputs were entered manually. A concrete workflow is shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Representative interaction with a large language model (LLM). This diagram illustrates the workflow used to evaluate the answer accuracy of large language models. The LLMs were given prompts to answer each question (including text and images when applicable) in Japanese, with specific instructions for answer selection and formatting. The output included the selected answer, a confidence score, and a brief explanation. All actual prompts and inputs were entered in Japanese, but this example is shown in English for illustration purposes. CT: computed tomography.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v11i1e81807_fig01.png"/></fig><p>The outputs of the model were compared to the official answer key issued by the Ministry of Health, Labor and Welfare. The correct and incorrect responses were counted overall for 200 items and separately for the 173 items that did not require image interpretation (ie, nonimage items). Statistical significance was tested across models.</p></sec><sec id="s2-3"><title>Item Generation</title><sec id="s2-3-1"><title>Generation Procedure</title><p>The mock items were generated using OpenAI o3, since it had the highest accuracy among all four models. Image-based stems were excluded since all models performed poorly on these. 
Using the same examination as a blueprint, OpenAI o3 was used to produce 192 questions across 14 subjects (<xref ref-type="table" rid="table1">Table 1</xref>), matching the same distribution of items. The model was supplied with text files containing the past 5 years of examination items and the official test specifications, ensuring its alignment with test objectives. Browsing remained disabled. Since Healthcare Safety Management is a new domain introduced in 2025, thereby lacking any historical reference items, it was excluded from the mock item generation. Items were generated separately for each subject in Japanese, and each output included the stem, five options, the key, and a brief rationale.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Distribution of artificial intelligence (AI)-Generated Mock Items.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Subject</td><td align="left" valign="bottom">Blueprint target (n=200)</td><td align="left" valign="bottom">Generated (n=192)</td></tr></thead><tbody><tr><td align="left" valign="top">Diagnostic Imaging Techniques</td><td align="left" valign="top">20</td><td align="left" valign="top">20</td></tr><tr><td align="left" valign="top">Nuclear Medicine Technology</td><td align="left" valign="top">20</td><td align="left" valign="top">20</td></tr><tr><td align="left" valign="top">Radiation Therapy Technology</td><td align="left" valign="top">20</td><td align="left" valign="top">20</td></tr><tr><td align="left" valign="top">Medical Imaging Informatics</td><td align="left" valign="top">10</td><td align="left" valign="top">10</td></tr><tr><td align="left" valign="top">Basic Medical Sciences</td><td align="left" valign="top">30</td><td align="left" valign="top">30</td></tr><tr><td align="left" valign="top">Radiation Science &#x0026; Engineering</td><td align="left" valign="top">36</td><td align="left" valign="top">36</td></tr><tr><td 
align="left" valign="top">X-ray Imaging Equipment</td><td align="left" valign="top">20</td><td align="left" valign="top">20</td></tr><tr><td align="left" valign="top">X-ray Imaging Techniques</td><td align="left" valign="top">20</td><td align="left" valign="top">20</td></tr><tr><td align="left" valign="top">Image Engineering</td><td align="left" valign="top">6</td><td align="left" valign="top">6</td></tr><tr><td align="left" valign="top">Radiation Safety Management</td><td align="left" valign="top">10</td><td align="left" valign="top">10</td></tr><tr><td align="left" valign="top">Healthcare Safety Management<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top">8</td><td align="left" valign="top">0</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Since Healthcare Safety Management was only recently introduced as a new subject in the 2025 blueprint, it was excluded from the mock item generation.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s2-4"><title>Evaluation of Generated Items</title><p>All 192 generated questions were reviewed by subject-matter experts; these were faculty members with at least 5 years of experience as subject coordinators in radiological technology programs who routinely author mock examinations. Items were assigned to reviewers by discipline, and each question was evaluated by one expert. The reviewers rated each item on five criteria (item difficulty, factual accuracy, accuracy of content coverage, appropriateness of wording, and instructional usefulness) using a five-point scale: (1) unacceptable; (2) major revision needed; (3) revisable; (4) minor revision; and (5) adoptable.</p><p>For each criterion, we calculated the median score and tested the statistical significance of the proportion of high ratings (&#x2265;4). 
The evaluation framework, which is based on faculty experience with national examination item writing, is presented in <xref ref-type="table" rid="table2">Table 2</xref>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Evaluation of generated items.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Evaluation criterion</td><td align="left" valign="bottom">Rating scale<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Item difficulty</td><td align="left" valign="top">1&#x2010;5</td></tr><tr><td align="left" valign="top">Factual accuracy</td><td align="left" valign="top">1&#x2010;5</td></tr><tr><td align="left" valign="top">Accuracy of content coverage</td><td align="left" valign="top">1&#x2010;5</td></tr><tr><td align="left" valign="top">Appropriateness of wording</td><td align="left" valign="top">1&#x2010;5</td></tr><tr><td align="left" valign="top">Instructional usefulness</td><td align="left" valign="top">1&#x2010;5</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Rating scale definition: 1=Unacceptable; 2=Major revision needed; 3=Revisable; 4=Minor revision; 5=Adoptable.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-5"><title>Statistical Analysis</title><p>Statistical analysis was performed using JMP (version 18; JMP Statistical Discovery LLC). Cochran Q test was initially used to examine overall differences in answer accuracy; when significant, pairwise differences were probed with McNemar test using Bonferroni correction. The item generation study used a one-sided Wilcoxon signed-rank test (H&#x2080;: median &#x2264;4). Statistical significance was set at <italic>P</italic>&#x003C;.05 for all analyses.</p></sec><sec id="s2-6"><title>Ethical Considerations</title><p>This study did not involve human participants or patient-identifiable data. 
The Ethics Committee of Teikyo University reviewed the project and determined that formal ethical approval was not required because the work evaluated the quality of test items and did not constitute human medical research. Accordingly, informed consent was not applicable.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Answer Accuracy</title><p>The accuracy of the LLMs on the full 200-item set and the nonimage 173-item set is shown in <xref ref-type="table" rid="table3">Table 3</xref>. All models consistently scored lower in the full set versus the nonimage set, with OpenAI o3 achieving the best results at 90% and 92.5%, respectively. A significant difference was seen between OpenAI o3 and OpenAI o4-mini on the full set, whereas no significant differences were seen among models on the nonimage set.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Model accuracies and statistical comparisons on 200 benchmark questions and 173 nonimage questions.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Variables</td><td align="left" valign="bottom">200 questions<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="bottom">173 nonimage questions<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top" colspan="3">Model accuracy</td></tr><tr><td align="left" valign="top">&#x2003;OpenAI-o4-mini-high, %</td><td align="left" valign="top">86.0 (80.5, 90.1)</td><td align="left" valign="top">88.4 (82.8, 92.4)</td></tr><tr><td align="left" valign="top">&#x2003;OpenAI-o4-mini, %</td><td align="left" valign="top">82.5 (76.6, 87.1)</td><td align="left" valign="top">86.7 (80.8, 91.0)</td></tr><tr><td align="left" valign="top">&#x2003;OpenAI-o3, %</td><td align="left" valign="top">90.0 (85.1, 93.4)</td><td align="left" valign="top">92.5 (87.6, 95.6)</td></tr><tr><td 
align="left" valign="top">&#x2003;Gemini 2.5 Flash, %</td><td align="left" valign="top">83.0 (77.2, 87.6)</td><td align="left" valign="top">89.6 (84.1, 93.3)</td></tr><tr><td align="left" valign="top">&#x2003;Cochran Q test (<italic>P</italic> value)</td><td align="left" valign="top">.01</td><td align="left" valign="top">.10</td></tr><tr><td align="left" valign="top" colspan="3">Pairwise McNemar test (Bonferroni-adjusted <italic>P</italic> value)</td></tr><tr><td align="left" valign="top">&#x2003;OpenAI-o4-mini-high versus OpenAI-o4-mini</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">N/A<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td></tr><tr><td align="left" valign="top">&#x2003;OpenAI-o4-mini-high versus OpenAI-o3</td><td align="left" valign="top">.44</td><td align="left" valign="top">N/A</td></tr><tr><td align="left" valign="top">&#x2003;OpenAI-o4-mini-high versus Gemini 2.5 Flash</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">N/A</td></tr><tr><td align="left" valign="top">&#x2003;OpenAI-o4-mini versus OpenAI-o3</td><td align="left" valign="top">.02</td><td align="left" valign="top">N/A</td></tr><tr><td align="left" valign="top">&#x2003;OpenAI-o4-mini versus Gemini 2.5 Flash</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">N/A</td></tr><tr><td align="left" valign="top">&#x2003;OpenAI-o3 versus Gemini 2.5 Flash</td><td align="left" valign="top">.06</td><td align="left" valign="top">N/A</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup> Accuracy shown with 95% CIs in parentheses (Wilson score, two-sided, without continuity correction).</p></fn><fn id="table3fn2"><p><sup>b</sup>Not applicable. 
</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Item Generation</title><p><xref ref-type="table" rid="table4">Table 4</xref> presents the scores and statistics for all 192 questions, while <xref ref-type="fig" rid="figure2">Figure 2</xref> illustrates the prompt template and sample outputs. Among item difficulty, factual accuracy, and accuracy of content coverage, the medians and the proportions of scores &#x2265;4 did not differ significantly, although accuracy of content coverage had the highest score. Meanwhile, instructional usefulness had a significantly lower score than appropriateness of wording. The evaluation criteria and evaluation examples of items that scored 1&#x2010;2 for the lower-scoring criteria&#x2014;appropriateness of wording and instructional usefulness&#x2014;are detailed in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Reviewer ratings by evaluation criterion for the AI-generated items (n=192).</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Evaluation criterion<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="bottom">Mean score (95% CI)</td><td align="left" valign="bottom"><italic>P</italic> value<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Item difficulty</td><td align="left" valign="top">4.29 (4.11, 4.46)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Factual accuracy</td><td align="left" valign="top">4.18 (3.98, 4.38)</td><td align="left" valign="top">.001</td></tr><tr><td align="left" valign="top">Accuracy of content coverage</td><td align="left" valign="top">4.73 (4.60, 4.86)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Appropriateness of wording</td><td 
align="left" valign="top">3.92 (3.73, 4.11)</td><td align="left" valign="top">.44</td></tr><tr><td align="left" valign="top">Instructional usefulness</td><td align="left" valign="top">3.60 (3.41, 3.80)</td><td align="left" valign="top">&#x2265;.99</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup> &#x201C;Evaluation criterion&#x201D; refers to the five evaluation criteria defined in <xref ref-type="table" rid="table2">Table 2</xref>.</p></fn><fn id="table4fn2"><p><sup>b</sup>One-sided Wilcoxon signed-rank test against the null hypothesis such that the median score is &#x2264;4.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Prompt summary and representative example of item generation. (A) Summary of the prompts used to instruct the language model to generate original mock questions aligned with the National Examination for Radiological Technologists. The summary outlines the role of the model, input references, specifications of generation, item-creation rules, and output format. (B) The actual prompt and representative response generated by the model. The prompt included specific formatting and content-generation instructions written in Japanese. The response shows the generated item, correct answers, and explanation in Japanese.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v11i1e81807_fig02.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study compared four LLMs in terms of answer accuracy on the Japanese National Examination for Radiological Technologists. The top performer, OpenAI o3, was used to generate the mock test, which was then evaluated by experts in terms of educational quality. 
As shown in <xref ref-type="table" rid="table3">Table 3</xref>, on the full set of items, only the comparison between OpenAI o3 and OpenAI o4-mini reached statistical significance; when image-based items were excluded, no model differences were observed.</p><p>To contextualize the observed accuracy differences, we briefly summarize the multimodal architectures and vision&#x2013;language pipelines of the evaluated models as they pertain to radiologic image questions. Built on a GPT-4 lineage, OpenAI o3 integrates a high-resolution visual encoder with unified attention over linguistic and visual tokens [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>], likely enhancing sensitivity to low-contrast findings and subtle anatomical cues typical of radiography and CT. In contrast, OpenAI o4-mini is a lightweight variant with reduced-resolution patch embeddings [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>], which can yield coarser visual representations and miss subtle image cues. OpenAI o4-mini-high supplements the mini architecture with targeted medical-image fine-tuning and partial recovery of high-resolution inputs [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>], consistent with improved mapping of relevant visual patterns. Lastly, Gemini 2.5 Flash uses a two-tower design in which an external vision encoder converts images to tags prior to language processing [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]; such pipelines may incur information loss for domain-specific anatomical details. 
In line with these architectural differences, performance gaps emerged on image-based questions but not on text-only items.</p><p>The pronounced performance spread on image-based questions could be mainly attributed to the aggressive parameter reduction in OpenAI o4-mini and the information loss inherent in the image-to-tag pipeline in Gemini, both of which weaken visual feature representation. Thus, current systems may not fully capture clinically grounded context and the knowledge required for radiologic image interpretation. This finding is consistent with the results of previous studies reporting similar limitations in specialty radiology examinations [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. In contrast, OpenAI o3 and o4-mini-high have higher-resolution encoders and benefit from medical-specific fine-tuning. However, due to the limited sample sizes and proprietary nature of the detailed model architectures, these explanations remain partly hypothetical. Nevertheless, these findings highlight the importance of the visual module scale and the presence of medical-domain training when selecting an LLM for the development of AI-generated questions in this field.</p><p>Building on these findings, the 192 items generated by the top model were reviewed across five educational criteria. Item difficulty, factual accuracy, and content coverage were rated favorably, indicating alignment with national expectations and the official blueprint [<xref ref-type="bibr" rid="ref26">26</xref>]. By contrast, appropriateness of wording and instructional usefulness were comparatively weaker, with reviewers noting ambiguous phrasing and explanations that did not consistently link stem cues to the correct answer or to distractor misconceptions. 
These strengths and weaknesses are consistent with observations from related medical-education settings [<xref ref-type="bibr" rid="ref28">28</xref>-<xref ref-type="bibr" rid="ref30">30</xref>] and underscore the need for editorial refinement prior to instructional deployment.</p><p>This study has several limitations. First, the image-based items were excluded from expert review, thus precluding the assessment of visual tasks. Second, each question was evaluated by a single expert, and thus inter-rater reliability could not be assessed. Third, reproducibility is limited by the use of publicly accessible browser interfaces. All evaluations were conducted through browser UIs with visible labels: OpenAI o3, OpenAI o4-mini, OpenAI o4-mini-high, and Gemini 2.5 Flash. Although this choice mirrors typical educational use and simplifies image I/O, it limits control over versioning and decoding parameters. Prompt delivery also varied across platforms due to UI constraints: OpenAI models received items in 50-question batches per session, whereas Gemini required per-item input, with a single image upload when applicable. Such differences in prompt granularity, context priming, and file-attachment workflows may have influenced outputs and should be considered when interpreting the comparable performance of Gemini 2.5 Flash and OpenAI o3. To mitigate these effects, we used an identical Japanese prompt template, disabled memory features, initiated new sessions for each batch, preserved the original exam order, and performed a single pass per item without retries. Input handling is detailed in the Methods section. 
These input structures reflected platform UI constraints (OpenAI allowed 50-question batches per session, whereas Gemini required per-item prompts and a single image attachment when applicable); although memory features were disabled and each batch began in a new session, processing the OpenAI items in batches could still introduce minor within-session priming; therefore, residual order effects cannot be fully excluded. Application-level temperature settings were not user-configurable. Moreover, because decoding remained stochastic and we performed a single pass per item without retries, run-to-run response variability cannot be fully excluded even with identical prompts. Given that browser-based services can update without notice, outputs may drift over time even when identical prompts and labels are used [<xref ref-type="bibr" rid="ref31">31</xref>-<xref ref-type="bibr" rid="ref33">33</xref>]. Thus, to strengthen version control and reproducibility, future studies should standardize prompt injection through application programming interface (API) endpoints with pinned model snapshots, identical per-item wrappers, and fully logged metadata (prompt templates, model identifiers, timestamps, and decoding parameters). In the future, visual encoders are expected to operate at a higher resolution and undergo additional tuning for medical domains. This could enable LLMs to automatically generate image-based items across modalities (eg, computed tomography, magnetic resonance imaging, and ultrasound), thus bringing mock exams closer to clinical reality. Feedback systems could also be improved further: by delivering adaptive feedback that varies in depth according to each learner&#x2019;s proficiency, LLMs could provide students with on-demand, targeted remediation material. LLMs could also be used to map items to the national blueprint in real time, enabling the detection and correction of domain imbalances while reducing faculty workload. 
Lastly, aligning these models with overseas licensure frameworks could expand their use to ultimately support a multilingual, multi-profession, international mock-exam bank.</p></sec><sec id="s4-2"><title>Conclusions</title><p>This study demonstrated that an LLM (OpenAI o3) can attain high accuracy on the national radiological technologist licensure examination, as well as generate new multiple-choice items with appropriate difficulty, factual correctness, and syllabus coverage, as evaluated by experts. Although the AI-generated questions fell short in terms of wording clarity and pedagogical feedback, these shortcomings can be mitigated through targeted editorial review. Practically speaking, LLMs can be used to draft content that is eventually refined by the faculty. This workflow could enable the more efficient development of mock examinations and reinforce curriculum alignment without imposing additional burden on instructors. However, performance gaps on image-based items, the absence of inter-rater reliability data, and the inherent volatility of cloud-hosted models underscore the need for cautious implementation and transparent reporting of model metadata. Nevertheless, future advancements in high-resolution visual encoders and medical-specific tuning can close this multimodal gap, while adaptive feedback functions and automated blueprint mapping can further extend the educational value of AI-generated assessments. After overcoming these barriers in terms of technical improvements and reproducibility safeguards, LLMs can be a strong asset in radiological technology education, which can even extend to the licensure preparation of other allied health professionals worldwide.</p></sec></sec></body><back><ack><p>The authors thank Hiroki Ohtani, Hiroki Saito, Tatsuru Ota, Kiyoshi Hishiki, and Masao Fujihara of Teikyo University for their careful evaluation of the problem statements and for the constructive feedback that strengthened this study. 
Disclosure of generative AI use (language editing only). We used OpenAI o3 solely to assist with language editing (readability, clarity, and minor stylistic consistency). No AI tools were used to generate scientific content, analyze or interpret data, or determine conclusions. All statements and references were verified by the authors, who take full responsibility for the final manuscript. Editing with OpenAI o3 was performed interactively; the manuscript wording was subsequently finalized by Enago Co., Ltd.</p></ack><notes><sec><title>Funding</title><p>This research received no specific grant from any funding agency in the public, commercial, or not-for-profit sectors. The article processing charge for open access publication was supported by Teikyo University&#x2019;s Open Access Publication Support Program. The funder had no role in the study design, data collection and analysis, decision to publish, or preparation of the manuscript.</p></sec><sec><title>Data Availability</title><p>Data sharing is not applicable to this article as no datasets were generated or analyzed during this study.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: T Ito, KS</p><p>Data Curation: T Ishibashi</p><p>Software: SK</p><p>Formal Analysis: TH</p><p>Writing &#x2013; Original Draft: T Ito</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb3">UI</term><def><p>user interface</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Al-Sheikh</surname><given-names>MH</given-names> </name><name 
name-style="western"><surname>Albaker</surname><given-names>W</given-names> </name><name name-style="western"><surname>Ayub</surname><given-names>MZ</given-names> </name></person-group><article-title>Do mock medical licensure exams improve performance of graduates? Experience from a Saudi Medical College</article-title><source>Saudi J Med Sci</source><year>2022</year><volume>10</volume><issue>2</issue><fpage>157</fpage><lpage>161</lpage><pub-id pub-id-type="doi">10.4103/sjmms.sjmms_173_21</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Scott</surname><given-names>NP</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>TW</given-names> </name><name name-style="western"><surname>Schmidt</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Shanks</surname><given-names>AL</given-names> </name></person-group><article-title>Impact of an online question bank on Resident In-Training exam performance</article-title><source>J Med Educ Curric Dev</source><year>2023</year><volume>10</volume><fpage>23821205231206221</fpage><pub-id pub-id-type="doi">10.1177/23821205231206221</pub-id><pub-id pub-id-type="medline">37822782</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Siab</surname><given-names>F</given-names> </name><name name-style="western"><surname>Morrissey</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ball</surname><given-names>P</given-names> </name></person-group><article-title>Pharmacy students&#x2019; opinions of using mock questions to prepare for summative examinations</article-title><source>Int J Curr Pharm 
Sci</source><year>2020</year><month>07</month><volume>12</volume><issue>4</issue><fpage>58</fpage><lpage>65</lpage><pub-id pub-id-type="doi">10.22159/ijcpr.2020v12i4.39079</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alawgali</surname><given-names>SM</given-names> </name></person-group><article-title>An evaluation of a final year multiple choice questions examination at Faculty of medicine-university of Benghazi</article-title><source>Open Access Maced J Med Sci</source><year>2024</year><volume>12</volume><issue>80</issue><fpage>1</fpage><lpage>11</lpage><pub-id pub-id-type="doi">10.37376/jsh.vi80.6626</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Karthikeyan</surname><given-names>S</given-names> </name><name name-style="western"><surname>O&#x2019;Connor</surname><given-names>E</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>W</given-names> </name></person-group><article-title>Barriers and facilitators to writing quality items for medical school assessments &#x2013; a scoping review</article-title><source>BMC Med Educ</source><year>2019</year><month>12</month><volume>19</volume><issue>1</issue><fpage>123</fpage><pub-id pub-id-type="doi">10.1186/s12909-019-1544-8</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kung</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Cheatham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Medenilla</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education 
using large language models</article-title><source>PLOS Digit Health</source><year>2023</year><month>02</month><volume>2</volume><issue>2</issue><fpage>e0000198</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id><pub-id pub-id-type="medline">36812645</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tanaka</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Nakata</surname><given-names>T</given-names> </name><name name-style="western"><surname>Aiga</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Performance of generative pretrained transformer on the National Medical Licensing Examination in Japan</article-title><source>PLOS Digit Health</source><year>2024</year><month>01</month><volume>3</volume><issue>1</issue><fpage>e0000433</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000433</pub-id><pub-id pub-id-type="medline">38261580</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Saowaprut</surname><given-names>P</given-names> </name><name name-style="western"><surname>Wabina</surname><given-names>RS</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Siriwat</surname><given-names>L</given-names> </name></person-group><article-title>Performance of large language models on Thailand&#x2019;s National Medical Licensing Examination: a cross-sectional study</article-title><source>J Educ Eval Health Prof</source><year>2025</year><volume>22</volume><fpage>16</fpage><pub-id pub-id-type="doi">10.3352/jeehp.2025.22.16</pub-id><pub-id pub-id-type="medline">40354784</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>F</given-names> </name></person-group><article-title>Qwen-2.5 outperforms other large language models in the Chinese National Nursing Licensing Examination: retrospective cross-sectional comparative study</article-title><source>JMIR Med Inform</source><year>2025</year><month>01</month><day>10</day><volume>13</volume><fpage>e63731</fpage><pub-id pub-id-type="doi">10.2196/63731</pub-id><pub-id pub-id-type="medline">39793017</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tomova</surname><given-names>M</given-names> </name><name name-style="western"><surname>Rosell&#x00F3; Atanet</surname><given-names>I</given-names> </name><name name-style="western"><surname>Sehy</surname><given-names>V</given-names> </name><name name-style="western"><surname>Sieg</surname><given-names>M</given-names> </name><name name-style="western"><surname>M&#x00E4;rz</surname><given-names>M</given-names> </name><name name-style="western"><surname>M&#x00E4;der</surname><given-names>P</given-names> </name></person-group><article-title>Leveraging large language models to construct feedback from medical multiple-choice questions</article-title><source>Sci Rep</source><year>2024</year><month>11</month><day>13</day><volume>14</volume><issue>1</issue><fpage>27910</fpage><pub-id pub-id-type="doi">10.1038/s41598-024-79245-x</pub-id><pub-id pub-id-type="medline">39537899</pub-id></nlm-citation></ref><ref 
id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kondo</surname><given-names>T</given-names> </name><name name-style="western"><surname>Okamoto</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kondo</surname><given-names>Y</given-names> </name></person-group><article-title>Pilot study on using large language models for educational resource development in Japanese Radiological Technologist Exams</article-title><source>MedSciEduc</source><year>2025</year><month>04</month><volume>35</volume><issue>2</issue><fpage>919</fpage><lpage>927</lpage><pub-id pub-id-type="doi">10.1007/s40670-024-02251-1</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sabaner</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Hashas</surname><given-names>ASK</given-names> </name><name name-style="western"><surname>Mutibayraktaroglu</surname><given-names>KM</given-names> </name><name name-style="western"><surname>Yozgat</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Klefter</surname><given-names>ON</given-names> </name><name name-style="western"><surname>Subhi</surname><given-names>Y</given-names> </name></person-group><article-title>The performance of artificial intelligence-based large language models on ophthalmology-related questions in Swedish proficiency test for medicine: ChatGPT-4 omni vs Gemini 1.5 Pro</article-title><source>AJO International</source><year>2024</year><month>12</month><volume>1</volume><issue>4</issue><fpage>100070</fpage><pub-id pub-id-type="doi">10.1016/j.ajoint.2024.100070</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Mistry</surname><given-names>NP</given-names> </name><name name-style="western"><surname>Saeed</surname><given-names>H</given-names> </name><name name-style="western"><surname>Rafique</surname><given-names>S</given-names> </name><name name-style="western"><surname>Le</surname><given-names>T</given-names> </name><name name-style="western"><surname>Obaid</surname><given-names>H</given-names> </name><name name-style="western"><surname>Adams</surname><given-names>SJ</given-names> </name></person-group><article-title>Large language models as tools to generate radiology board-style multiple-choice questions</article-title><source>Acad Radiol</source><year>2024</year><month>09</month><volume>31</volume><issue>9</issue><fpage>3872</fpage><lpage>3878</lpage><pub-id pub-id-type="doi">10.1016/j.acra.2024.06.046</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Brin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sorin</surname><given-names>V</given-names> </name><name name-style="western"><surname>Konen</surname><given-names>E</given-names> </name><name name-style="western"><surname>Nadkarni</surname><given-names>G</given-names> </name><name name-style="western"><surname>Glicksberg</surname><given-names>BS</given-names> </name><name name-style="western"><surname>Klang</surname><given-names>E</given-names> </name></person-group><article-title>How large language models perform on the United States Medical Licensing Examination: a systematic review</article-title><source>medRxiv</source><comment>Preprint posted online on 2023</comment><pub-id pub-id-type="doi">10.1101/2023.09.03.23294842</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Zong</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>R</given-names> </name><name name-style="western"><surname>Cha</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Large language models in worldwide medical exams: platform development and comprehensive analysis</article-title><source>J Med Internet Res</source><year>2024</year><month>12</month><day>27</day><volume>26</volume><fpage>e66114</fpage><pub-id pub-id-type="doi">10.2196/66114</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Roso&#x0142;</surname><given-names>M</given-names> </name><name name-style="western"><surname>G&#x0105;sior</surname><given-names>JS</given-names> </name><name name-style="western"><surname>&#x0141;aba</surname><given-names>J</given-names> </name><name name-style="western"><surname>Korzeniewski</surname><given-names>K</given-names> </name><name name-style="western"><surname>M&#x0142;y&#x0144;czak</surname><given-names>M</given-names> </name></person-group><article-title>Evaluation of the performance of GPT-3.5 and GPT-4 on the Polish Medical Final Examination</article-title><source>Sci Rep</source><year>2023</year><month>11</month><day>22</day><volume>13</volume><issue>1</issue><fpage>20512</fpage><pub-id pub-id-type="doi">10.1038/s41598-023-46995-z</pub-id><pub-id pub-id-type="medline">37993519</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>JK</given-names> </name><name name-style="western"><surname>Chua</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lorenzo</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Use of 
AI (GPT-4)-generated multiple-choice questions for the examination of surgical subspecialty residents</article-title><source>CUAJ</source><year>2025</year><month>Winter</month><volume>19</volume><issue>6</issue><fpage>9020</fpage><pub-id pub-id-type="doi">10.5489/cuaj.9020</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhong</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Potential of multimodal large language models for data mining of medical images and free-text reports</article-title><source>Meta-Radiology</source><year>2024</year><month>12</month><volume>2</volume><issue>4</issue><fpage>100103</fpage><pub-id pub-id-type="doi">10.1016/j.metrad.2024.100103</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Soni</surname><given-names>N</given-names> </name><name name-style="western"><surname>Ora</surname><given-names>M</given-names> </name><name name-style="western"><surname>Agarwal</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>T</given-names> </name><name name-style="western"><surname>Bathla</surname><given-names>G</given-names> </name></person-group><article-title>A review of the opportunities and challenges with large language models in radiology: the road ahead</article-title><source>AJNR Am J Neuroradiol</source><year>2025</year><month>07</month><day>1</day><volume>46</volume><issue>7</issue><fpage>ajnr</fpage><pub-id pub-id-type="doi">10.3174/ajnr.A8589</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alsabbagh</surname><given-names>AR</given-names> </name><name name-style="western"><surname>Mansour</surname><given-names>T</given-names> </name><name name-style="western"><surname>Al-Kharabsheh</surname><given-names>M</given-names> </name><etal/></person-group><article-title>MiniMedGPT: efficient large vision&#x2013;language model for Medical Visual Question Answering</article-title><source>Pattern Recognit Lett</source><year>2025</year><month>03</month><volume>189</volume><fpage>8</fpage><lpage>16</lpage><pub-id pub-id-type="doi">10.1016/j.patrec.2025.01.001</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><name name-style="western"><surname>Elhoseiny</surname><given-names>M</given-names> </name></person-group><article-title>MiniGPT-4: enhancing vision-language understanding with advanced large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 2, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2304.10592</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>P</given-names> </name><name name-style="western"><surname>Zang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>InternLM-xcomposer2-4KHD: a pioneering large vision-language model handling resolutions from 336 pixels to 4K HD</article-title><source>Adv Neural Inf Process Syst</source><comment>Preprint posted online on  Apr 9, 
2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2404.06512</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Fusion side tuning: a parameter and memory efficient fine-tuning method for high-resolution medical image classification</article-title><conf-name>2024 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)</conf-name><conf-date>Dec 3-6, 2024</conf-date><conf-loc>Lisbon, Portugal</conf-loc><publisher-name>IEEE</publisher-name><pub-id pub-id-type="doi">10.1109/BIBM62325.2024.10821946</pub-id><pub-id pub-id-type="medline">40989005</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>Gemini Team Google</collab><name name-style="western"><surname>Georgiev</surname><given-names>P</given-names> </name><name name-style="western"><surname>Lei</surname><given-names>VI</given-names> </name><etal/></person-group><article-title>Gemini 1.5: unlocking multimodal understanding across millions of tokens of context</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 16, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2403.05530</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Boostani</surname><given-names>M</given-names> </name><name name-style="western"><surname>B&#x00E1;nv&#x00F6;lgyi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Goldust</surname><given-names>M</given-names> 
</name><etal/></person-group><article-title>Diagnostic performance of GPT-4o and Gemini Flash 2.0 in acne and rosacea</article-title><source>Int J Dermatol</source><year>2025</year><month>10</month><volume>64</volume><issue>10</issue><fpage>1881</fpage><lpage>1882</lpage><pub-id pub-id-type="doi">10.1111/ijd.17729</pub-id><pub-id pub-id-type="medline">40064599</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sarangi</surname><given-names>PK</given-names> </name><name name-style="western"><surname>Datta</surname><given-names>S</given-names> </name><name name-style="western"><surname>Panda</surname><given-names>BB</given-names> </name><name name-style="western"><surname>Panda</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mondal</surname><given-names>H</given-names> </name></person-group><article-title>Evaluating ChatGPT-4&#x2019;s performance in Identifying Radiological Anatomy in FRCR Part 1 Examination Questions</article-title><source>Indian J Radiol Imaging</source><year>2025</year><month>04</month><volume>35</volume><issue>02</issue><fpage>287</fpage><lpage>294</lpage><pub-id pub-id-type="doi">10.1055/s-0044-1792040</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sarangi</surname><given-names>PK</given-names> </name><name name-style="western"><surname>Narayan</surname><given-names>RK</given-names> </name><name name-style="western"><surname>Mohakud</surname><given-names>S</given-names> </name><name name-style="western"><surname>Vats</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sahani</surname><given-names>D</given-names> </name><name name-style="western"><surname>Mondal</surname><given-names>H</given-names> 
</name></person-group><article-title>Assessing the capability of ChatGPT, Google Bard, and Microsoft Bing in solving Radiology case vignettes</article-title><source>Indian J Radiol Imaging</source><year>2024</year><month>04</month><volume>34</volume><issue>2</issue><fpage>276</fpage><lpage>282</lpage><pub-id pub-id-type="doi">10.1055/s-0043-1777746</pub-id><pub-id pub-id-type="medline">38549897</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Morse</surname><given-names>K</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Assessing the potential of USMLE-like exam questions generated by GPT-4</article-title><source>medRxiv</source><comment>Preprint posted online on  Apr 28, 2023</comment><pub-id pub-id-type="doi">10.1101/2023.04.25.23288588</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Rizwan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Rogoza</surname><given-names>N</given-names> </name><name name-style="western"><surname>Chung</surname><given-names>AD</given-names> </name><name name-style="western"><surname>Kwan</surname><given-names>BY</given-names> </name></person-group><article-title>Differentiating between GPT-generated and human-written feedback for radiology residents</article-title><source>Curr Probl Diagn Radiol</source><year>2025</year><volume>54</volume><issue>5</issue><fpage>574</fpage><lpage>578</lpage><pub-id pub-id-type="doi">10.1067/j.cpradiol.2025.02.002</pub-id><pub-id pub-id-type="medline">39984362</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kuusemets</surname><given-names>L</given-names> </name><name name-style="western"><surname>Parve</surname><given-names>K</given-names> </name><name name-style="western"><surname>Ain</surname><given-names>K</given-names> </name><name name-style="western"><surname>Kraav</surname><given-names>T</given-names> </name></person-group><article-title>Assessing AI-generated (GPT-4) versus human created MCQs in mathematics education: a comparative inquiry into vector topics</article-title><source>IJEMST</source><year>2024</year><volume>12</volume><issue>6</issue><fpage>1538</fpage><lpage>1558</lpage><pub-id pub-id-type="doi">10.46328/ijemst.4440</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ma</surname><given-names>W</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>C</given-names> </name><name name-style="western"><surname>K&#x00E4;stner</surname><given-names>C</given-names> </name></person-group><article-title>(Why) is my prompt getting worse? Rethinking regression testing for evolving LLM APIs</article-title><conf-name>Proceedings of the IEEE/ACM 3rd International Conference on AI Engineering-Software Engineering for AI</conf-name><conf-date>Apr 14, 2024</conf-date><conf-loc>Lisbon, Portugal</conf-loc><fpage>166</fpage><lpage>171</lpage><pub-id pub-id-type="doi">10.1145/3644815.3644950</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Schroeder</surname><given-names>K</given-names> </name><name name-style="western"><surname>Wood-Doughty</surname><given-names>Z</given-names> </name></person-group><article-title>Can you trust LLM judgments? 
Reliability of LLM-as-a-judge</article-title><source>arXiv</source><comment>Preprint posted online on Feb 18, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2412.12509</pub-id><pub-id pub-id-type="medline">38076521</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Renze</surname><given-names>M</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Al-Onaizan</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Bansal</surname><given-names>M</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>YN</given-names> </name></person-group><article-title>The effect of sampling temperature on problem solving in large language models</article-title><source>Findings of the Association for Computational Linguistics: EMNLP 2024</source><year>2024</year><access-date>2025-08-01</access-date><publisher-name>Association for Computational Linguistics</publisher-name><fpage>7346</fpage><lpage>7356</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2024.findings-emnlp.432">https://aclanthology.org/2024.findings-emnlp.432</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2024.findings-emnlp.432</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Breakdown of the 2025 Japanese National Exam Questions by Subject.</p><media xlink:href="mededu_v11i1e81807_app1.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Operational definitions and decision rules for item evaluation.</p><media xlink:href="mededu_v11i1e81807_app2.docx" xlink:title="DOCX File, 18 KB"/></supplementary-material></app-group></back></article>