<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id><journal-id journal-id-type="publisher-id">mededu</journal-id><journal-id journal-id-type="index">20</journal-id><journal-title>JMIR Medical Education</journal-title><abbrev-journal-title>JMIR Med Educ</abbrev-journal-title><issn pub-type="epub">2369-3762</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v11i1e69313</article-id><article-id pub-id-type="doi">10.2196/69313</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Role of Artificial Intelligence in Surgical Training by Assessing GPT-4 and GPT-4o on the Japan Surgical Board Examination With Text-Only and Image-Accompanied Questions: Performance Evaluation Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Maruyama</surname><given-names>Hiroki</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Toyama</surname><given-names>Yoshitaka</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Takanami</surname><given-names>Kentaro</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Takase</surname><given-names>Kei</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kamei</surname><given-names>Takashi</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Surgery, Tohoku University Graduate School of Medicine</institution><addr-line>Sendai</addr-line><country>Japan</country></aff><aff id="aff2"><institution>Department of Diagnostic Radiology, Tohoku University Hospital</institution><addr-line>1-1 Seiryo-Machi, Aoba-Ku, Sendai, Japan</addr-line><addr-line>Sendai</addr-line><country>Japan</country></aff><aff id="aff3"><institution>Department of Diagnostic Radiology, Tohoku University Graduate School of Medicine</institution><addr-line>Sendai</addr-line><country>Japan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Lesselroth</surname><given-names>Blake</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Sendra-Portero</surname><given-names>Francisco</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Muraoka</surname><given-names>Kosuke</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Pal</surname><given-names>Soumen</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to  Yoshitaka Toyama, MD, PhD, Department of Diagnostic Radiology, Tohoku University Hospital, 1-1 Seiryo-Machi, Aoba-Ku, Sendai, Japan, Sendai, 980-8575, Japan, 81 227177312; <email>ytoyama0818@gmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date 
pub-type="epub"><day>30</day><month>7</month><year>2025</year></pub-date><volume>11</volume><elocation-id>e69313</elocation-id><history><date date-type="received"><day>02</day><month>12</month><year>2024</year></date><date date-type="rev-recd"><day>07</day><month>05</month><year>2025</year></date><date date-type="accepted"><day>01</day><month>06</month><year>2025</year></date></history><copyright-statement>&#x00A9; Hiroki Maruyama, Yoshitaka Toyama, Kentaro Takanami, Kei Takase, Takashi Kamei. Originally published in JMIR Medical Education (<ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org">https://mededu.jmir.org</ext-link>), 30.7.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org/">https://mededu.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mededu.jmir.org/2025/1/e69313"/><abstract><sec><title>Background</title><p>Artificial intelligence and large language models (LLMs)&#x2014;particularly GPT-4 and GPT-4o&#x2014;have demonstrated high correct-answer rates in medical examinations. GPT-4o has enhanced diagnostic capabilities, advanced image processing, and updated knowledge. 
Japanese surgeons face critical challenges, including a declining workforce, regional health care disparities, and work-hour-related challenges. Nonetheless, although LLMs could be beneficial in surgical education, no studies have yet assessed GPT-4o&#x2019;s surgical knowledge or its performance in the field of surgery.</p></sec><sec><title>Objective</title><p>This study aims to evaluate the potential of GPT-4 and GPT-4o in surgical education by using them to take the Japan Surgical Board Examination (JSBE), which includes both textual questions and medical images&#x2014;such as surgical and computed tomography scans&#x2014;to comprehensively assess their surgical knowledge.</p></sec><sec sec-type="methods"><title>Methods</title><p>We used 297 multiple-choice questions from the 2021&#x2010;2023 JSBEs. The questions were in Japanese, and 104 of them included images. First, the GPT-4 and GPT-4o responses to only the textual questions were collected via OpenAI&#x2019;s application programming interface to evaluate their correct-answer rate. Subsequently, the correct-answer rate of their responses to questions that included images was assessed by inputting both text and images.</p></sec><sec sec-type="results"><title>Results</title><p>The overall correct-answer rates of GPT-4o and GPT-4 for the text-only questions were 78% (231/297) and 55% (163/297), respectively, with GPT-4o outperforming GPT-4 by 23% (<italic>P</italic>&#x003C;.01). By contrast, there was no significant improvement in the correct-answer rate for questions that included images compared with the results for the text-only questions.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>GPT-4o outperformed GPT-4 on the JSBE. However, the results of the LLMs were lower than those of the examinees. 
Despite the capabilities of LLMs, image recognition remains a challenge for them, and their clinical application requires caution owing to the potential inaccuracy of their results.</p></sec></abstract><kwd-group><kwd>LLM</kwd><kwd>ChatGPT</kwd><kwd>Japan Surgical Board Examination</kwd><kwd>surgical education</kwd><kwd>large language models</kwd><kwd>artificial intelligence</kwd><kwd>Medical Licensing Examination</kwd><kwd>diagnostic imaging</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Surgical training requires a considerable time commitment, as it includes various educational activities, on-the-job training, and supervised clinical experience [<xref ref-type="bibr" rid="ref1">1</xref>]. In Japan, the surgery field is facing many challenges, such as the declining numbers of surgeons, regional health care disparities [<xref ref-type="bibr" rid="ref2">2</xref>], and working-hour&#x2013;related challenges [<xref ref-type="bibr" rid="ref3">3</xref>]. Consequently, it is important to understand whether new technologies such as artificial intelligence (AI) and large language models (LLMs) can augment surgery education and training [<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>LLMs are AI systems trained on billions of words from papers, books, and other internet sources. ChatGPT&#x2014;released by OpenAI in November 2022&#x2014;is a generative AI chatbot that supports multimodal inputs and text generation, with a GPT as its backend [<xref ref-type="bibr" rid="ref5">5</xref>]. 
ChatGPT has achieved conversational interactivity and human-like or better correct-answer rate across various fields&#x2014;including the medical field [<xref ref-type="bibr" rid="ref6">6</xref>]&#x2014;suggesting that LLM applications could be beneficial in clinical, educational, and research settings [<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>GPT-4&#x2014;released in March 2023&#x2014;achieved an excellent correct-answer rate for United States Medical Licensing Examination (USMLE)-style questions, exceeding the passing threshold of 60% [<xref ref-type="bibr" rid="ref8">8</xref>]. Moreover, in the field of surgery, GPT-3.5 obtained a 65% correct-answer rate for the US General Surgery Specialist Examination [<xref ref-type="bibr" rid="ref9">9</xref>], and GPT-4 achieved a 76% correct-answer rate for the Korean Surgical Specialist Examination [<xref ref-type="bibr" rid="ref10">10</xref>]. However, GPT-4 does not include an image-recognition function; consequently, questions that included images were excluded from both of these studies. To the best of our knowledge, no previous study has yet evaluated the correct-answer rate of LLMs on the Japan Surgical Board Examination (JSBE).</p><p>GPT-4-Vision (GPT-4V)&#x2014;an improved version of GPT-4 with image-processing capabilities [<xref ref-type="bibr" rid="ref6">6</xref>]&#x2014;can process and interpret images along with text data, extending its potential application to areas that require image analysis. When both text- and image-based questions from the USMLE were input into GPT-4V, its correct-answer rate improved from 83.6% to 90.7% [<xref ref-type="bibr" rid="ref11">11</xref>]. 
However, there have been no reports on the functional evaluation of AI in the field of surgery that includes image evaluations.</p><p>GPT-4 Omni (GPT-4o)&#x2014;released in May 2024&#x2014;features a considerably faster processing speed than GPT-4 and includes many upgrades, such as its improved non-English-language processing and enhanced visual and speech understanding [<xref ref-type="bibr" rid="ref12">12</xref>]. Additionally, the GPT-4o knowledge base has been updated with data up to October 2023, enabling it to offer more accurate answers based on recent information and accurate text generation [<xref ref-type="bibr" rid="ref13">13</xref>]. Several reports have evaluated the performance of Chat-GPT4o using medical examinations, but only 3 reports have evaluated the effectiveness of image input in addition to text [<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref16">16</xref>]. Moreover, no study has yet evaluated it in the field of surgery. In many cases, diagnostic imaging plays an important role in surgical treatment plans, and specific images&#x2014;such as intraoperative imaging findings&#x2014;are sometimes used. Consequently, evaluating how LLMs handle surgery-specific images is critical for understanding their current capabilities. If LLMs have a high level of knowledge related to surgery-specific images, they have the potential to be effective tools in real clinical practice and surgical education.</p><p>There have been few reports evaluating the extent to which LLMs, such as GPT, possess surgical knowledge, particularly in relation to interpreting surgical images&#x2014;a skill essential for clinical decision-making. This study aims to assess and compare the performance of GPT-4 and GPT-4o on JSBE, focusing not only on general surgical knowledge but also on image recognition and diagnostic accuracy. We examined the models&#x2019; responses to text-only and text-with-image questions using a retrospective evaluation design. 
We hypothesized that GPT-4o would outperform GPT-4, particularly on image-based questions. The findings of this study should be useful for medical educators and AI researchers seeking to understand the capabilities and limitations of LLMs in surgical education and training.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Question Dataset</title><p>This study used multiple-choice questions from the 2021&#x2010;2023 JSBE published by the Japan Surgical Society. Each question had five possible choices, with some requiring a single answer and others requiring two. The responses for the two-answer questions were deemed correct only if both correct answers were selected. The number of answers required was specified in the input text. Electronic versions of previous papers that were available for sale were also used. The Japan Surgical Society granted permission to answer these questions.</p><p>The JSBE is a multidisciplinary surgical knowledge examination designed for senior resident doctors in Japan who have completed a 3-year surgical training program. The number of examinees, successful candidates, and pass rates are listed in <xref ref-type="table" rid="table1">Table 1</xref>. There were 100 questions in each year, but 1 question in 2022 and 2 questions in 2023 were excluded as inappropriate questions, so a total of 297 questions were used in this study. The questions were presented in Japanese. To evaluate and compare responses to text-with-image questions, text-only questions were also included in the study. The text, obtained from an electronic question booklet, was entered into the models in an Extensible Markup Language (XML) format. 
Moreover, screenshots of the test images were obtained from the booklet and saved in JPEG format, with their captions also being included.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Annual test results of the Japan Surgical Board Examination.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Year</td><td align="left" valign="bottom">Examinees</td><td align="left" valign="bottom">Successful examinees</td><td align="left" valign="bottom">Pass rate (%)</td><td align="left" valign="bottom">Correct-answer rate (%)</td></tr></thead><tbody><tr><td align="left" valign="top">2021</td><td align="left" valign="top">289</td><td align="left" valign="top">261</td><td align="left" valign="top">90.3</td><td align="left" valign="top">84.2</td></tr><tr><td align="left" valign="top">2022</td><td align="left" valign="top">1594</td><td align="left" valign="top">1534</td><td align="left" valign="top">96.2</td><td align="left" valign="top">92.7</td></tr><tr><td align="left" valign="top">2023</td><td align="left" valign="top">835</td><td align="left" valign="top">814</td><td align="left" valign="top">97.5</td><td align="left" valign="top">92.7</td></tr></tbody></table></table-wrap><p>Questions with multiple images were exported and combined into a single image (<xref ref-type="fig" rid="figure1">Figure 1</xref>). The correct answers were also obtained from the electronic question booklet. The percentage of correct answers for each topic was calculated based on the number of correct answers provided by actual examinees for each question.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Collection of data from JSBE and input into GPT models. The questions were entered into an electronic booklet in Japanese. The images were saved as screenshots and input into ChatGPT-4o. 
JSBE: Japan Surgical Board Examination.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v11i1e69313_fig01.png"/></fig></sec><sec id="s2-2"><title>Question Classification</title><p>The questions were classified based on whether they included images. Of the 297 questions, 104 included images (text-with-image questions), and the remaining 193 were text-only questions. They were grouped into 6 categories&#x2014;that is, gastrointestinal surgery (134/297, 45.1%), cardiovascular surgery (44/297, 14.8%), thoracic surgery (30/297, 10.1%), pediatric surgery (30/297, 10.1%), breast and endocrine surgery (30/297, 10.1%), and emergency anesthesiology (29/297, 9.8%). The number of image modalities and images per question was as follows: the highest number of questions included computed tomography (CT) images (44/104, 42.3%), followed by endoscopy (15/104, 14.4%), and ultrasound (13/104, 12.5%) images. Additionally, X-ray (10/104, 9.6%), radiofluoroscopy (10/104, 9.6%), magnetic resonance imaging (MRI; 8/104, 7.7%), surface of skin findings (8/104, 7.7%), positron emission tomography (6/104, 5.8%), intraoperative findings (5/104, 4.8%), pathology (5/104, 4.8%), and other modality images (5/104, 4.8%) were also included. Furthermore, 44 out of 104 questions included 1 image (42.3%), 42/104 included 2 images (40.4%), 11/104 included 3 images (10.6%), 6/104 included 4 images (5.8%), and 1/104 included 6 images (which was the maximum; 1.0%). The percentage of correct answers for each topic was calculated based on the percentage of correct answers to the individual questions and was then compared with the percentages of correct answers provided by both GPT-4 and GPT-4o.</p></sec><sec id="s2-3"><title>Data Collection and Assessment</title><p>We used the GPT-4 and GPT-4o models via OpenAI&#x2019;s application programming interface (API) without additional fine-tuning or custom configuration. 
All parameters were maintained at their default setting. No pretraining or fine-tuning was conducted, and no custom persona was provided. The questions were submitted via the OpenAI API in June 2024, and the GPT-4 and GPT-4o responses were collected. The internal GPT-4o and GPT-4 versions used in this study were gpt-4o-2024-05-13 and gpt-4-turbo-2024-04-09, respectively. GPT-4o was trained on data up to October 2023, whereas GPT-4 was trained on data up to December 2023 [<xref ref-type="bibr" rid="ref17">17</xref>].</p><p>A maximum token limit of 4096 tokens was assumed, consistent with the default for many GPT-based API end points. All other parameters&#x2014;for example, the temperature, top_p, and frequency penalty&#x2014;were kept at their default settings.</p><p>All test questions were presented in Japanese. To ensure consistent model behavior and clear response formatting, the following English-language prompt was placed before each question: &#x201C;Please answer the following question. Indicate the symbol of the option selected by you at the end.&#x201D; This prompt was immediately followed by the question text in Japanese. This structure aimed to maintain response consistency across test items. The prompting strategy primarily followed a zero-shot format.</p><p>Questions with images were assessed twice&#x2014;that is, once with and once without images. The answers that matched those in the question booklet were considered correct. Moreover, the percentage of correct answers was calculated for each image modality with the questions, and the percentage of correct answers was calculated based on the number of images included in the question. The percentage of correct answers for GPT-4o and GPT-4 were compared for all questions, text-only questions, and text-with-image questions, by the question category, image modality used, and number of images. 
For category-by-category comparisons, only the results for questions with image inputs were compared, but for the other items, the results with and without image inputs were also compared.</p></sec><sec id="s2-4"><title>Statistical Analyses</title><p>McNemar test was used to compare the proportion of correct responses between the GPT-4 and GPT-4o. Fisher exact test was used for each category&#x2014;that is, with or without images, lower-order thinking versus higher-order thinking, and 2 answers versus 1 answer&#x2014;to assess the GPT-4o correct-answer rate for each category. Additionally, a chi-square test was conducted to compare grades across topics. All tests were 2-tailed, and <italic>P</italic> values&#x003C;.05 were considered significant. All <italic>P</italic> values were nominal and were not corrected for multiple comparisons. The statistical analyses were conducted using JMP Pro 17.0 (SAS Institute Inc).</p></sec><sec id="s2-5"><title>Ethical Considerations</title><p>This study did not include human participants or patient data. All the data used in this study are publicly available. Therefore, it was excluded from review by the Institutional Review Board of Tohoku University (IRB number 11000629).</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Correct-Answer Rates of GPT-4 and GPT-4o</title><p>Of the 297 questions used for the text-input only test, GPT-4 answered 164 (55%) correctly and GPT-4o answered 225 (76%) correctly; thus, GPT-4o outperformed GPT-4 by 21% (<italic>P</italic>&#x003C;.001). Additionally, when image inputs were performed for the 104 text-with-image questions out of the 297 questions, GPT-4o outperformed GPT-4 by 23% (<italic>P</italic>&#x003C;.001), providing 231 (78%) and 163 (55%) correct answers, respectively. 
Comparisons of their correct-answer rates for individual groups showed that GPT-4o provided significantly more correct answers for image-based questions (GPT-4o 67% vs GPT-4 45%; <italic>P</italic>&#x003C;.002), text-only questions (83% vs 61%; <italic>P</italic>&#x003C;.001), digestive surgery (70% vs 42%; <italic>P</italic>&#x003C;.001), cardiovascular surgery (98% vs 68%; <italic>P</italic>&#x003C;.00031), and breast and endocrine surgery (93% vs 67%; <italic>P</italic>=.0047). However, no significant differences (GPT-4o vs GPT-4) were evident between their correct-answer rates for questions related to thoracic surgery (63% vs 57%; <italic>P</italic>=.41), emergency surgery and anesthesia (86% vs 66%; <italic>P</italic>=.06), and pediatric surgery (73% vs 70%; <italic>P</italic>=.71). Notably, GPT-4o provided more correct responses than GPT-4 (<xref ref-type="table" rid="table2">Table 2</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>GPT-4 and GPT-4o correct-answer rates for the Japan Surgical Board Examination.<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Question type</td><td align="left" valign="bottom">Number of questions</td><td align="left" valign="bottom">Image input</td><td align="left" valign="bottom" colspan="2">Correct-answer rate</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">GPT-4, n (%)</td><td align="left" valign="top">GPT-4o, n (%)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">All questions</td><td align="left" valign="top">297</td><td align="left" valign="top">&#x2013;</td><td align="left" valign="top">164 (55)</td><td align="left" valign="top">225 (76)</td><td align="left" 
valign="top">.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">+</td><td align="left" valign="top">163 (55)</td><td align="left" valign="top">231 (78)</td><td align="left" valign="top">.001</td></tr><tr><td align="left" valign="top">Text-with-image questions</td><td align="left" valign="top">104</td><td align="left" valign="top">&#x2013;</td><td align="left" valign="top">46 (44)</td><td align="left" valign="top">64 (62)</td><td align="left" valign="top">.002</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">+</td><td align="left" valign="top">47 (45)</td><td align="left" valign="top">70 (67)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Text-only questions</td><td align="left" valign="top">193</td><td align="left" valign="top">&#x2013;</td><td align="left" valign="top">118 (61)</td><td align="left" valign="top">161 (83)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="6">Topic</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Digestive surgery</td><td align="left" valign="top">134</td><td align="left" valign="top">+</td><td align="left" valign="top">56 (42)</td><td align="left" valign="top">94 (70)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Cardiovascular surgery</td><td align="left" valign="top">44</td><td align="left" valign="top">+</td><td align="left" valign="top">30 (68)</td><td align="left" valign="top">43 (98)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Thoracic surgery</td><td align="left" valign="top">30</td><td 
align="left" valign="top">+</td><td align="left" valign="top">17 (57)</td><td align="left" valign="top">19 (63)</td><td align="left" valign="top">.41</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Pediatric surgery</td><td align="left" valign="top">30</td><td align="left" valign="top">+</td><td align="left" valign="top">21 (70)</td><td align="left" valign="top">22 (73)</td><td align="left" valign="top">.71</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Breast and endocrine surgery</td><td align="left" valign="top">30</td><td align="left" valign="top">+</td><td align="left" valign="top">20 (67)</td><td align="left" valign="top">28 (93)</td><td align="left" valign="top">.005</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Emergency and anesthesia</td><td align="left" valign="top">29</td><td align="left" valign="top">+</td><td align="left" valign="top">19 (66)</td><td align="left" valign="top">25 (86)</td><td align="left" valign="top">.06</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Data are presented as the number of correct answers.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>GPT-4o&#x2019;s Correct-Answer Rate on Text-With-Image Questions Compared With its Rate on Text-Only Questions</title><p>Even when image inputs were used for the text-with-image questions, GPT-4o provided 67% correct answers, compared to 83% for the text-only questions, indicating a statistically significant difference (<italic>P</italic>&#x003C;.002).</p></sec><sec id="s3-3"><title>Correct-Answer Rate With and Without Image Input</title><p>The percentages of correct responses provided by both models with and without image inputs were compared for 104 text-with-image questions&#x2014;here, GPT-4o 
provided correct-answer rates of 67% and 62% with and without image inputs (<italic>P</italic>=.2), respectively, whereas GPT-4 provided correct-answer rates of 45% and 44% with and without image inputs (<italic>P</italic>=.86; <xref ref-type="table" rid="table3">Table 3</xref>).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>GPT-4 and GPT-4o correct-answer rates based on image-input and no image-input questions.<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Large language model</td><td align="left" valign="bottom">Input image, n (%)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="bottom">No input image, n (%)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">GPT-4</td><td align="left" valign="top">47 (45)</td><td align="left" valign="top">46 (44)</td><td align="left" valign="top">.86</td></tr><tr><td align="left" valign="top">GPT-4o</td><td align="left" valign="top">70 (67)</td><td align="left" valign="top">64 (62)</td><td align="left" valign="top">.20</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Data are presented as the number of correct answers. Values in parentheses indicate the percentage of correct responses.</p></fn><fn id="table3fn2"><p><sup>b</sup>The percentage indicates the percentage of correct answers to the 104 text-with-image questions.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-4"><title>Correct-Answer Rate Comparison of GPT-4 and GPT-4o by Category</title><p>GPT-4 provided the highest percentage of correct answers for pediatric-surgery questions (70%) and the lowest for gastrointestinal-surgery questions (19%; <italic>P</italic>=.0027). 
By contrast, GPT-4o provided the highest percentage of correct answers for cardiovascular-surgery questions (98%) and the lowest for thoracic-surgery questions (63%; <italic>P</italic>=.002). The correct-answer rate for the examinees referred to here is the correct-answer rate for all examinees from 2021 to 2023 (<xref ref-type="table" rid="table4">Table 4</xref>).</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>GPT-4o, GPT-4, and examinees&#x2019; correct-answer rates across various categories.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" rowspan="2">Topic</td><td align="left" valign="bottom" rowspan="2">Number of questions</td><td align="left" valign="bottom" rowspan="2">Text-with-image questions, n (%)</td><td align="left" valign="bottom" colspan="3">Correct-answer rate (%)</td></tr><tr><td align="left" valign="bottom">GPT-4</td><td align="left" valign="bottom">GPT-4o</td><td align="left" valign="bottom">Examinees</td></tr></thead><tbody><tr><td align="left" valign="top">All questions</td><td align="left" valign="top">297</td><td align="left" valign="top">104 (35)</td><td align="left" valign="top">55</td><td align="left" valign="top">78</td><td align="left" valign="top">90</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Digestive surgery</td><td align="left" valign="top">134</td><td align="left" valign="top">43 (32)</td><td align="left" valign="top">42</td><td align="left" valign="top">70</td><td align="left" valign="top">89</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Cardiovascular surgery</td><td align="left" valign="top">44</td><td align="left" valign="top">19 (43)</td><td align="left" valign="top">68</td><td align="left" valign="top">98</td><td align="left" valign="top">91</td></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Thoracic surgery</td><td align="left" valign="top">30</td><td align="left" valign="top">17 (57)</td><td align="left" valign="top">57</td><td align="left" valign="top">63</td><td align="left" valign="top">88</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Pediatric surgery</td><td align="left" valign="top">30</td><td align="left" valign="top">11 (37)</td><td align="left" valign="top">70</td><td align="left" valign="top">73</td><td align="left" valign="top">92</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Breast and endocrine surgery</td><td align="left" valign="top">30</td><td align="left" valign="top">7 (23)</td><td align="left" valign="top">67</td><td align="left" valign="top">93</td><td align="left" valign="top">90</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Emergency and anesthesia</td><td align="left" valign="top">29</td><td align="left" valign="top">7 (24)</td><td align="left" valign="top">66</td><td align="left" valign="top">86</td><td align="left" valign="top">91</td></tr><tr><td align="left" valign="top"><italic>P</italic> value</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">.003</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top"/></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-5"><title>Comparison of GPT-4 and GPT-4o Responses by Image Modality and Number of Figures</title><p>Using the text-with-image 
questions, the correct-answer rates for GPT-4 and GPT-4o were compared using various imaging modalities and images. GPT-4 provided the highest percentage of correct answers for questions on radiofluoroscopy and inspection (70% and 75%, respectively), whereas GPT-4o provided the highest percentage of correct answers for radiofluoroscopy and ultrasound (80% and 92%, respectively). By contrast, the correct-answer rates of the models were low for questions that included intraoperative and pathological findings&#x2014;that is, they were 20% and 40% for GPT-4, respectively, and 40% for GPT-4o for both intraoperative and pathological findings. Moreover, a weak negative correlation was evident between the number of images and the percentage of correct answers, but it was not statistically significant (<xref ref-type="table" rid="table5">Table 5</xref>).</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Correct-answer rate comparisons based on imaging modality and number of images.<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Variables</td><td align="left" valign="bottom">n</td><td align="left" valign="bottom" colspan="4">Correct-answer rate, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top" colspan="2">GPT-4</td><td align="left" valign="top" colspan="2">GPT-4o</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Image input +</td><td align="left" valign="top">Image input &#x2212;</td><td align="left" valign="top">Image input +</td><td align="left" valign="top">Image input &#x2212;</td></tr><tr><td align="left" valign="top" colspan="6">Imaging modality</td></tr><tr><td align="left" valign="top">&#x2003;Text-with-image questions</td><td align="left" valign="top">104</td><td align="left" valign="top">47 
(45)</td><td align="left" valign="top">46 (44)</td><td align="left" valign="top">70 (67)</td><td align="left" valign="top">64 (62)</td></tr><tr><td align="left" valign="top">&#x2003;XP<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup></td><td align="left" valign="top">10</td><td align="left" valign="top">5 (50)</td><td align="left" valign="top">4 (40)</td><td align="left" valign="top">6 (60)</td><td align="left" valign="top">5 (50)</td></tr><tr><td align="left" valign="top">&#x2003;Radiofluoroscopy</td><td align="left" valign="top">10</td><td align="left" valign="top">7 (70)</td><td align="left" valign="top">6 (60)</td><td align="left" valign="top">8 (80)</td><td align="left" valign="top">6 (60)</td></tr><tr><td align="left" valign="top">&#x2003;Ultrasound</td><td align="left" valign="top">13</td><td align="left" valign="top">6 (46)</td><td align="left" valign="top">8 (62)</td><td align="left" valign="top">12 (92)</td><td align="left" valign="top">10 (77)</td></tr><tr><td align="left" valign="top">&#x2003;CT<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="left" valign="top">60</td><td align="left" valign="top">25 (42)</td><td align="left" valign="top">27 (45)</td><td align="left" valign="top">39 (65)</td><td align="left" valign="top">37 (62)</td></tr><tr><td align="left" valign="top">&#x2003;MRI<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup></td><td align="left" valign="top">8</td><td align="left" valign="top">5 (63)</td><td align="left" valign="top">2 (25)</td><td align="left" valign="top">5 (63)</td><td align="left" valign="top">4 (50)</td></tr><tr><td align="left" valign="top">&#x2003;PET<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td><td align="left" valign="top">6</td><td align="left" valign="top">4 (67)</td><td align="left" valign="top">2 (33)</td><td align="left" valign="top">3 (50)</td><td align="left" valign="top">2 (33)</td></tr><tr><td align="left" 
valign="top">&#x2003;Endoscopy</td><td align="left" valign="top">15</td><td align="left" valign="top">9 (60)</td><td align="left" valign="top">8 (53)</td><td align="left" valign="top">9 (60)</td><td align="left" valign="top">11 (73)</td></tr><tr><td align="left" valign="top">&#x2003;Surface of skin findings</td><td align="left" valign="top">8</td><td align="left" valign="top">6 (75)</td><td align="left" valign="top">4 (50)</td><td align="left" valign="top">5 (63)</td><td align="left" valign="top">6 (75)</td></tr><tr><td align="left" valign="top">&#x2003;Intraoperative findings</td><td align="left" valign="top">5</td><td align="left" valign="top">1 (20)</td><td align="left" valign="top">2 (40)</td><td align="left" valign="top">2 (40)</td><td align="left" valign="top">3 (60)</td></tr><tr><td align="left" valign="top">&#x2003;Pathology</td><td align="left" valign="top">5</td><td align="left" valign="top">2 (40)</td><td align="left" valign="top">2 (40)</td><td align="left" valign="top">2 (40)</td><td align="left" valign="top">2 (40)</td></tr><tr><td align="left" valign="top">&#x2003;Other</td><td align="left" valign="top">5</td><td align="left" valign="top">2 (40)</td><td align="left" valign="top">3 (60)</td><td align="left" valign="top">5 (100)</td><td align="left" valign="top">4 (80)</td></tr><tr><td align="left" valign="top" colspan="6">Number of figures</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">44</td><td align="left" valign="top">19 (43)</td><td align="left" valign="top">18 (41)</td><td align="left" valign="top">28 (64)</td><td align="left" valign="top">26 (59)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">42</td><td align="left" valign="top">18 (43)</td><td align="left" valign="top">19 (45)</td><td align="left" valign="top">32 
(76)</td><td align="left" valign="top">27 (64)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">11</td><td align="left" valign="top">7 (64)</td><td align="left" valign="top">8 (73)</td><td align="left" valign="top">10 (91)</td><td align="left" valign="top">9 (82)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>4</td><td align="left" valign="top">6</td><td align="left" valign="top">3 (50)</td><td align="left" valign="top">1 (17)</td><td align="left" valign="top">1 (17)</td><td align="left" valign="top">1 (17)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>6</td><td align="left" valign="top">1</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>Data are presented as the number of correct answers. Values in parentheses indicate the percentage of correct responses.</p></fn><fn id="table5fn2"><p><sup>b</sup>XP: X-ray photograph.</p></fn><fn id="table5fn3"><p><sup>c</sup>CT: computed tomography.</p></fn><fn id="table5fn4"><p><sup>d</sup>MRI: magnetic resonance imaging.</p></fn><fn id="table5fn5"><p><sup>e</sup>PET: positron emission tomography.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>GPT-4o significantly outperformed GPT-4 across all evaluated categories. However, neither GPT-4 nor GPT-4o achieved examinee-level accuracy for any question (<xref ref-type="table" rid="table1">Table 1</xref>). 
The correct-answer rates for text-only questions were higher than those for text-with-image questions for both models. Moreover, the inclusion of image inputs did not lead to a significant improvement in performance on text-with-image questions (<xref ref-type="table" rid="table3">Table 3</xref>). Performance varied by image type, with particularly low correct-answer rates for questions involving intraoperative and pathological images. By contrast, the correct-answer rates were relatively higher for radiological images such as CT and MRI images.</p><p>The results showed that there was no significant difference in the percentage of correct responses between GPT-4 and GPT-4o for thoracic surgery, emergency and anesthesia, and pediatric surgery. When comparing GPT-4o to the results of examinees, both demonstrated similarly low correct-answer rates for questions related to thoracic and gastrointestinal surgery. There was no consistent pattern evident in terms of which surgical category exhibited the highest correct response rate.</p></sec><sec id="s4-2"><title>Additional Analysis by Problem Type</title><p>In terms of category-specific differences, the percentage of correct responses for both GPT-4 and GPT-4o did not differ significantly for thoracic, emergency and anesthesia, and pediatric surgery. A comparison of GPT-4o and examinee correct-answer rates demonstrated similarly low correct response rates for questions related to the thoracic and gastrointestinal surgeries. However, no consistent trend was evident with the highest percentage of correct responses. The correct-answer rate was low for thoracic surgery because questions in this field comprised a high proportion of text-with-image questions. By contrast, it was high for breast and endocrine surgery and emergency and anesthesiology, which comprised fewer text-with-image questions. 
However, the opposite trend was evident for cardiovascular surgery, where no consistent trend was evident in the correct-answer rates for text-with-image questions. This result could be attributed to the fact that many of the questions could be answered correctly without image recognition, or that many of the images were easy to understand even when image recognition was required.</p></sec><sec id="s4-3"><title>Responses to Intraoperative Imaging Problems</title><p>An additional study on the correct-answer rate was conducted to assess the differences in the GPT model responses based on the imaging modality. Although this is only a hypothesis owing to the small sample size&#x2014;as 1 question could contain several types of images&#x2014;the correct-answer rate for questions involving intraoperative findings was more than 20% lower than the overall GPT-4 correct-answer rate, and the rates for intraoperative and pathological images were more than 20% lower than the overall GPT-4o rate, compared with those for radiological modalities such as CT and MRI (<xref ref-type="table" rid="table5">Table 5</xref>). Only questions involving intraoperative findings had a correct-answer rate after image input that was more than 20% lower than the average for both GPT-4 and GPT-4o, which could be considered to be a GPT image-recognition weakness. Additionally, the responses to intraoperative images were evaluated individually (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Although liver resection was identified in intraoperative liver-resection images, the actual resection was misidentified. 
Moreover, in images of mediastinal tumors, the tumor and recurrent nerve were either not mentioned or could not be identified, whereas from the intraoperative inguinal-hernia images, the inferior epigastric artery and vein were misidentified as the vas deferens.</p></sec><sec id="s4-4"><title>Implications of Findings</title><p>From this research, it is evident that GPT-4o significantly outperformed GPT-4 across all evaluated categories, indicating that OpenAI&#x2019;s model development is progressing steadily. However, despite these improvements, neither GPT-4 nor GPT-4o achieved the correct-answer rates of actual examinees. This highlights that current LLMs, while advancing rapidly, still fall short of the reliability required for high-stakes clinical decision-making or licensing-level assessments. In particular, GPT-4o exhibited lower accuracy on text-with-image questions compared to text-only questions, and the inclusion of image inputs did not significantly improve its performance. This reflects an ongoing limitation in the image recognition capabilities of LLMs, especially for complex visuals such as intraoperative and pathological images, and suggests that caution is warranted when considering these models for clinical use.</p><p>However, this study was conducted without pretuning, and the accuracy of LLMs could be potentially improved by tuning them in a field-specific manner [<xref ref-type="bibr" rid="ref7">7</xref>]. Pretuning them on data from medical textbooks and previous examinations could enhance the relevance and accuracy of their responses. Pretraining has the potential to improve model performance, but the process can be complicated and is not supported by some models. 
The results of the models that did not undergo pretraining can be said to be results that can be applied to general readers and various models.</p><p>A &#x201C;Socratic tutor mode&#x201D; educational application of GPT-4o has been reported, wherein the complexity of medical questions can be changed during the conversation based on the learner&#x2019;s understanding [<xref ref-type="bibr" rid="ref7">7</xref>]. In this study, GPT-4o provided a high percentage of correct answers to text-only questions, which could be used for learning guideline content and obtaining general surgical knowledge, where the answers are clear to residents and majors studying surgery. Additionally, if its image-recognition capabilities improve in the future and it becomes possible to diagnose intraoperative images specific to surgery with a high degree of confidence, it could become a useful indicator when making decisions in daily clinical practice.</p></sec><sec id="s4-5"><title>Comparison to the Literature</title><p>Previous studies have shown that the GPT-4 correct-answer rate could be improved for USMLE by using images to complement the text input [<xref ref-type="bibr" rid="ref11">11</xref>]. However, similar to our study, researchers have reported that inputting image information did not increase the percentage of correct answers [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref22">22</xref>]. Additionally, a previous study has suggested that GPT-4 prioritizes verbal information over images [<xref ref-type="bibr" rid="ref20">20</xref>].</p><p>Previous reports [<xref ref-type="bibr" rid="ref23">23</xref>] have shown that ChatGPT is able to provide more accurate responses when given English-language input than non-English-language input, but it has also been shown that the correct-answer rate of GPT-4o has improved when given Japanese-language inputs [<xref ref-type="bibr" rid="ref12">12</xref>]. 
Moreover, the results of this study reflect this fact, which is consistent with the findings of a previous study on radiology [<xref ref-type="bibr" rid="ref14">14</xref>], highlighting GPT-4o&#x2019;s enhanced reasoning and better responsiveness to Japanese inputs, thereby successfully addressing the limitations of earlier variants.</p></sec><sec id="s4-6"><title>Strengths and Limitations</title><p>It should be noted that, unlike previous studies which relied on recalled questions or researcher-derived answers [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>], commercially available past examinations were used in this study to ensure a more accurate and reliable assessment, making it a strong point of our research. Nonetheless, this study had several limitations. First, the LLMs were only asked each question once; however, LLMs are generative models, often referred to as &#x201C;stochastic parrots&#x201D; [<xref ref-type="bibr" rid="ref24">24</xref>]. This is because they generate answers based on the probability of selecting the most appropriate word from the training data. Consequently, different answers can be returned for the same question with a certain probability when asked multiple times [<xref ref-type="bibr" rid="ref25">25</xref>]. To address this problem, it is necessary to ask the same question multiple times and assess the degree to which the answers fluctuate. Second, ChatGPT responses can be interspersed with answers based on false evidence or factual errors, commonly referred to as &#x201C;hallucinations,&#x201D; which is the phenomenon of asserting incorrect content as if it were correct [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. Even though such responses can be determined to be false by specialists, they can confuse doctors during training. 
This phenomenon occurs even as the correct-answer rate of the model improves&#x2014;that is, the greater the confidence in responses, the more difficult it can become to identify incorrect information. Third, LLMs&#x2014;including ChatGPT&#x2014;are updated periodically, which could alter their correct-answer rate or incur unexpected pretraining as the questions are entered. Consequently, the reproducibility of test results in future studies remains uncertain. Finally, we used a relatively small number of questions, which could have resulted in an inadequate analysis, particularly for the category-specific correct-answer rate. The differences in the correct-answer rate between GPT-4o and GPT-4 were substantial in the fields of cardiovascular surgery, digestive surgery, and breast and endocrine surgery; however, differences in other fields were minimal, which could be attributed to the limited sample size. If the JSBE is able to obtain more high-quality problems as it continues to hold more events, it could be possible to evaluate models with an even greater correct-answer rate.</p><p>In conclusion, GPT-4o outperformed GPT-4 for the JSBE. 
Although there is still room for improvement in image recognition and clinical applications, which should be approached with caution, the results suggest that improved models and pretraining could provide LLMs with more accurate medical knowledge and enhance their clinical judgment, which could be useful in enhanced learning for surgeons.</p></sec></sec></body><back><ack><p>The authors thank the Japan Surgical Society for granting permission to use the official Japan Surgical Board Examination&#x2019;s well-thought-out and high-quality questions for this study.</p></ack><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb3">CT</term><def><p>computed tomography</p></def></def-item><def-item><term id="abb4">JSBE</term><def><p>Japan Surgical Board Examination</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb6">MRI</term><def><p>magnetic resonance imaging</p></def></def-item><def-item><term id="abb7">USMLE</term><def><p>United States Medical Licensing Examination</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Debas</surname><given-names>HT</given-names> </name><name name-style="western"><surname>Bass</surname><given-names>BL</given-names> </name><name name-style="western"><surname>Brennan</surname><given-names>MF</given-names> </name><etal/></person-group><article-title>American Surgical Association Blue Ribbon Committee Report on Surgical Education: 2004</article-title><source>Ann 
Surg</source><year>2005</year><month>01</month><volume>241</volume><issue>1</issue><fpage>1</fpage><lpage>8</lpage><pub-id pub-id-type="doi">10.1097/01.sla.0000150066.83563.52</pub-id><pub-id pub-id-type="medline">15621984</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="web"><article-title>Overview of statistics on doctors, dentists [Article in Japanese]</article-title><source>Ministry of Health Labour and Welfare</source><year>2024</year><access-date>2025-07-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.mhlw.go.jp/toukei/saikin/hw/ishi/22/index.html">https://www.mhlw.go.jp/toukei/saikin/hw/ishi/22/index.html</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="web"><article-title>Work style reform for doctors [Article in Japanese]</article-title><source>Ministry of Health, Labour and Welfare</source><year>2024</year><access-date>2025-07-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.mhlw.go.jp/content/10800000/001129457.pdf">https://www.mhlw.go.jp/content/10800000/001129457.pdf</ext-link></comment></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Varas</surname><given-names>J</given-names> </name><name name-style="western"><surname>Coronel</surname><given-names>BV</given-names> </name><name name-style="western"><surname>Villagr&#x00E1;n</surname><given-names>I</given-names> </name><etal/></person-group><article-title>Innovations in surgical training: exploring the role of artificial intelligence and large language models (LLM)</article-title><source>Rev Col Bras Cir</source><year>2023</year><volume>50</volume><fpage>e20233605</fpage><pub-id pub-id-type="doi">10.1590/0100-6991e-20233605-en</pub-id><pub-id pub-id-type="medline">37646729</pub-id></nlm-citation></ref><ref 
id="ref5"><label>5</label><nlm-citation citation-type="web"><article-title>ChatGPT</article-title><source>Open AI</source><year>2024</year><access-date>2025-07-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/chatgpt/">https://openai.com/chatgpt/</ext-link></comment></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>Open AI</collab><name name-style="western"><surname>Achiam</surname><given-names>J</given-names> </name><name name-style="western"><surname>Adler</surname><given-names>S</given-names> </name><etal/></person-group><article-title>GPT-4 technical report</article-title><source>arXiv. Preprint posted online on Mar 4, 2024</source><access-date>2025-07-25</access-date><comment>Preprint posted online on  Mar 4, 2024</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/pdf/2303.08774">https://arxiv.org/pdf/2303.08774</ext-link></comment></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSJ</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name></person-group><article-title>Large language models in medicine</article-title><source>Nat Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id 
pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kung</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Cheatham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Medenilla</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title><source>PLOS Digit Health</source><year>2023</year><month>02</month><volume>2</volume><issue>2</issue><fpage>e0000198</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id><pub-id pub-id-type="medline">36812645</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tran</surname><given-names>CG</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Sherman</surname><given-names>SK</given-names> </name><name name-style="western"><surname>De Andrade</surname><given-names>JP</given-names> </name></person-group><article-title>Performance of ChatGPT on American Board of Surgery In-Training Examination preparation questions</article-title><source>J Surg Res</source><year>2024</year><month>07</month><volume>299</volume><fpage>329</fpage><lpage>335</lpage><pub-id pub-id-type="doi">10.1016/j.jss.2024.04.060</pub-id><pub-id pub-id-type="medline">38788470</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oh</surname><given-names>N</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>GS</given-names> </name><name 
name-style="western"><surname>Lee</surname><given-names>WY</given-names> </name></person-group><article-title>ChatGPT goes to the operating room: evaluating GPT-4 performance and its potential in surgical education and training in the era of large language models</article-title><source>Ann Surg Treat Res</source><year>2023</year><month>05</month><volume>104</volume><issue>5</issue><fpage>269</fpage><lpage>273</lpage><pub-id pub-id-type="doi">10.4174/astr.2023.104.5.269</pub-id><pub-id pub-id-type="medline">37179699</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yao</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Tasmin</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Performance of multimodal GPT-4V on USMLE with image: potential for imaging diagnostic support with explanations</article-title><source>Radiology and Imaging</source><comment>Preprint posted online on  Nov 5, 2023</comment><pub-id pub-id-type="doi">10.1101/2023.10.26.23297629</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="web"><article-title>Hello GPT-4o</article-title><source>OpenAI</source><year>2024</year><access-date>2025-07-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/index/hello-gpt-4o/">https://openai.com/index/hello-gpt-4o/</ext-link></comment></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>N</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>N</given-names> </name><name name-style="western"><surname>Shao</surname><given-names>Q</given-names> </name><name 
name-style="western"><surname>Cheng</surname><given-names>K</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>H</given-names> </name></person-group><article-title>OpenAI&#x2019;s GPT-4o in surgical oncology: revolutionary advances in generative artificial intelligence</article-title><source>Eur J Cancer</source><year>2024</year><month>07</month><volume>206</volume><fpage>114132</fpage><pub-id pub-id-type="doi">10.1016/j.ejca.2024.114132</pub-id><pub-id pub-id-type="medline">38810316</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oura</surname><given-names>T</given-names> </name><name name-style="western"><surname>Tatekawa</surname><given-names>H</given-names> </name><name name-style="western"><surname>Horiuchi</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Diagnostic accuracy of vision-language models on Japanese diagnostic radiology, nuclear medicine, and interventional radiology specialty board examinations</article-title><source>Jpn J Radiol</source><year>2024</year><month>12</month><volume>42</volume><issue>12</issue><fpage>1392</fpage><lpage>1398</lpage><pub-id pub-id-type="doi">10.1007/s11604-024-01633-0</pub-id><pub-id pub-id-type="medline">39031270</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hayden</surname><given-names>N</given-names> </name><name name-style="western"><surname>Gilbert</surname><given-names>S</given-names> </name><name name-style="western"><surname>Poisson</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Griffith</surname><given-names>B</given-names> </name><name name-style="western"><surname>Klochko</surname><given-names>C</given-names> 
</name></person-group><article-title>Performance of GPT-4 with Vision on text- and image-based ACR diagnostic radiology in-training examination questions</article-title><source>Radiology</source><year>2024</year><month>09</month><volume>312</volume><issue>3</issue><fpage>e240153</fpage><pub-id pub-id-type="doi">10.1148/radiol.240153</pub-id><pub-id pub-id-type="medline">39225605</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>CL</given-names> </name><name name-style="western"><surname>Ho</surname><given-names>CT</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>TC</given-names> </name></person-group><article-title>Custom GPTs enhancing performance and evidence compared with GPT-3.5, GPT-4, and GPT-4o? A study on the Emergency Medicine Specialist Examination</article-title><source>Healthcare (Basel)</source><year>2024</year><month>08</month><day>30</day><volume>12</volume><issue>17</issue><fpage>1726</fpage><pub-id pub-id-type="doi">10.3390/healthcare12171726</pub-id><pub-id pub-id-type="medline">39273750</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>OpenAI</collab></person-group><source>Models</source><year>2024</year><access-date>2025-07-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://platform.openai.com/docs/models/models">https://platform.openai.com/docs/models/models</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nakajima</surname><given-names>N</given-names> </name><name name-style="western"><surname>Fujimori</surname><given-names>T</given-names> </name><name 
name-style="western"><surname>Furuya</surname><given-names>M</given-names> </name><etal/></person-group><article-title>A comparison between GPT-3.5, GPT-4, and GPT-4V: can the large language model (ChatGPT) pass the Japanese Board of Orthopaedic Surgery Examination?</article-title><source>Cureus</source><year>2024</year><month>03</month><volume>16</volume><issue>3</issue><fpage>e56402</fpage><pub-id pub-id-type="doi">10.7759/cureus.56402</pub-id><pub-id pub-id-type="medline">38633935</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Takagi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Koda</surname><given-names>M</given-names> </name><name name-style="western"><surname>Watari</surname><given-names>T</given-names> </name></person-group><article-title>The performance of ChatGPT-4V in interpreting images and tables in the Japanese Medical Licensing Exam</article-title><source>JMIR Med Educ</source><year>2024</year><month>05</month><day>23</day><volume>10</volume><fpage>e54283</fpage><pub-id pub-id-type="doi">10.2196/54283</pub-id><pub-id pub-id-type="medline">38787024</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hirano</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Hanaoka</surname><given-names>S</given-names> </name><name name-style="western"><surname>Nakao</surname><given-names>T</given-names> </name><etal/></person-group><article-title>GPT-4 Turbo with vision fails to outperform text-only GPT-4 Turbo in the Japan Diagnostic Radiology Board Examination</article-title><source>Jpn J Radiol</source><year>2024</year><month>08</month><volume>42</volume><issue>8</issue><fpage>918</fpage><lpage>926</lpage><pub-id 
pub-id-type="doi">10.1007/s11604-024-01561-z</pub-id><pub-id pub-id-type="medline">38733472</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ishida</surname><given-names>K</given-names> </name><name name-style="western"><surname>Arisaka</surname><given-names>N</given-names> </name><name name-style="western"><surname>Fujii</surname><given-names>K</given-names> </name></person-group><article-title>Analysis of responses of GPT-4 V to the Japanese National Clinical Engineer Licensing Examination</article-title><source>J Med Syst</source><year>2024</year><month>09</month><day>11</day><volume>48</volume><issue>1</issue><fpage>83</fpage><pub-id pub-id-type="doi">10.1007/s10916-024-02103-w</pub-id><pub-id pub-id-type="medline">39259341</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sawamura</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kohiyama</surname><given-names>K</given-names> </name><name name-style="western"><surname>Takenaka</surname><given-names>T</given-names> </name><name name-style="western"><surname>Sera</surname><given-names>T</given-names> </name><name name-style="western"><surname>Inoue</surname><given-names>T</given-names> </name><name name-style="western"><surname>Nagai</surname><given-names>T</given-names> </name></person-group><article-title>Performance of ChatGPT 4.0 on Japan&#x2019;s National Physical Therapist Examination: a comprehensive analysis of text and visual question handling</article-title><source>Cureus</source><year>2024</year><month>08</month><volume>16</volume><issue>8</issue><fpage>e67347</fpage><pub-id pub-id-type="doi">10.7759/cureus.67347</pub-id><pub-id pub-id-type="medline">39310431</pub-id></nlm-citation></ref><ref 
id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Harigai</surname><given-names>A</given-names> </name><name name-style="western"><surname>Toyama</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Nagano</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Response accuracy of GPT-4 across languages: insights from an expert-level diagnostic radiology examination in Japan</article-title><source>Jpn J Radiol</source><year>2025</year><month>02</month><volume>43</volume><issue>2</issue><fpage>319</fpage><lpage>329</lpage><pub-id pub-id-type="doi">10.1007/s11604-024-01673-6</pub-id><pub-id pub-id-type="medline">39466356</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Bender</surname><given-names>EM</given-names> </name><name name-style="western"><surname>Gebru</surname><given-names>T</given-names> </name><name name-style="western"><surname>McMillan-Major</surname><given-names>A</given-names> </name><name name-style="western"><surname>Shmitchell</surname><given-names>S</given-names> </name></person-group><article-title>On the dangers of stochastic parrots: can language models be too big?</article-title><conf-name>Proceedings of the 2021 ACM conference on fairness, accountability, and transparency</conf-name><conf-date>Mar 1, 2021</conf-date><fpage>610</fpage><lpage>623</lpage><pub-id pub-id-type="doi">10.1145/3442188.3445922</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Krishna</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bhambra</surname><given-names>N</given-names> </name><name 
name-style="western"><surname>Bleakney</surname><given-names>R</given-names> </name><name name-style="western"><surname>Bhayana</surname><given-names>R</given-names> </name></person-group><article-title>Evaluation of reliability, repeatability, robustness, and confidence of GPT-3.5 and GPT-4 on a radiology board&#x2013;style examination</article-title><source>Radiology</source><year>2024</year><month>05</month><volume>311</volume><issue>2</issue><fpage>e232715</fpage><pub-id pub-id-type="doi">10.1148/radiol.232715</pub-id><pub-id pub-id-type="medline">38771184</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alkaissi</surname><given-names>H</given-names> </name><name name-style="western"><surname>McFarlane</surname><given-names>SI</given-names> </name></person-group><article-title>Artificial hallucinations in ChatGPT: implications in scientific writing</article-title><source>Cureus</source><year>2023</year><month>02</month><volume>15</volume><issue>2</issue><fpage>e35179</fpage><pub-id pub-id-type="doi">10.7759/cureus.35179</pub-id><pub-id pub-id-type="medline">36811129</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nakaura</surname><given-names>T</given-names> </name><name name-style="western"><surname>Ito</surname><given-names>R</given-names> </name><name name-style="western"><surname>Ueda</surname><given-names>D</given-names> </name><etal/></person-group><article-title>The impact of large language models on radiology: a guide for radiologists on the latest innovations in AI</article-title><source>Jpn J Radiol</source><year>2024</year><month>07</month><volume>42</volume><issue>7</issue><fpage>685</fpage><lpage>696</lpage><pub-id pub-id-type="doi">10.1007/s11604-024-01552-0</pub-id><pub-id 
pub-id-type="medline">38551772</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>LLM's response to intraoperative findings.</p><media xlink:href="mededu_v11i1e69313_app1.docx" xlink:title="DOCX File, 14 KB"/></supplementary-material></app-group></back></article>