@Article{info:doi/10.2196/54393,
  author   = "Nakao, Takahiro and Miki, Soichiro and Nakamura, Yuta and Kikuchi, Tomohiro and Nomura, Yukihiro and Hanaoka, Shouhei and Yoshikawa, Takeharu and Abe, Osamu",
  title    = "Capability of GPT-4V(ision) in the Japanese National Medical Licensing Examination: Evaluation Study",
  journal  = "JMIR Med Educ",
  year     = "2024",
  month    = "Mar",
  day      = "12",
  volume   = "10",
  pages    = "e54393",
  keywords = "AI; artificial intelligence; LLM; large language model; language model; language models; ChatGPT; GPT-4; GPT-4V; generative pretrained transformer; image; images; imaging; response; responses; exam; examination; exams; examinations; answer; answers; NLP; natural language processing; chatbot; chatbots; conversational agent; conversational agents; medical education",
  abstract = "Background: Previous research applying large language models (LLMs) to medicine was focused on text-based information. Recently, multimodal variants of LLMs acquired the capability of recognizing images. Objective: We aim to evaluate the image recognition capability of generative pretrained transformer (GPT)-4V, a recent multimodal LLM developed by OpenAI, in the medical field by testing how visual information affects its performance to answer questions in the 117th Japanese National Medical Licensing Examination. Methods: We focused on 108 questions that had 1 or more images as part of a question and presented GPT-4V with the same questions under two conditions: (1) with both the question text and associated images and (2) with the question text only. We then compared the difference in accuracy between the 2 conditions using the exact McNemar test. Results: Among the 108 questions with images, GPT-4V's accuracy was 68{\%} (73/108) when presented with images and 72{\%} (78/108) when presented without images (P=.36). For the 2 question categories, clinical and general, the accuracies with and those without images were 71{\%} (70/98) versus 78{\%} (76/98; P=.21) and 30{\%} (3/10) versus 20{\%} (2/10; P≥.99), respectively. Conclusions: The additional information from the images did not significantly improve the performance of GPT-4V in the Japanese National Medical Licensing Examination.",
  issn     = "2369-3762",
  doi      = "10.2196/54393",
  url      = "https://mededu.jmir.org/2024/1/e54393",
  url      = "https://doi.org/10.2196/54393",
  url      = "http://www.ncbi.nlm.nih.gov/pubmed/38470459"
}