<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JME</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id>
      <journal-title>JMIR Medical Education</journal-title>
      <issn pub-type="epub">2369-3762</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v9i1e48002</article-id>
      <article-id pub-id-type="pmid">37384388</article-id>
      <article-id pub-id-type="doi">10.2196/48002</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Performance of GPT-3.5 and GPT-4 on the Japanese Medical Licensing Examination: Comparison Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Venkatesh</surname>
            <given-names>Kaushik</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Kamel Boulos</surname>
            <given-names>Maged N.</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Pan</surname>
            <given-names>Yifeng</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Biswas</surname>
            <given-names>Som</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Sallam</surname>
            <given-names>Malik</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Gao</surname>
            <given-names>Aijing</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Thirunavukarasu</surname>
            <given-names>Arun</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Takagi</surname>
            <given-names>Soshi</given-names>
          </name>
          <degrees>BA</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0004-3211-1626</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Watari</surname>
            <given-names>Takashi</given-names>
          </name>
          <degrees>MD, MHQS, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <address>
            <institution>General Medicine Center</institution>
            <institution>Shimane University Hospital</institution>
            <addr-line>89-1, Enya</addr-line>
            <addr-line>Izumo, 693-8501</addr-line>
            <country>Japan</country>
            <phone>81 0853 20 2217</phone>
            <fax>81 0853 20 2247</fax>
            <email>wataritari@gmail.com</email>
          </address>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9322-8455</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Erabi</surname>
            <given-names>Ayano</given-names>
          </name>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0003-1871-3543</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Sakaguchi</surname>
            <given-names>Kota</given-names>
          </name>
          <degrees>MD, MBA</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5169-6613</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Faculty of Medicine</institution>
        <institution>Shimane University</institution>
        <addr-line>Izumo</addr-line>
        <country>Japan</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>General Medicine Center</institution>
        <institution>Shimane University Hospital</institution>
        <addr-line>Izumo</addr-line>
        <country>Japan</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Internal Medicine</institution>
        <institution>University of Michigan Medical School</institution>
        <addr-line>Ann Arbor, MI</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Medicine Service</institution>
        <institution>VA Ann Arbor Healthcare System</institution>
        <addr-line>Ann Arbor, MI</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Takashi Watari <email>wataritari@gmail.com</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2023</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>29</day>
        <month>6</month>
        <year>2023</year>
      </pub-date>
      <volume>9</volume>
      <elocation-id>e48002</elocation-id>
      <history>
        <date date-type="received">
          <day>7</day>
          <month>4</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>2</day>
          <month>5</month>
          <year>2023</year>
        </date>
        <date date-type="rev-recd">
          <day>11</day>
          <month>5</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>14</day>
          <month>6</month>
          <year>2023</year>
        </date>
      </history>
      <copyright-statement>©Soshi Takagi, Takashi Watari, Ayano Erabi, Kota Sakaguchi. Originally published in JMIR Medical Education (https://mededu.jmir.org), 29.06.2023.</copyright-statement>
      <copyright-year>2023</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on https://mededu.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://mededu.jmir.org/2023/1/e48002" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>The competence of ChatGPT (Chat Generative Pre-trained Transformer) in non-English languages is not well studied.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study compared the performances of GPT-3.5 (Generative Pre-trained Transformer) and GPT-4 on the Japanese Medical Licensing Examination (JMLE) to evaluate the reliability of these models for clinical reasoning and medical knowledge in non-English languages.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>This study used the default mode of ChatGPT, which is based on GPT-3.5; the GPT-4 model of ChatGPT Plus; and the 117th JMLE in 2023. A total of 254 questions were included in the final analysis, which were categorized into 3 types, namely general, clinical, and clinical sentence questions.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The results indicated that GPT-4 outperformed GPT-3.5 in terms of accuracy, particularly for general, clinical, and clinical sentence questions. GPT-4 also performed better on difficult questions and specific disease questions. Furthermore, GPT-4 achieved the passing criteria for the JMLE, indicating its reliability for clinical reasoning and medical knowledge in non-English languages.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>GPT-4 could become a valuable tool for medical education and clinical support in non–English-speaking regions, such as Japan.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>ChatGPT</kwd>
        <kwd>Chat Generative Pre-trained Transformer</kwd>
        <kwd>GPT-4</kwd>
        <kwd>Generative Pre-trained Transformer 4</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>AI</kwd>
        <kwd>medical education</kwd>
        <kwd>Japanese Medical Licensing Examination</kwd>
        <kwd>medical licensing</kwd>
        <kwd>clinical support</kwd>
        <kwd>learning model</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>ChatGPT (Chat Generative Pre-trained Transformer; OpenAI) is a state-of-the-art large language model (LLM) that can simulate human-like conversations based on user input [<xref ref-type="bibr" rid="ref1">1</xref>]. As a continually evolving model in natural language processing (NLP), ChatGPT has the potential to be a valuable tool for clinical support and medical education, as already explored by Microsoft and OpenAI [<xref ref-type="bibr" rid="ref2">2</xref>]. Studies have revealed that ChatGPT provided highly accurate answers to the US Certified Public Accountant exam and the US bar exam [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. In the medical domain, ChatGPT achieved the passing criteria for the US Medical Licensing Examination (USMLE) [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. Although challenges persist in applying ChatGPT to clinical medicine [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref9">9</xref>], it has demonstrated sufficient performance in English examinations [<xref ref-type="bibr" rid="ref10">10</xref>].</p>
      <p>However, in a previous study, ChatGPT, based on GPT-3.5 (Generative Pre-trained Transformer), performed poorly for 77 out of 79 medical students on a South Korean parasitology examination, which resulted in questions about its ability to provide medically accurate responses in non-English languages [<xref ref-type="bibr" rid="ref11">11</xref>]. On March 14, 2023, OpenAI unveiled GPT-4, the latest version of its LLM [<xref ref-type="bibr" rid="ref12">12</xref>]. Compared with its predecessor GPT-3.5, GPT-4 is “more reliable, creative, and able to handle many more nuanced instructions” [<xref ref-type="bibr" rid="ref12">12</xref>]. OpenAI announced that GPT-4 could perform well in academic and specialized fields [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>], and its performance in languages other than English was enhanced. However, OpenAI has yet to verify the performance of GPT-4 in the medical field in Japanese. When considering the application of GPT-4 to medical education and clinical practice in non–English-speaking regions, confirming its reliability for clinical reasoning and medical knowledge in non-English languages is critical [<xref ref-type="bibr" rid="ref14">14</xref>].</p>
      <p>Therefore, this study compared the accuracy of GPT-3.5 and GPT-4 on the Japanese Medical Licensing Examination (JMLE) [<xref ref-type="bibr" rid="ref15">15</xref>]. Furthermore, the accuracy of each model was compared for various question types and difficulty levels.</p>
    </sec>
    <sec sec-type="method">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p>We used the default mode of ChatGPT, which is based on GPT-3.5, and the GPT-4 model of ChatGPT Plus. The latest JMLE, number 117, conducted on February 4 and 5, 2023, was also used for this study. The JMLE comprises 400 questions, which were classified into 3 categories: essential knowledge questions, which test the knowledge and ethics required of a doctor; general clinical questions, which cover numerous diseases; and specific disease questions, which test the knowledge of each disease [<xref ref-type="bibr" rid="ref15">15</xref>]. Furthermore, we categorized those questions into 3 types: general questions that tested knowledge of a specific topic, clinical questions that required case presentation and clinical reasoning, and clinical sentence questions with several questions in a single case. The passing criteria of the 117th JMLE are as follows: a minimum score of 80% on the essential knowledge questions and 74.6% on the remaining questions [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. The exclusion criteria included questions for which the Ministry of Health, Labour and Welfare (MHLW) announced as being excluded (n=5), as well as questions containing tables (n=7), images (n=125), and underlining (n=9), which are not recognized by ChatGPT. In total, 254 questions were used in the final analysis.</p>
        <p>Questions and their multiple-choice answers from the JMLE were used in their original Japanese form, as was the official national examination rubric. Instructions for using ChatGPT were also provided in Japanese. A typical rubric is as follows:</p>
        <disp-quote>
          <p>We will present questions for the Japanese National Medical Examination. There will be five options from a to e, and you must choose the appropriate option for the question. If there is no specific limit on the number of options to choose, please select one option only.</p>
          <attrib><xref ref-type="bibr" rid="ref15">15</xref></attrib>
        </disp-quote>
        <p>The definition of “correct” answers to the questions asked to GPT-3.5 and GPT-4 was based on the answers to the JMLE, which were published on the website of the MHLW [<xref ref-type="bibr" rid="ref15">15</xref>]. Only the answers that were clearly correct and followed the instructions provided in the question text were considered “correct.” Ambiguous answers, evident mistakes, and responses with an excessive number of candidates were considered incorrect.</p>
        <p>We evaluated the difficulty level of each question and categorized them as hard (n=60), normal (n=112), and easy (n=82) based on the correct response rate published by medu4, a preparatory school for the JMLE [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. Questions with a correct response rate of 79.9% or below were classified as hard, those with a rate between 80% and 96.9% were classified as normal, and those with a rate of 97% or higher were classified as easy.</p>
        <p>Finally, we simultaneously collected responses from both GPT-3.5 and GPT-4 between March 16 and 18, 2023, and scored them using the definition of correct answers. <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> shows examples of the JMLE questions inputted into both models.</p>
        <p>Standard descriptive statistics were used to calculate the numbers, proportions, and means for each data set. The McNemar test was used to compare correct response rates. All analyses were performed using the Stata statistical software (StataCorp LLC) [<xref ref-type="bibr" rid="ref18">18</xref>]. All tests were 2-tailed, and statistical significance was set at <italic>P</italic>&#60;.05.</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>This study only used information that was already published on the internet and did not involve human subjects; rather, an analysis of the JMLE was performed. Therefore, approval by the Institutional Review Board of Shimane University was not required.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>A total of 254 questions from the 117th JMLE were used in the experiment. <xref ref-type="table" rid="table1">Table 1</xref> presents the percentage of correct responses to essential knowledge questions and other questions on the JMLE. Overall, GPT-4 significantly outperformed GPT-3.5 by 29.1% (<italic>P</italic>&#60;.001). In terms of the correct response rate for individual questions, the examinees’ rate for essential knowledge questions was 89.2% compared to 87.2% for GPT-4. Notably, this represents a considerable 32.1% improvement over GPT-3.5, which had a 55.1% correct response rate. Similarly, a 29.5% increase was observed for general clinical questions, and a 25.4% increase was observed for specific disease questions. In all cases, GPT-4 achieved the passing rates for the JMLE. However, none of these rates exceeded the total percentage of correct answers by examinees.</p>
      <p><xref ref-type="table" rid="table2">Table 2</xref> presents the correct response rates according to the question type, with GPT-3.5 achieving correct response rates of approximately 50%—none of which are passing scores. However, GPT-4 achieved a 27.6% increase for general questions (<italic>P</italic>&#60;.001) and a 29.6% increase for clinical questions (<italic>P</italic>&#60;.001) compared to GPT-3.5. Notably, a 36.3% increase was observed in the number of correct responses to clinical sentence questions, with a significant improvement in all question types (all <italic>P</italic>&#60;.05).</p>
      <p><xref ref-type="table" rid="table3">Table 3</xref> presents the correct response rates by difficulty level. GPT-3.5 only achieved a 69.5% correct response rate for easy-level questions, 46.2% for normal-level questions, and 33.3% for hard-level questions. None of these values were close to the passing criteria. However, GPT-4 exhibited improved performance, with a 40% increase for hard-level questions (<italic>P</italic>&#60;.001), a 31.5% increase for normal-level questions (<italic>P</italic>&#60;.001), and an 18.3% increase for easy-level questions (<italic>P</italic>&#60;.001).</p>
      <p>Finally, GPT-4 significantly outperformed GPT-3.5 in all formats in terms of correct response rates (all <italic>P</italic>&#60;.05). In particular, for hard-level questions, the correct response rate of GPT-4 was 17% higher than the examinees’ average correct response rate.</p>
      <table-wrap position="float" id="table1">
        <label>Table 1</label>
        <caption>
          <p>Comparison of GPT-3.5 (Generative Pre-trained Transformer) and GPT-4 for essential knowledge questions and other questions in the Japanese Medical Licensing Examination (JMLE).</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="200"/>
          <col width="180"/>
          <col width="180"/>
          <col width="180"/>
          <col width="180"/>
          <col width="80"/>
          <thead>
            <tr valign="top">
              <td>Question category</td>
              <td>Question (n=254), n (%)</td>
              <td>Examinee correct response rate<sup>a</sup> (%)</td>
              <td>GPT-3.5 correct response rate (%; 95% CI)</td>
              <td>GPT-4 correct response rate (%; 95% CI)</td>
              <td><italic>P</italic> value</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>All questions</td>
              <td>254 (100)</td>
              <td>84.9</td>
              <td>50.8 (44.6-57.0)</td>
              <td>79.9 (75.0-84.9)</td>
              <td>&#60;.001</td>
            </tr>
            <tr valign="top">
              <td>Essential knowledge</td>
              <td>78 (30.7)</td>
              <td>89.2</td>
              <td>55.1 (43.8-66.4)</td>
              <td>87.2 (79.6-94.8)</td>
              <td>&#60;.001</td>
            </tr>
            <tr valign="top">
              <td>General clinical</td>
              <td>105 (41.3)</td>
              <td>83.1</td>
              <td>43.8 (34.2-53.5)</td>
              <td>73.3 (64.7-81.9)</td>
              <td>&#60;.001</td>
            </tr>
            <tr valign="top">
              <td>Specific disease</td>
              <td>71 (28)</td>
              <td>83</td>
              <td>56.3 (44.5-68.2)</td>
              <td>81.7 (72.5-90.9)</td>
              <td>&#60;.001</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table1fn1">
            <p><sup>a</sup>The correct response rates of examinees were obtained from the 117th JMLE, as announced by the Ministry of Health, Labour and Welfare [<xref ref-type="bibr" rid="ref15">15</xref>].</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <table-wrap position="float" id="table2">
        <label>Table 2</label>
        <caption>
          <p>Comparison of GPT-3.5 (Generative Pre-trained Transformer) and GPT-4 by question type in the Japanese Medical Licensing Examination (JMLE).</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="170"/>
          <col width="140"/>
          <col width="200"/>
          <col width="200"/>
          <col width="180"/>
          <col width="110"/>
          <thead>
            <tr valign="top">
              <td>Question type</td>
              <td>Question (n=254), n (%)</td>
              <td>Examinee correct response rate<sup>a</sup> (%)</td>
              <td>GPT-3.5 correct response rate (%; 95% CI)</td>
              <td>GPT-4 correct response rate (%; 95% CI)</td>
              <td><italic>P</italic> value</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>General</td>
              <td>134 (52.7)</td>
              <td>84</td>
              <td>51.5 (42.9-60.0)</td>
              <td>79.1 (72.1-86.1)</td>
              <td>&#60;.001</td>
            </tr>
            <tr valign="top">
              <td>Clinical</td>
              <td>98 (38.6)</td>
              <td>85.3</td>
              <td>50 (39.9-60.1)</td>
              <td>79.6 (71.5-87.7)</td>
              <td>&#60;.001</td>
            </tr>
            <tr valign="top">
              <td>Clinical sentence</td>
              <td>22 (8.7)</td>
              <td>88.8</td>
              <td>50 (27.3-72.7)</td>
              <td>86.3 (70.8-102)</td>
              <td>.005</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table2fn1">
            <p><sup>a</sup>The correct response rates of examinees were obtained from the 117th JMLE, as announced by the Ministry of Health, Labour and Welfare [<xref ref-type="bibr" rid="ref15">15</xref>].</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <table-wrap position="float" id="table3">
        <label>Table 3</label>
        <caption>
          <p>Comparison of GPT-3.5 (Generative Pre-trained Transformer) and GPT-4 in the Japanese Medical Licensing Examination (JMLE) by difficulty level<sup>a</sup>.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="140"/>
          <col width="190"/>
          <col width="190"/>
          <col width="200"/>
          <col width="180"/>
          <col width="100"/>
          <thead>
            <tr valign="top">
              <td>Difficulty level</td>
              <td>Question (n=254), n (%)</td>
              <td>Examinee correct response rate<sup>b</sup> (%)</td>
              <td>GPT-3.5 correct response rate (%; 95% CI)</td>
              <td>GPT-4 correct response rate (%; 95% CI)</td>
              <td><italic>P</italic> value</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Easy</td>
              <td>82 (32.3)</td>
              <td>98.7</td>
              <td>69.5 (59.3-79.7)</td>
              <td>87.8 (80.6-95.0)</td>
              <td>.001</td>
            </tr>
            <tr valign="top">
              <td>Normal</td>
              <td>112 (44.1)</td>
              <td>90.2</td>
              <td>46.2 (37.0-55.8)</td>
              <td>77.7 (69.8-85.5)</td>
              <td>&#60;.001</td>
            </tr>
            <tr valign="top">
              <td>Hard</td>
              <td>60 (23.6)</td>
              <td>56.3</td>
              <td>33.3 (21.1-45.6)</td>
              <td>73.3 (61.8-84.8)</td>
              <td>&#60;.001</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table3fn1">
            <p><sup>a</sup>Difficulty level was classified by the percentage of correct responses provided by medu4 [<xref ref-type="bibr" rid="ref16">16</xref>], Japan’s leading preparatory school for the JMLE: easy, &#8805;97%; normal, 80% to 96.9%; and hard, &#8804;79.9%.</p>
          </fn>
          <fn id="table3fn2">
            <p><sup>b</sup>The correct response rates of examinees were obtained from the 117th JMLE, as announced by the Ministry of Health, Labour and Welfare [<xref ref-type="bibr" rid="ref15">15</xref>].</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>We compared the correct response rates of GPT-3.5 and GPT-4 on the 2023 JMLE. GPT-3.5 did not satisfy the passing criteria, whereas GPT-4 achieved the required scores. Furthermore, GPT-4 demonstrated a significantly improved correct response rates compared with GPT-3.5 across various question types and difficulty levels. The correct response rate of GPT-4 was particularly enhanced for the challenging hard-level questions and surpassed the average correct response rate of actual examinees. Based on these results, we discuss 2 factors that explain the significant improvement in the correct response rates of GPT-4 on the JMLE.</p>
        <p>First, we ascribe this enhancement to the augmented NLP capabilities in non-English languages. A performance disparity between English and other languages in LLMs is ubiquitous in NLP [<xref ref-type="bibr" rid="ref19">19</xref>]. Additionally, GPT-3.5 exhibits a decline in NLP proficiency in non-English languages relative to English [<xref ref-type="bibr" rid="ref20">20</xref>]. Although GPT-3.5 passed the USMLE, an English language–based medical examination, it did not satisfy the passing criteria for the JMLE. In contrast, GPT-4 satisfied the JMLE passing criteria, demonstrating a significant advancement in NLP capabilities, specifically in Japanese. OpenAI assessed GPT-4’s performance in non-English languages, which yielded higher proficiencies in 24 out of 26 languages as compared to the previous models’ proficiency in English [<xref ref-type="bibr" rid="ref13">13</xref>]. Although OpenAI did not disclose the precise methodologies used to obtain these outcomes, the results of this research validate their assertion.</p>
        <p>Second, since improving the information processing capabilities in professional and academic domains is imperative, OpenAI’s development of GPT-4 aimed to handle more intricate and nuanced tasks beyond those encountered in many real-world situations [<xref ref-type="bibr" rid="ref13">13</xref>]. The JMLE is a mandatory exam for certifying medical practitioners in Japan, necessitating a comprehensive knowledge base and strong clinical reasoning skills. GPT-3.5’s performance fell short of the JMLE passing criteria, whereas GPT-4 made significant improvements in professional and academic processing capabilities in a brief time frame. Notably, GPT-4’s superior correct response rate on the challenging hard-level questions, compared with the average correct response rate of general examinees, indicates the potential of language models such as GPT-4 to surpass human performance in highly specialized fields [<xref ref-type="bibr" rid="ref13">13</xref>].</p>
        <p>As the results of this study and several previous studies indicate, LLMs such as ChatGPT have made remarkable progress [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. However, we should be careful when directly applying LLMs in clinical practice and education without critical scrutiny [<xref ref-type="bibr" rid="ref9">9</xref>]. For example, the most essential challenge to address is hallucination. Hallucination is defined as “producing nonsensical or untruthful content concerning certain sources.” OpenAI reported that hallucinations have been mitigated in GPT-4 compared with GPT-3.5 [<xref ref-type="bibr" rid="ref21">21</xref>]. With advancements in LLMs, hallucinations may be further reduced in the future. Future studies should discuss the quality level of LLMs that is required. A previous study suggests that even in English, in a real clinical setting, GPT-3.5 cannot answer questions at a level acceptable to fully qualified primary care physicians [<xref ref-type="bibr" rid="ref10">10</xref>]. However, LLMs such as GPT-4 exhibit considerable potential for use in clinical sites and medical education. For instance, ChatGPT has been used to generate differential diagnoses [<xref ref-type="bibr" rid="ref22">22</xref>]. Furthermore, the potential of ChatGPT for improving the diagnosis and treatment of epilepsy and contributions to public health improvement has been investigated [<xref ref-type="bibr" rid="ref23">23</xref>-<xref ref-type="bibr" rid="ref25">25</xref>].</p>
      </sec>
      <sec>
        <title>Limitation</title>
        <p>This study had several limitations. First, the results reflect the capabilities of ChatGPT as of March 17 and 18, 2023, and different results could be obtained even if the same methods were used. The knowledge and interpretation capabilities of ChatGPT will rapidly improve in the future because of user feedback and deep learning. Second, although GPT-4 is a multimodal artificial intelligence that is inherently capable of inputting images and tables, among other things, this study excluded them for an accurate comparison with GPT-3.5, and only text questions were used. Third, the JMLE has a supplementary assessment that states that if an absolute contraindication answer is selected 2 or more times, the applicant will fail the examination, even if they have achieved the passing scores [<xref ref-type="bibr" rid="ref15">15</xref>]. Because the scores of failed applicants were not published by the MHLW, they were not included in the evaluation. Finally, this investigation focused exclusively on ChatGPT. However, other LLMs such as Google’s Bard (PaLM2) and Large Language Model Meta AI (LLaMA) have advanced considerably and are being improved continuously [<xref ref-type="bibr" rid="ref26">26</xref>]. In the future, the possibility of implementing LLMs other than ChatGPT in the medical field must be considered.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>GPT-4 passed the 117th JMLE, whereas GPT-3.5 failed the examination. This phenomenon revealed GPT-4’s rapid evolution in Japanese language processing. Investigations are necessary to evaluate its safety, efficiency, and cost-effectiveness for potential application as an LLM artificial intelligence tool for medical practice support, learning in clinical settings, and medical education.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Examples of the Japanese Medical Licensing Examination questions inputted into ChatGPT (Chat Generative Pre-trained Transformer; left) and GPT-4 (Generative Pre-trained Transformer-4; right). In the instructions, the text of the Japanese National Medical Examination was used as it is, without any changes.</p>
        <media xlink:href="mededu_v9i1e48002_app1.png" xlink:title="PNG File , 204 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">ChatGPT</term>
          <def>
            <p>Chat Generative Pre-trained Transformer</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">GPT</term>
          <def>
            <p>Generative Pre-trained Transformer</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">JMLE</term>
          <def>
            <p>Japanese Medical Licensing Examination</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">LLaMA</term>
          <def>
            <p>Large Language Model Meta AI</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">MHLW</term>
          <def>
            <p>Ministry of Health, Labour and Welfare</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">USMLE</term>
          <def>
            <p>US Medical Licensing Examination</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors express their appreciation to the members of the Shimane General Medicine Center, particularly Dr Kazumichi Onigata, Dean of the Faculty of Medicine, Shimane University, and Dr Yoshihiko Shiraishi, Director of the Shimane General Medicine Center, for their careful guidance.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>Data supporting the findings of this study are available from the corresponding author (TW) upon request.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="web">
          <article-title>Introducing ChatGPT</article-title>
          <source>OpenAI</source>
          <access-date>2022-11-30</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openai.com/blog/chatgpt/">https://openai.com/blog/chatgpt/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nori</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>King</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>McKinney</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Carignan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Horvitz</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Capabilities of GPT-4 on medical challenge problems</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on March 20, 2023</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2303.13375</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bommarito</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bommarito</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Katz</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Katz</surname>
              <given-names>DM</given-names>
            </name>
          </person-group>
          <article-title>GPT as knowledge worker: a zero-shot evaluation of (AI)CPA capabilities</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on January 11, 2023</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2301.04408</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bommarito</surname>
              <given-names>MJ II</given-names>
            </name>
            <name name-style="western">
              <surname>Katz</surname>
              <given-names>DM</given-names>
            </name>
          </person-group>
          <article-title>GPT takes the bar exam</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on December 29, 2022</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2212.14402</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gilson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Safranek</surname>
              <given-names>CW</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Socrates</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Chi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Taylor</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Chartash</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>How does ChatGPT perform on the United States Medical Licensing Examination? the implications of large language models for medical education and knowledge assessment</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>02</month>
          <day>08</day>
          <volume>9</volume>
          <fpage>e45312</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023//e45312/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/45312</pub-id>
          <pub-id pub-id-type="medline">36753318</pub-id>
          <pub-id pub-id-type="pii">v9i1e45312</pub-id>
          <pub-id pub-id-type="pmcid">PMC9947764</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kung</surname>
              <given-names>TH</given-names>
            </name>
            <name name-style="western">
              <surname>Cheatham</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Medenilla</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sillos</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>De Leon</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Elepaño</surname>
              <given-names>Camille</given-names>
            </name>
            <name name-style="western">
              <surname>Madriaga</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aggabao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Diaz-Candido</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Maningo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tseng</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title>
          <source>PLOS Digit Health</source>
          <year>2023</year>
          <month>02</month>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>e0000198</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36812645"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id>
          <pub-id pub-id-type="medline">36812645</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-22-00371</pub-id>
          <pub-id pub-id-type="pmcid">PMC9931230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sallam</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT utility in healthcare education, research, and practice: systematic review on the promising perspectives and valid concerns</article-title>
          <source>Healthcare (Basel)</source>
          <year>2023</year>
          <month>03</month>
          <day>19</day>
          <volume>11</volume>
          <issue>6</issue>
          <fpage>887</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=healthcare11060887"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/healthcare11060887</pub-id>
          <pub-id pub-id-type="medline">36981544</pub-id>
          <pub-id pub-id-type="pii">healthcare11060887</pub-id>
          <pub-id pub-id-type="pmcid">PMC10048148</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dada</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kleesiek</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Egger</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT in healthcare: a taxonomy and systematic review</article-title>
          <source>medRxiv</source>
          <comment>Preprint posted online on March 30, 2023</comment>
          <pub-id pub-id-type="doi">10.1101/2023.03.30.23287899</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bubeck</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Petro</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Benefits, limits, and risks of GPT-4 as an AI chatbot for medicine</article-title>
          <source>N Engl J Med</source>
          <year>2023</year>
          <month>03</month>
          <day>30</day>
          <volume>388</volume>
          <issue>13</issue>
          <fpage>1233</fpage>
          <lpage>1239</lpage>
          <pub-id pub-id-type="doi">10.1056/NEJMsr2214184</pub-id>
          <pub-id pub-id-type="medline">36988602</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thirunavukarasu</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Hassan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Mahmood</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sanghera</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Barzangi</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>El Mukashfi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Trialling a large language model (ChatGPT) in general practice with the applied knowledge test: observational study demonstrating opportunities and limitations in primary care</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>04</month>
          <day>21</day>
          <volume>9</volume>
          <fpage>e46599</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023//e46599/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/46599</pub-id>
          <pub-id pub-id-type="medline">37083633</pub-id>
          <pub-id pub-id-type="pii">v9i1e46599</pub-id>
          <pub-id pub-id-type="pmcid">PMC10163403</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huh</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Are ChatGPT’s knowledge and interpretation ability comparable to those of medical students in Korea for taking a parasitology examination?: a descriptive study</article-title>
          <source>J Educ Eval Health Prof</source>
          <year>2023</year>
          <month>1</month>
          <day>11</day>
          <volume>20</volume>
          <fpage>1</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36627845"/>
          </comment>
          <pub-id pub-id-type="doi">10.3352/jeehp.2023.20.1</pub-id>
          <pub-id pub-id-type="medline">36627845</pub-id>
          <pub-id pub-id-type="pii">jeehp.2023.20.1</pub-id>
          <pub-id pub-id-type="pmcid">PMC9905868</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="web">
          <article-title>GPT-4 is OpenAI’s most advanced system, producing safer and more useful responses</article-title>
          <source>OpenAI</source>
          <access-date>2023-03-19</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openai.com/product/gpt-4">https://openai.com/product/gpt-4</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>OpenAI</collab>
          </person-group>
          <article-title>GPT-4 technical report</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on March 15, 2023</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2303.08774</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Gong</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Jia</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Fan</surname>
              <given-names>Q</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT performs on the Chinese National Medical Licensing Examination</article-title>
          <source>Research Square</source>
          <comment>Preprint posted online on February 16, 2023</comment>
          <pub-id pub-id-type="doi">10.21203/rs.3.rs-2584079/v1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="web">
          <article-title>Announcement of Successful Passage of the 117th National Medical Examination. Article in Japanese</article-title>
          <source>Ministry of Health, Labour and Welfare (Japan)</source>
          <access-date>2023-03-21</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mhlw.go.jp/general/sikaku/successlist/2023/siken01/about.html">https://www.mhlw.go.jp/general/sikaku/successlist/2023/siken01/about.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="web">
          <source>medu4</source>
          <access-date>2023-03-21</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.medu4.net/">https://www.medu4.net/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="web">
          <article-title>Searching questions. Article in Japanese</article-title>
          <source>medu4</source>
          <access-date>2023-03-21</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medu4.com/quizzes/search">https://medu4.com/quizzes/search</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <collab>StataCorp</collab>
          </person-group>
          <source>Stata 17 Base Reference Manual</source>
          <year>2021</year>
          <publisher-loc>College Station, TX</publisher-loc>
          <publisher-name>Stata Press</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bender</surname>
              <given-names>EM</given-names>
            </name>
          </person-group>
          <article-title>The #BenderRule: on naming the languages we study and why it matters</article-title>
          <source>The Gradient</source>
          <year>2019</year>
          <month>9</month>
          <day>14</day>
          <access-date>2023-03-06</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://thegradient.pub/the-benderrule-on-naming-the-languages-we-study-and-why-it-matters/">https://thegradient.pub/the-benderrule-on-naming-the-languages-we-study-and-why-it-matters/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Seghier</surname>
              <given-names>ML</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT: not all languages are equal</article-title>
          <source>Nature</source>
          <year>2023</year>
          <month>03</month>
          <volume>615</volume>
          <issue>7951</issue>
          <fpage>216</fpage>
          <pub-id pub-id-type="doi">10.1038/d41586-023-00680-3</pub-id>
          <pub-id pub-id-type="medline">36882613</pub-id>
          <pub-id pub-id-type="pii">10.1038/d41586-023-00680-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="web">
          <article-title>GPT-4 system card</article-title>
          <source>OpenAI</source>
          <year>2023</year>
          <month>3</month>
          <day>23</day>
          <access-date>2023-03-21</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cdn.openai.com/papers/gpt-4-system-card.pdf">https://cdn.openai.com/papers/gpt-4-system-card.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hirosawa</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Harada</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yokose</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sakamoto</surname>
              <given-names>Tetsu</given-names>
            </name>
            <name name-style="western">
              <surname>Kawamura</surname>
              <given-names>Ren</given-names>
            </name>
            <name name-style="western">
              <surname>Shimizu</surname>
              <given-names>Taro</given-names>
            </name>
          </person-group>
          <article-title>Diagnostic accuracy of differential-diagnosis lists generated by generative pretrained transformer 3 chatbot for clinical vignettes with common chief complaints: a pilot study</article-title>
          <source>Int J Environ Res Public Health</source>
          <year>2023</year>
          <month>02</month>
          <day>15</day>
          <volume>20</volume>
          <issue>4</issue>
          <fpage>3378</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=ijerph20043378"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/ijerph20043378</pub-id>
          <pub-id pub-id-type="medline">36834073</pub-id>
          <pub-id pub-id-type="pii">ijerph20043378</pub-id>
          <pub-id pub-id-type="pmcid">PMC9967747</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Boßelmann</surname>
              <given-names>Christian M</given-names>
            </name>
            <name name-style="western">
              <surname>Leu</surname>
              <given-names>Costin</given-names>
            </name>
            <name name-style="western">
              <surname>Lal</surname>
              <given-names>Dennis</given-names>
            </name>
          </person-group>
          <article-title>Are AI language models such as ChatGPT ready to improve the care of individuals with epilepsy?</article-title>
          <source>Epilepsia</source>
          <year>2023</year>
          <month>05</month>
          <volume>64</volume>
          <issue>5</issue>
          <fpage>1195</fpage>
          <lpage>1199</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1111/epi.17570"/>
          </comment>
          <pub-id pub-id-type="doi">10.1111/epi.17570</pub-id>
          <pub-id pub-id-type="medline">36869421</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Biswas</surname>
              <given-names>SS</given-names>
            </name>
          </person-group>
          <article-title>Role of Chat GPT in public health</article-title>
          <source>Ann Biomed Eng</source>
          <year>2023</year>
          <month>05</month>
          <volume>51</volume>
          <issue>5</issue>
          <fpage>868</fpage>
          <lpage>869</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1007/s10439-023-03172-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s10439-023-03172-7</pub-id>
          <pub-id pub-id-type="medline">36920578</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10439-023-03172-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Fan</surname>
              <given-names>KY</given-names>
            </name>
            <name name-style="western">
              <surname>Pandey</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Yau</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Teng</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ashraf</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Singla</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Insights from teaching artificial intelligence to medical students in Canada</article-title>
          <source>Commun Med (Lond)</source>
          <year>2022</year>
          <month>6</month>
          <day>3</day>
          <volume>2</volume>
          <fpage>63</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/35668847"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s43856-022-00125-4</pub-id>
          <pub-id pub-id-type="medline">35668847</pub-id>
          <pub-id pub-id-type="pii">125</pub-id>
          <pub-id pub-id-type="pmcid">PMC9166802</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Soman</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ranjani</surname>
              <given-names>HG</given-names>
            </name>
          </person-group>
          <article-title>Observations on LLMs for telecom domain: capabilities and limitations</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on May 22, 2023</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2305.13102</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
