<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="letter" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JME</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id>
      <journal-title>JMIR Medical Education</journal-title>
      <issn pub-type="epub">2369-3762</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v9i1e48305</article-id>
      <article-id pub-id-type="pmid">37440293</article-id>
      <article-id pub-id-type="doi">10.2196/48305</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Letter to the Editor</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Letter to the Editor</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Variability in Large Language Models’ Responses to Medical Licensing and Certification Examinations. Comment on “How Does ChatGPT Perform on the United States Medical Licensing Examination? The Implications of Large Language Models for Medical Education and Knowledge Assessment”</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Leung</surname>
            <given-names>Tiffany</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Gilson</surname>
            <given-names>Aidan</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zielinski</surname>
            <given-names>Chris</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes" equal-contrib="yes">
          <name name-style="western">
            <surname>Epstein</surname>
            <given-names>Richard H</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Anesthesiology, Perioperative Medicine and Pain Management</institution>
            <institution>University of Miami Miller School of Medicine</institution>
            <addr-line>1400 NW 12th Ave</addr-line>
            <addr-line>Suite 4022F</addr-line>
            <addr-line>Miami, FL, 33136</addr-line>
            <country>United States</country>
            <fax>1 305 689 5501</fax>
            <phone>1 215 896 7850</phone>
            <email>repstein@med.miami.edu</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8466-3845</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Dexter</surname>
            <given-names>Franklin</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5897-2484</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Anesthesiology, Perioperative Medicine and Pain Management</institution>
        <institution>University of Miami Miller School of Medicine</institution>
        <addr-line>Miami, FL</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Division of Management Consulting</institution>
        <institution>Department of Anesthesia</institution>
        <institution>University of Iowa</institution>
        <addr-line>Iowa City, IA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Richard H Epstein <email>repstein@med.miami.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2023</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>13</day>
        <month>7</month>
        <year>2023</year>
      </pub-date>
      <volume>9</volume>
      <elocation-id>e48305</elocation-id>
      <history>
        <date date-type="received">
          <day>18</day>
          <month>4</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>16</day>
          <month>6</month>
          <year>2023</year>
        </date>
        <date date-type="rev-recd">
          <day>16</day>
          <month>6</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>22</day>
          <month>6</month>
          <year>2023</year>
        </date>
      </history>
      <copyright-statement>©Richard H Epstein, Franklin Dexter. Originally published in JMIR Medical Education (https://mededu.jmir.org), 13.07.2023.</copyright-statement>
      <copyright-year>2023</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on https://mededu.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://mededu.jmir.org/2023/1/e48305" xlink:type="simple"/>
      <related-article related-article-type="commentary-article" id="v9i1e45312" ext-link-type="doi" xlink:href="10.2196/45312" vol="9" page="e45312" xlink:type="simple">https://mededu.jmir.org/2023/1/e45312</related-article>
      <related-article related-article-type="commentary" id="v9i1e50336" ext-link-type="doi" xlink:href="10.2196/50336" vol="9" page="e50336" xlink:type="simple">https://mededu.jmir.org/2023/1/e50336/</related-article>
      <kwd-group>
        <kwd>natural language processing</kwd>
        <kwd>NLP</kwd>
        <kwd>MedQA</kwd>
        <kwd>generative pre-trained transformer</kwd>
        <kwd>GPT</kwd>
        <kwd>medical education</kwd>
        <kwd>chatbot</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>AI</kwd>
        <kwd>education technology</kwd>
        <kwd>ChatGPT</kwd>
        <kwd>Google Bard</kwd>
        <kwd>conversational agent</kwd>
        <kwd>machine learning</kwd>
        <kwd>large language models</kwd>
        <kwd>knowledge assessment</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <p>We read with interest the recent study by Gilson and colleagues [<xref ref-type="bibr" rid="ref1">1</xref>], “How Does ChatGPT Perform on the United States Medical Licensing Examination? The Implications of Large Language Models for Medical Education and Knowledge Assessment.” Based on their detailed evaluation of the model’s performance, including content analysis and logical reasoning, the authors suggested that ChatGPT has potential application as a medical education tool to support interactive peer group education. We take no issue with those conclusions. However, what is not emphasized in the article is that search engines often provide different results based on the login credentials of the person executing the search, the location (country), and the device [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>]. Thus, because the performance results presented by the authors did not account for this variability, their single comparisons of the various models against the different sets of questions may be statistically unreliable. Again, we are not suggesting that the authors’ useful conclusions would change, but quantitative performance will differ.</p>
    <p>We evaluated this issue of varying responses using all questions from the most recent quarterly, online, open-book American Board of Preventive Medicine (ABPM) pilot evaluation of a longitudinal assessment program for the maintenance of certification of its clinical informatics diplomates. We evaluated ChatGPT, version 3.5 (OpenAI), and Google Bard (Alphabet Inc) by copying and pasting each of the 12 questions and the corresponding 4-part multiple-choice options into the chatbots’ message boxes on March 30, 2023, and April 1, 2023, respectively. We added a request to provide citations for each question. Both chatbots supplied the option they considered best, with a justification, references, and an explanation as to why each option was either incorrect or inferior to the recommended answer.</p>
    <p>For ChatGPT, the series of 12 questions was performed 10 times in separate chat sessions to avoid memory effects from a previous search, with each session scored against the answer key provided by the ABPM. The results showed that 8 of the 12 questions were answered correctly in 9 sessions and 9 of the 12 questions were answered correctly in 1 session. Although 8 questions had perfect (10/10) concordance with the answer key, there were 2 questions with 2 different answers and 1 with 3 different answers. For the twelfth question, the same answer was provided in every session, and that answer disagreed with the answer key. These scores were at least as good as the average performance of the diplomates participating in the maintenance of certification process (61%, to date), which allows the use of online resources, and likely would have represented a passing score. We also evaluated the experimental ChatGPT, version 4.0, in 5 separate chat sessions, which produced sequential scores of 10, 8, 8, 6, and 7. For Google Bard, the process was performed 9 times, and the most common answer was selected as the best response. The modal responses were correct for 7 out of 12 questions (sequential scores of 7, 6, 7, 6, 7, 5, 6, 7, and 8). There were 5 questions for which 2 different answers were provided and 1 question for which all 4 answers were provided as correct answers during different sessions. Google Bard agreed with the ABPM answer key for only 4 questions in all sessions.</p>
    <p>The questions where the large language models consistently disagreed with the ABPM answer key were either based on low-level evidence or involved an opinion on a “best” approach. As implied by Gilson et al [<xref ref-type="bibr" rid="ref1">1</xref>], these dichotomies emphasize the importance of using artificial intelligence products to foster discussion rather than considering them arbiters of truth. Since both ChatGPT and Google Bard provide justifications and references, groups or individuals using these products for education can learn from the supplied material. If used for such purposes, we recommend submitting questions several times in separate sessions and considering the range of responses.</p>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">ABPM</term>
          <def>
            <p>American Board of Preventive Medicine</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gilson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Safranek</surname>
              <given-names>CW</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Socrates</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Chi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Taylor</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Chartash</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>How does ChatGPT perform on the United States Medical Licensing Examination? The implications of large language models for medical education and knowledge assessment</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>02</month>
          <day>08</day>
          <volume>9</volume>
          <fpage>e45312</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.2196/45312"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/45312</pub-id>
          <pub-id pub-id-type="medline">36753318</pub-id>
          <pub-id pub-id-type="pii">v9i1e45312</pub-id>
          <pub-id pub-id-type="pmcid">PMC9947764</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="web">
          <article-title>Why your Google Search results might differ from other people</article-title>
          <source>Google Search Help</source>
          <access-date>2023-06-22</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://support.google.com/websearch/answer/12412910?hl=en&#38;sjid=14431510508711933103-NA">https://support.google.com/websearch/answer/12412910?hl=en&#38;sjid=14431510508711933103-NA</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McEvoy</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Reasons Google Search results vary dramatically (updated and expanded)</article-title>
          <source>Web Presence Solutions</source>
          <year>2020</year>
          <month>06</month>
          <day>29</day>
          <access-date>2023-06-22</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.webpresencesolutions.net/7-reasons-google-search-results-vary-dramatically/">https://www.webpresencesolutions.net/7-reasons-google-search-results-vary-dramatically/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
