<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="letter" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JME</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id>
      <journal-title>JMIR Medical Education</journal-title>
      <issn pub-type="epub">2369-3762</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v9i1e50336</article-id>
      <article-id pub-id-type="pmid">37440299</article-id>
      <article-id pub-id-type="doi">10.2196/50336</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Letter to the Editor</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Letter to the Editor</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Authors’ Reply to: Variability in Large Language Models’ Responses to Medical Licensing and Certification Examinations</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Leung</surname>
            <given-names>Tiffany</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Gilson</surname>
            <given-names>Aidan</given-names>
          </name>
          <degrees>BS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4770-4705</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Safranek</surname>
            <given-names>Conrad W</given-names>
          </name>
          <degrees>BS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1985-9432</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Huang</surname>
            <given-names>Thomas</given-names>
          </name>
          <degrees>BS</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9056-7016</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Socrates</surname>
            <given-names>Vimig</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7955-9875</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Chi</surname>
            <given-names>Ling</given-names>
          </name>
          <degrees>BSE</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8270-9245</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Taylor</surname>
            <given-names>Richard Andrew</given-names>
          </name>
          <degrees>MD, MHS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9082-6644</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author" corresp="yes" equal-contrib="yes">
          <name name-style="western">
            <surname>Chartash</surname>
            <given-names>David</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Section for Biomedical Informatics and Data Science</institution>
            <institution>Yale University School of Medicine</institution>
            <addr-line>100 College Street, 9th Fl</addr-line>
            <addr-line>New Haven, CT, 06510</addr-line>
            <country>United States</country>
            <phone>1 203 737 5379</phone>
            <email>david.chartash@yale.edu</email>
          </address>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-0265-330X</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Section for Biomedical Informatics and Data Science</institution>
        <institution>Yale University School of Medicine</institution>
        <addr-line>New Haven, CT</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Emergency Medicine</institution>
        <institution>Yale University School of Medicine</institution>
        <addr-line>New Haven, CT</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Program of Computational Biology and Bioinformatics</institution>
        <institution>Yale University</institution>
        <addr-line>New Haven, CT</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>School of Medicine</institution>
        <institution>University College Dublin</institution>
        <institution>National University of Ireland, Dublin</institution>
        <addr-line>Dublin</addr-line>
        <country>Ireland</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: David Chartash <email>david.chartash@yale.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2023</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>13</day>
        <month>7</month>
        <year>2023</year>
      </pub-date>
      <volume>9</volume>
      <elocation-id>e50336</elocation-id>
      <history>
        <date date-type="received">
          <day>27</day>
          <month>6</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>5</day>
          <month>7</month>
          <year>2023</year>
        </date>
      </history>
      <copyright-statement>©Aidan Gilson, Conrad W Safranek, Thomas Huang, Vimig Socrates, Ling Chi, Richard Andrew Taylor, David Chartash. Originally published in JMIR Medical Education (https://mededu.jmir.org), 13.07.2023.</copyright-statement>
      <copyright-year>2023</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on https://mededu.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://mededu.jmir.org/2023/1/e50336" xlink:type="simple"/>
      <related-article related-article-type="commentary-article" id="v9i1e48305" ext-link-type="doi" xlink:href="10.2196/48305" vol="9" page="e48305" xlink:type="simple">https://mededu.jmir.org/2023/1/e48305/</related-article>
      <related-article related-article-type="commentary-article" id="v9i1e45312" ext-link-type="doi" xlink:href="10.2196/45312" vol="9" page="e45312" xlink:type="simple">https://mededu.jmir.org/2023/1/e45312</related-article>
      <kwd-group>
        <kwd>natural language processing</kwd>
        <kwd>NLP</kwd>
        <kwd>MedQA</kwd>
        <kwd>generative pre-trained transformer</kwd>
        <kwd>GPT</kwd>
        <kwd>medical education</kwd>
        <kwd>chatbot</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>AI</kwd>
        <kwd>education technology</kwd>
        <kwd>ChatGPT</kwd>
        <kwd>conversational agent</kwd>
        <kwd>machine learning</kwd>
        <kwd>large language models</kwd>
        <kwd>knowledge assessment</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <p>We thank Epstein and Dexter [<xref ref-type="bibr" rid="ref1">1</xref>] for their close reading of our paper, “How Does ChatGPT Perform on the United States Medical Licensing Examination? The Implications of Large Language Models for Medical Education and Knowledge Assessment” [<xref ref-type="bibr" rid="ref2">2</xref>]. In response to their comments, we present the following points for clarification:</p>
    <list list-type="bullet">
      <list-item>
        <p>While search engines such as Bing (Microsoft Corp) and Google (Google LLC) have been noted to implement geographic tuning when presenting their information retrieval results, there is no evidence or documentation that the version of ChatGPT (OpenAI) used in our work similarly alters its output given the geolocation of the user or the device that is being used. Notably, however, the integration of ChatGPT into other online services, such as Bing or Snapchat (Snap Inc), has made the information provided to those services (eg, time zone or geolocation) available to ChatGPT [<xref ref-type="bibr" rid="ref3">3</xref>].</p>
      </list-item>
      <list-item>
        <p>Additionally, although it may be true that (dialectal) grammatical differences in the English language result in variability that may mimic the variability of prompt engineering, there is no empirical evidence that this alters the performance of ChatGPT. Further examination of the correlation between prompt engineering methods and within-sentence grammatical tuning or variability may alleviate these concerns in future research.</p>
      </list-item>
      <list-item>
        <p>Although it is a medical knowledge–based examination, the American Board of Preventive Medicine Longitudinal Assessment Program pilot for clinical informatics is not equivalent to the USMLE (United States Medical Licensing Examination). ChatGPT’s performance on this maintenance of certification examination has been examined by Kumah-Crystal et al [<xref ref-type="bibr" rid="ref4">4</xref>], and we defer to their assessment as a more apt comparator.</p>
      </list-item>
      <list-item>
        <p>While Epstein and Dexter [<xref ref-type="bibr" rid="ref1">1</xref>] offer a comparison between ChatGPT 3.5, ChatGPT 4.0, and Google Bard, it is unclear how the three have been statistically compared in terms of sample size and answer quality beyond performance on multiple-choice questions. Bootstrapping responses appears to address an element of variability in large language model (LLM) responses; however, a more robust statistical comparison is warranted alongside a comparison of nonbinarized LLM output performance.</p>
      </list-item>
      <list-item>
        <p>While there is no doubt that there is variability in the responses of LLMs to identical inputs (as these tools are nondeterministic in character), we do not believe this devalues the statistical significance or the quantitative validity of our results. As we are evaluating the performance of ChatGPT in the same situation as a student examinee, a single response is more applicable. Additionally, since we used a large sample size of questions, which accounted for model variability, we elected not to repeat questions multiple times.</p>
      </list-item>
    </list>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">USMLE</term>
          <def>
            <p>United States Medical Licensing Examination</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Epstein</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Dexter</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Variability in Large Language Models’ Responses to Medical Licensing and Certification Examinations. Comment on “How Does ChatGPT Perform on the United States Medical Licensing Examination? The Implications of Large Language Models for Medical Education and Knowledge Assessment”</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <volume>9</volume>
          <fpage>e48305</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e48305/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48305</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gilson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Safranek</surname>
              <given-names>CW</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Socrates</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Chi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Taylor</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Chartash</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>How does ChatGPT perform on the United States Medical Licensing Examination? The implications of large language models for medical education and knowledge assessment</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>02</month>
          <day>08</day>
          <volume>9</volume>
          <fpage>e45312</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.2196/45312"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/45312</pub-id>
          <pub-id pub-id-type="medline">36753318</pub-id>
          <pub-id pub-id-type="pii">v9i1e45312</pub-id>
          <pub-id pub-id-type="pmcid">PMC9947764</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="web">
          <article-title>How My AI uses location data</article-title>
          <source>Snapchat Support</source>
          <access-date>2023-06-25</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://archive.is/wcmk3">https://archive.is/wcmk3</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kumah-Crystal</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Mankowitz</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Embi</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Lehmann</surname>
              <given-names>CU</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT and the clinical informatics board examination: the end of unproctored maintenance of certification?</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2023</year>
          <month>06</month>
          <day>19</day>
          <fpage>104</fpage>
          <pub-id pub-id-type="doi">10.1093/jamia/ocad104</pub-id>
          <pub-id pub-id-type="medline">37335851</pub-id>
          <pub-id pub-id-type="pii">7202064</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
