<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JME</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id>
      <journal-title>JMIR Medical Education</journal-title>
      <issn pub-type="epub">2369-3762</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v10i1e50842</article-id>
      <article-id pub-id-type="pmid">38236632</article-id>
      <article-id pub-id-type="doi">10.2196/50842</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Performance of ChatGPT on Ophthalmology-Related Questions Across Various Examination Levels: Observational Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Venkatesh</surname>
            <given-names>Kaushik</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Saxena</surname>
            <given-names>Amit</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Wang</surname>
            <given-names>Yvette</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Haddad</surname>
            <given-names>Firas</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3420-3880</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Saade</surname>
            <given-names>Joanna S</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <address>
            <institution>Department of Ophthalmology</institution>
            <institution>American University of Beirut Medical Center</institution>
            <addr-line>Bliss Street</addr-line>
            <addr-line>Beirut, 1107 2020</addr-line>
            <country>Lebanon</country>
            <phone>961 1350000 ext 8031</phone>
            <email>js62@aub.edu.lb</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1098-4923</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Faculty of Medicine</institution>
        <institution>American University of Beirut</institution>
        <addr-line>Beirut</addr-line>
        <country>Lebanon</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Ophthalmology</institution>
        <institution>American University of Beirut Medical Center</institution>
        <addr-line>Beirut</addr-line>
        <country>Lebanon</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Joanna S Saade <email>js62@aub.edu.lb</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>18</day>
        <month>1</month>
        <year>2024</year>
      </pub-date>
      <volume>10</volume>
      <elocation-id>e50842</elocation-id>
      <history>
        <date date-type="received">
          <day>14</day>
          <month>7</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>14</day>
          <month>10</month>
          <year>2023</year>
        </date>
        <date date-type="rev-recd">
          <day>9</day>
          <month>12</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>27</day>
          <month>12</month>
          <year>2023</year>
        </date>
      </history>
      <copyright-statement>©Firas Haddad, Joanna S Saade. Originally published in JMIR Medical Education (https://mededu.jmir.org), 18.01.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on https://mededu.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://mededu.jmir.org/2024/1/e50842" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>ChatGPT and language learning models have gained attention recently for their ability to answer questions on various examinations across various disciplines. The question of whether ChatGPT could be used to aid in medical education is yet to be answered, particularly in the field of ophthalmology.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>The aim of this study is to assess the ability of ChatGPT-3.5 (GPT-3.5) and ChatGPT-4.0 (GPT-4.0) to answer ophthalmology-related questions across different levels of ophthalmology training.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>Questions from the United States Medical Licensing Examination (USMLE) steps 1 (n=44), 2 (n=60), and 3 (n=28) were extracted from AMBOSS, and 248 questions (64 easy, 122 medium, and 62 difficult questions) were extracted from the book, <italic>Ophthalmology Board Review Q&#38;A</italic>, for the Ophthalmic Knowledge Assessment Program and the Board of Ophthalmology (OB) Written Qualifying Examination (WQE). Questions were prompted identically and inputted to GPT-3.5 and GPT-4.0.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>GPT-3.5 achieved a total of 55% (n=210) of correct answers, while GPT-4.0 achieved a total of 70% (n=270) of correct answers. GPT-3.5 answered 75% (n=33) of questions correctly in USMLE step 1, 73.33% (n=44) in USMLE step 2, 60.71% (n=17) in USMLE step 3, and 46.77% (n=116) in the OB-WQE. GPT-4.0 answered 70.45% (n=31) of questions correctly in USMLE step 1, 90.32% (n=56) in USMLE step 2, 96.43% (n=27) in USMLE step 3, and 62.90% (n=156) in the OB-WQE. GPT-3.5 performed poorer as examination levels advanced (<italic>P</italic>&#60;.001), while GPT-4.0 performed better on USMLE steps 2 and 3 and worse on USMLE step 1 and the OB-WQE (<italic>P</italic>&#60;.001). The coefficient of correlation (<italic>r</italic>) between ChatGPT answering correctly and human users answering correctly was 0.21 (<italic>P</italic>=.01) for GPT-3.5 as compared to –0.31 (<italic>P</italic>&#60;.001) for GPT-4.0. GPT-3.5 performed similarly across difficulty levels, while GPT-4.0 performed more poorly with an increase in the difficulty level. Both GPT models performed significantly better on certain topics than on others.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>ChatGPT is far from being considered a part of mainstream medical education. Future models with higher accuracy are needed for the platform to be effective in medical education.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>ChatGPT</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>AI</kwd>
        <kwd>board examinations</kwd>
        <kwd>ophthalmology</kwd>
        <kwd>testing</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Recently, advances in artificial intelligence (AI) models, more specifically natural language processing (NLP), led to the development of large language models (LLMs) that have shown remarkable performance on a variety of tasks [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. ChatGPT is among the most popular of these models. It was developed by OpenAI and has had several version updates since its inception. GPT-3.5 was among the earlier versions developed, followed by GPT-4.0, developed on March 15, 2023, as a more robust, concise, and intelligent model. ChatGPT has become quite famous for its outstanding ability to answer questions and assist in many tasks [<xref ref-type="bibr" rid="ref4">4</xref>].</p>
      <p>Medical education relies highly on standardized multiple-choice examinations to test medical students in an objective and consistent way. Ophthalmologists in the United States pass through the United States Medical Licensing Examination (USMLE) steps 1, 2, and 3, the Ophthalmic Knowledge Assessment Program (OKAP), and the Board of Ophthalmology (OB) Written Qualifying Examination (WQE) by the time they become practicing physicians. Undergraduate and graduate medical students rely on different tools available to prepare for these examinations.</p>
      <p>One limitation of the current tools for medical education is the lack of personalization. Question banks used today do not tailor their explanations to users; rather, they present one explanation for each question to all its users. ChatGPT and other LLMs, if proven to be accurate in their ability to answer questions, can provide robust explanations to users, and users can then ask specific questions they need further clarification on. This can be very helpful and educational for users as it can tailor to the needs of each user and help them fill specific knowledge gaps they may have. Additionally, the GPT-3.5 model is freely available to everyone, while GPT-4.0 is available at a premium. As such, it is essential to compare these models to assess whether GPT-4.0’s hypothetical increased abilities justify the price of the membership.</p>
      <p>The question of how ChatGPT can be integrated for use in medical education has emerged. With the complexity of ophthalmology, the ability of ChatGPT to accurately answer ophthalmology questions could be of significant value to medical students and residents preparing for the USMLE, OKAP, and OB-WQE. It is also important to compare the performance of both GPT-4.0 and GPT-3.5, since GPT-4.0 is marketed as a more intelligent version of its predecessor.</p>
      <p>Therefore, the aim of this study is to evaluate the performance of ChatGPT on ophthalmology questions from USMLE steps 1, 2, and 3, the OKAP, and the OB-WQE using both GPT-3.5 and GPT-4.0. We hypothesize that ChatGPT’s responses are comparable to those of human experts in the field, and that GPT-4.0 performs better than GPT-3.5. The results of this study could have implications for the future use of ChatGPT in medical education and training, and for the development of more efficient and effective tools for examination preparation.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Sets</title>
        <p>Different data sets were used for the different examinations due to the lack of a central service for all examinations. Questions that included pictures or tables were automatically excluded and were not queried on ChatGPT. AMBOSS [<xref ref-type="bibr" rid="ref5">5</xref>], a question bank and popular resource for the USMLE, was used for steps 1, 2, and 3. A total of 44 questions were included for step 1, 60 for step 2, and 28 for step 3. AMBOSS highlights the difficulty of each question and the percentage of people who chose each answer choice. This allowed us to compare the performance of ChatGPT to the general population [<xref ref-type="bibr" rid="ref5">5</xref>]. For the OKAP and OB-WQE, 248 questions across the different chapters were taken from <italic>Ophthalmology Board Review Q&#38;A</italic> by Glass et al [<xref ref-type="bibr" rid="ref6">6</xref>].</p>
      </sec>
      <sec>
        <title>Prompt Engineering</title>
        <p>The style and the prompt of the questions asked to ChatGPT have been shown to have an impact on the answer given. To standardize the process of asking the questions to ChatGPT, questions were all formatted in the same way on Word (Microsoft Corp). After removing questions with pictures or tables, the questions were formatted in the manner described by Gilson et al [<xref ref-type="bibr" rid="ref7">7</xref>]. The question stem was consolidated in 1 paragraph, and then each answer choice was placed on a separate line. Furthermore, the answer choices were separated by 2 empty lines from the main question stem; this was done to optimize the accuracy of the results, avoiding any effect the question format may have on ChatGPT’s ability. An example prompt is shown in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref>.</p>
        <boxed-text id="box1" position="float">
          <title>An example of a prompt (written by the authors).</title>
          <p>Question: What medical discipline deals with conditions of the eye</p>
          <p>A. Dermatology</p>
          <p>B. Endocrinology</p>
          <p>C. Ophthalmology</p>
          <p>D. Rheumatology</p>
        </boxed-text>
      </sec>
      <sec>
        <title>Question Input</title>
        <p>All questions were input in ChatGPT on March 5, 2023, for GPT-3.5 and April 15, 2023, for GPT-4.0. We then used Excel (Microsoft Corp) spreadsheets to record whether the answer was correct or not, the percentage of users getting the answer correct (if applicable), the difficulty level (if applicable), and the topic (if applicable).</p>
      </sec>
      <sec>
        <title>Data Analysis</title>
        <p>Data analysis was conducted using both Python (Python Software Foundation) and Excel. Excel was used to determine the percentage of correct answers. Python (Python Anaconda Spyder 5.3.3) was used to determine the percentage of correct answers by difficulty, test type, and topic. A chi-square test was conducted on Python to determine whether there are any significant differences in answering correctly based on test type and difficulty. Python was also used to compute the coefficient of correlation (and <italic>P</italic> value) between ChatGPT answering correctly and the percentage of users who got the correct answer. Point-biserial was used to compute the correlation between ChatGPT answering questions correctly and humans answering correctly. Other tests included chi-square analysis and the Fisher exact test to investigate relationships between 2 categorical variables (difficulty level, correct or incorrect answers, etc).</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>Since this study does not involve any human participants, institutional review board approval is not necessary for the purpose of this study. This study also respects the rights and copyright of the owners of the resources used and has obtained their approval for using the questions without sharing the questions anywhere in the data or paper.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>A total of 380 questions were queried on ChatGPT. The number of questions for each examination was 44 for step 1, 60 for step 2, 28 for step 3, and 248 for the OKAP and OB-WQE. The total percentage of correct answers was 55% (n=210) across all examinations for GPT-3.5, while it was 70% (n=270) for GPT-4.0. <xref ref-type="table" rid="table1">Table 1</xref> shows the number and percentage of correct answers for each examination by each GPT model.</p>
      <p>Between GPT-3.5 and GPT-4.0, GPT-4.0 performed significantly better on USMLE steps 2 and 3 and the OB-WQE but not on USMLE step 1. While GPT-3.5’s performance decreased with an increase in the examination level (<italic>P</italic>&#60;.001), GPT-4.0 performed better on USMLE steps 2 and 3 and poorer on the OB-WQE and USMLE step 1. The coefficient of correlation (<italic>r</italic>) between ChatGPT answering correctly and the percentage of humans answering correctly on AMBOSS was 0.21 (<italic>P</italic>=.01) for GPT-3.5 and –0.31 (<italic>P</italic>&#60;.001) for GPT-4.0.</p>
      <p><xref ref-type="table" rid="table2">Table 2</xref> highlights the percentage of correct questions based on the difficulty level in the AMBOSS questions and in the OB-WQE questions.</p>
      <p><xref ref-type="table" rid="table3">Table 3</xref> highlights the performance of both models according to the different topics in the OB-WQE and OKAP questions. Performance for both models was nonrandom, with both models performing better on certain topics such as corneal diseases, pediatrics, retina, ocular oncology, and neuro-ophthalmology.</p>
      <table-wrap position="float" id="table1">
        <label>Table 1</label>
        <caption>
          <p>Performance of GPT-3.5 and GPT-4.0 on various examinations.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="250"/>
          <col width="250"/>
          <col width="250"/>
          <col width="250"/>
          <thead>
            <tr valign="top">
              <td>Examination</td>
              <td colspan="2">Correct answers provided by models<sup>a</sup>, n (%)</td>
              <td><italic>P</italic> value</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>GPT-3.5</td>
              <td>GPT-4.0</td>
              <td>
                <break/>
              </td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>USMLE<sup>b</sup> step 1</td>
              <td>33 (75)</td>
              <td>31 (70.45)</td>
              <td>.81</td>
            </tr>
            <tr valign="top">
              <td>USMLE step 2</td>
              <td>44 (73.33)</td>
              <td>56 (90.32)</td>
              <td>.01</td>
            </tr>
            <tr valign="top">
              <td>USMLE step 3</td>
              <td>17 (60.71)</td>
              <td>27 (96.43)</td>
              <td>.004</td>
            </tr>
            <tr valign="top">
              <td>OB-WQE<sup>c</sup></td>
              <td>116 (46.77)</td>
              <td>156 (62.90)</td>
              <td>&#60;.001</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table1fn1">
            <p><sup>a</sup><italic>P</italic>&#60;.001 for between-model comparisons in the proportion of correct answers.</p>
          </fn>
          <fn id="table1fn2">
            <p><sup>b</sup>USMLE: United States Medical Licensing Examination.</p>
          </fn>
          <fn id="table1fn3">
            <p><sup>c</sup>OB-WQE: Board of Ophthalmology Written Qualifying Examination.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <table-wrap position="float" id="table2">
        <label>Table 2</label>
        <caption>
          <p>Performance of GPT-3.5 and GPT-4.0 according to different difficulty levels.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="120"/>
          <col width="100"/>
          <col width="80"/>
          <col width="100"/>
          <col width="0"/>
          <col width="100"/>
          <col width="0"/>
          <col width="0"/>
          <col width="120"/>
          <col width="0"/>
          <col width="100"/>
          <col width="0"/>
          <col width="80"/>
          <col width="100"/>
          <col width="100"/>
          <thead>
            <tr valign="top">
              <td colspan="8">GPT-4.0</td>
              <td colspan="7">GPT-3.5</td>
            </tr>
            <tr valign="top">
              <td>Board of Ophthalmology difficulty level</td>
              <td>Correct answers<sup>a</sup>, n (%)</td>
              <td colspan="5">AMBOSS<sup>b</sup></td>
              <td colspan="3">Board of Ophthalmology difficulty level</td>
              <td colspan="2">Correct answers<sup>c</sup>, n (%)</td>
              <td colspan="3">AMBOSS<sup>d</sup></td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Difficulty level</td>
              <td>ChatGPT’s performance (correct answers), n (%)</td>
              <td colspan="2">Human performance (correct answers), %</td>
              <td colspan="3">
                <break/>
              </td>
              <td colspan="2">
                <break/>
              </td>
              <td colspan="2">Difficulty level</td>
              <td>ChatGPT’s performance (correct answers), n (%)</td>
              <td>Human performance (correct answers), %</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>1</td>
              <td>49 (76)</td>
              <td>1</td>
              <td>19 (100)</td>
              <td colspan="2">83</td>
              <td colspan="3">1</td>
              <td colspan="2">34 (53)</td>
              <td colspan="2">1</td>
              <td>14 (88)</td>
              <td>83</td>
            </tr>
            <tr valign="top">
              <td>2</td>
              <td>73 (59)</td>
              <td>2</td>
              <td>43 (91)</td>
              <td colspan="2">68</td>
              <td colspan="3">2</td>
              <td colspan="2">54 (44.26)</td>
              <td colspan="2">2</td>
              <td>36 (77)</td>
              <td>68</td>
            </tr>
            <tr valign="top">
              <td>3</td>
              <td>35 (56)</td>
              <td>3</td>
              <td>38 (84)</td>
              <td colspan="2">53</td>
              <td colspan="3">3</td>
              <td colspan="2">28 (45.16)</td>
              <td colspan="2">3</td>
              <td>28 (63)</td>
              <td>53</td>
            </tr>
            <tr valign="top">
              <td>N/A<sup>e</sup></td>
              <td>N/A</td>
              <td>4</td>
              <td>10 (59)</td>
              <td colspan="2">37</td>
              <td colspan="3">N/A</td>
              <td colspan="2">N/A</td>
              <td colspan="2">4</td>
              <td>12 (60)</td>
              <td>37</td>
            </tr>
            <tr valign="top">
              <td>N/A</td>
              <td>N/A</td>
              <td>5</td>
              <td>4 (66.67)</td>
              <td colspan="2">26</td>
              <td colspan="3">N/A</td>
              <td colspan="2">N/A</td>
              <td colspan="2">5</td>
              <td>3 (50)</td>
              <td>26</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table2fn1">
            <p><sup>a</sup><italic>P</italic>=.04 on comparing the performance of GPT-4.0 across different difficulty levels.</p>
          </fn>
          <fn id="table2fn2">
            <p><sup>b</sup><italic>P</italic>=.003 on comparing the performance of GPT-4.0 across different difficulty levels.</p>
          </fn>
          <fn id="table2fn3">
            <p><sup>c</sup><italic>P</italic>=.49 on comparing the performance of GPT-3.5 across different difficulty levels.</p>
          </fn>
          <fn id="table2fn4">
            <p><sup>d</sup><italic>P</italic>=.18 on comparing the performance of GPT-3.5 across different difficulty levels.</p>
          </fn>
          <fn id="table2fn5">
            <p><sup>e</sup>N/A: not applicable.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <table-wrap position="float" id="table3">
        <label>Table 3</label>
        <caption>
          <p>Performance of GPT-3.5 and GPT-4.0 on various included topics.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="290"/>
          <col width="150"/>
          <col width="290"/>
          <col width="150"/>
          <col width="120"/>
          <thead>
            <tr valign="top">
              <td>Category</td>
              <td>Correct answers by GPT-4.0<sup>a</sup>, n (%)</td>
              <td>Topic</td>
              <td>Correct answers by GPT-3.5<sup>b</sup>, n (%)</td>
              <td><italic>P</italic> value</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Cornea, external disease, and anterior segment</td>
              <td>28 (74)</td>
              <td>Cornea, external disease, and anterior segment</td>
              <td>25 (66)</td>
              <td>.45</td>
            </tr>
            <tr valign="top">
              <td>Glaucoma</td>
              <td>20 (61)</td>
              <td>Glaucoma</td>
              <td>16 (48)</td>
              <td>.32</td>
            </tr>
            <tr valign="top">
              <td>Lens and cataract</td>
              <td>22 (88)</td>
              <td>Lens and cataract</td>
              <td>8 (32)</td>
              <td>&#60;.001<sup>c</sup></td>
            </tr>
            <tr valign="top">
              <td>Neuro-ophthalmology</td>
              <td>15 (54)</td>
              <td>Neuro-ophthalmology</td>
              <td>16 (57)</td>
              <td>.06</td>
            </tr>
            <tr valign="top">
              <td>Oculofacial, plastics, and orbit</td>
              <td>17 (50)</td>
              <td>Oculofacial, plastics, and orbit</td>
              <td>10 (29)</td>
              <td>.08</td>
            </tr>
            <tr valign="top">
              <td>Pediatric ophthalmology and strabismus</td>
              <td>14 (61)</td>
              <td>Pediatric ophthalmology and strabismus</td>
              <td>9 (34)</td>
              <td>.07</td>
            </tr>
            <tr valign="top">
              <td>Refractive management and optics</td>
              <td>17 (50)</td>
              <td>Refractive management and optics</td>
              <td>14 (41)</td>
              <td>.46</td>
            </tr>
            <tr valign="top">
              <td>Retina and ocular oncology</td>
              <td>24 (73)</td>
              <td>Retina and ocular oncology</td>
              <td>18 (54)</td>
              <td>.12</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table3fn1">
            <p><sup>a</sup><italic>P</italic>=.02 for differences in the number of correct answers provided by GPT-4.0 among different categories.</p>
          </fn>
          <fn id="table3fn2">
            <p><sup>b</sup><italic>P</italic>=.03 for differences in the number of correct answers provided by GPT-3.5 among different topics.</p>
          </fn>
          <fn id="table3fn3">
            <p><sup>c</sup>Significant at <italic>P</italic>&#60;.05.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Our results indicate that GPT-4.0 is superior to GPT-3.5, and that GPT-3.5 has a below-average accuracy in answering questions correctly. The total proportion of correct answers for GPT-3.5 was 55% (n=210), which is considered a poor performance, while that of GPT-4.0 was 70% (n=270), which is an almost average performance [<xref ref-type="bibr" rid="ref7">7</xref>]. Students typically must achieve 59%-60% of correct answers to pass, and students perform with an average of around 70%-75% on the aforementioned board examinations [<xref ref-type="bibr" rid="ref7">7</xref>]. It is interesting to note that GPT-3.5’s performance decreased as examination levels increased. This is probably due to the more clinical nature of the examinations. This was not the case for GPT-4.0, which performed best on USMLE steps 2 and 3.</p>
        <p>This study investigates the correlation between ChatGPT-3.5 and -4.0 providing a correct answer and the percentage of human users who provided the answer correctly on AMBOSS. For GPT-3.5, a correlation coefficient of 0.21 (<italic>P</italic>=.01) was noted, whereas this correlation coefficient was –0.31 (<italic>P</italic>&#60;.001) for GPT-4.0. This implies that GPT-4.0 performed better on questions that fewer users answered correctly.</p>
        <p>Our study is limited in that it did not divide the questions into categories such as diagnosis, treatment, basic knowledge, or surgical planning questions. Looking closely at the lens and cataract section in which the model failed (32% of correct answers for GPT-3.5), it was noted that all the correct answers were basic knowledge questions. Surprisingly, an analysis of incorrect answers showed that almost half of the incorrectly answered questions were also basic knowledge questions. For instance, in one of the questions, the model was unable to identify the collagen fiber type in cataract—a piece of information that is widely available on the internet.</p>
        <p>On the other hand, GPT-4.0 performed significantly better on basic knowledge questions. One may postulate that since GPT-4.0 was fed a larger database than was GPT-3.5, it has better abilities in answering basic knowledge questions than GPT-3.5. A study by Taloni et al [<xref ref-type="bibr" rid="ref8">8</xref>] also noted a significant difference in performance between the 2 models in the cataract and anterior segment diseases categories.</p>
        <p>It is unclear why GPT-3.5 performed so poorly in the lens and cataract section. It could be hypothesized that managing diseases of the lens and cataract may be mostly surgical. This may not have been fed into this language learning model. Furthermore, surgical management requires input from images and videos, which were excluded from our paper and may have caused the drastic difference in performance. Further studies with more questions are needed to answer this question.</p>
        <p><xref ref-type="table" rid="table2">Table 2</xref> outlines the percentage of correct answers based on the difficulty level on both models. GPT-4.0 performed poorer on questions with greater difficulties on both AMBOSS and OB-WQE questions, whereas this observation was not significant in GPT-3.5, indicating that it performed almost equally well across difficulty levels. Gilson et al [<xref ref-type="bibr" rid="ref7">7</xref>] also reported a similar finding for GPT-3.5. Further studies are needed to explain those findings.</p>
        <p>This study also examined the proportion of correct answers based on the different topics. Both models performed significantly better on certain topics than others. This is a novel finding not reported in other studies assessing the performance of ChatGPT. It is interesting to further explore this association and why a model would perform on certain topics better than others. It could be hypothesized that questions on topics such as oculoplastic, which rely on surgical techniques and knowledge of aesthetics, may be more difficult for AI models to answer correctly than topics such as oncology and pathology, which rely more on clinical knowledge. Taloni et al [<xref ref-type="bibr" rid="ref8">8</xref>] reported a better performance of ChatGPT on clinical rather than surgical cases.</p>
        <p>The moderate accuracy of ChatGPT-3.5 has been widely replicated in various studies. Gilson et al [<xref ref-type="bibr" rid="ref7">7</xref>] found accuracies ranging between 42% and 64.4% in USMLE steps 1 and 2 examinations, numbers similar to those noted in this study [<xref ref-type="bibr" rid="ref7">7</xref>]. The paper also records a decrease in the proportion of correct answers as difficulty level increases, which has been noted in this study as well. Another study by Huh [<xref ref-type="bibr" rid="ref9">9</xref>] showed that ChatGPT’s performance was significantly lower than that of Korean medical students in a parasitology examination. A letter to the editor of the journal <italic>Resuscitation</italic> revealed that ChatGPT did not reach the passing threshold for the Life Support examination [<xref ref-type="bibr" rid="ref10">10</xref>]. The cited studies indicate the moderate capabilities of ChatGPT in answering clinically related questions. More studies are needed to show how we can best optimize ChatGPT for medical education. Mihalache et al [<xref ref-type="bibr" rid="ref11">11</xref>] assessed the performance of ChatGPT on the OKAP and found that it provided 46% correct answers, not unlike the proportion of OB-WQE questions correctly answered by GPT-3.5 in this study. All the aforementioned studies used ChatGPT-3.5 in their analysis. More recent studies have assessed the efficacy of ChatGPT-4.0. A study by Lim et al [<xref ref-type="bibr" rid="ref12">12</xref>] assessed the performance of GPT-4.0 on myopia-related questions, and the model performed with 80.6% adequate responses, compared to 61.3% for GPT-3.5. Taloni et al [<xref ref-type="bibr" rid="ref8">8</xref>] assessed the use of ChatGPT-4.0 and ChatGPT-3.5 in the American Academy of Ophthalmology’s self-assessment questions; their study found that GPT-4.0 (82.4% of correct answers) performed better than both humans (75.7% of correct answers) and GPT-3.5 (65.9% of correct answers). 
The study also assessed the performance of these models across various topics [<xref ref-type="bibr" rid="ref8">8</xref>]. Similar to our results, Taloni et al [<xref ref-type="bibr" rid="ref8">8</xref>] found that ChatGPT performed better on ocular oncology and pathology compared to topics such as strabismus and pediatric ophthalmology. To our knowledge, our study is among the first few to assess the abilities of GPT-4.0 in medical examinations across various levels of education and various board examinations.</p>
        <p>When reviewing the explanations provided by ChatGPT, it was noted that the model would randomly either explain the provided answer choice or not. It is particularly remarkable to read how it justified the wrong answer choices. More studies are needed to emphasize and assess the answer justifications of the model. Indeed, having solid explanations is essential for it to become a reliable medical education tool.</p>
        <p>Our study is unique in that it assesses the capabilities of ChatGPT in answering ophthalmology-related questions in contrast to other studies that assessed its ability to succeed in general examinations such as USMLE steps 1 and 2. Furthermore, this is the first study to assess the ability of ChatGPT to answer questions of a certain discipline across all its examination levels. Finally, this is among the first studies to compare GPT-4.0’s performance to GPT-3.5’s performance in medical examinations.</p>
        <p>ChatGPT can be a great add-on to mainstream resources to study for board examinations. There have been reports of using it to generate clinical vignettes and board examination–like questions, which can create more unique practice opportunities for students. Additionally, our study also assesses the accuracy of the 2 models on board examination questions related to ophthalmology. Students can input questions they need help with on the platform and receive an answer and an explanation. If the student is not satisfied with the answer provided, or has further questions, he or she can respond to the model and receive a more personalized answer. This is crucial as it significantly decreases the time needed to study and also creates a tailored study experience for each student’s needs.</p>
        <p>However, ChatGPT needs further optimization before it can be considered a mainstream tool for medical education. The image feature was not present in GPT-3.5 and was introduced in GPT-4.0. This feature is available only on demand and is yet to be available to all users. Its accuracy and reliability are yet to be established for examination purposes. Many questions were excluded due to them containing images, which is a considerable limitation considering the visual nature of ophthalmology. Even in the text-only questions, ChatGPT had moderate accuracy in answering questions across different difficulties and levels. This study is, however, limited by the small number of questions, particularly in the USMLE steps, due to the absence of a large number of ophthalmology questions in the resources used to prepare for these examinations. More studies are needed, which input a larger number of questions. This study also does not assess the repeatability of ChatGPT’s answers; however, a study by Antaki et al [<xref ref-type="bibr" rid="ref13">13</xref>] reported near-perfect repeatability.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>Overall, this study suggests that ChatGPT has moderate accuracy in answering questions. Its accuracy decreases as the examinations become more advanced and more clinical in nature. In its current state, ChatGPT does not seem to be the ideal medium for medical education and preparation for board examinations. Future models with more robust capabilities may soon become part of mainstream medical education. More studies that input a larger number of questions are needed to verify the results of this study and attempt to find explanations for many of the intriguing findings.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">OB</term>
          <def>
            <p>Board of Ophthalmology</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">OKAP</term>
          <def>
            <p>Ophthalmic Knowledge Assessment Program</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">USMLE</term>
          <def>
            <p>United States Medical Licensing Examination</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">WQE</term>
          <def>
            <p>Written Qualifying Examination</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>We thank AMBOSS and Thieme Publishers for granting access to the questions for use in this present study. All authors declared that they had insufficient or no funding to support open access publication of this manuscript, including from affiliated organizations or institutions, funding agencies, or other organizations. JMIR Publications provided article processing fee (APF) support for the publication of this article.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gozalo-Brizuela</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Garrido-Merchan</surname>
              <given-names>EC</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT is not all you need. A state of the art review of large generative AI models</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online January 11, 2023. </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2301.04655</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Castelvecchi</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Are ChatGPT and AlphaCode going to replace programmers?</article-title>
          <source>Nature</source>
          <year>2022</year>
          <month>12</month>
          <day>08</day>
          <pub-id pub-id-type="doi">10.1038/d41586-022-04383-z</pub-id>
          <pub-id pub-id-type="medline">36481949</pub-id>
          <pub-id pub-id-type="pii">10.1038/d41586-022-04383-z</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jeblick</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Schachtner</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Dexl</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mittermeier</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Stüber</surname>
              <given-names>Anna Theresa</given-names>
            </name>
            <name name-style="western">
              <surname>Topalis</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Weber</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Wesp</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Sabel</surname>
              <given-names>BO</given-names>
            </name>
            <name name-style="western">
              <surname>Ricke</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ingrisch</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT makes medicine easy to swallow: an exploratory case study on simplified radiology reports</article-title>
          <source>Eur Radiol</source>
          <year>2023</year>
          <month>10</month>
          <day>05</day>
          <pub-id pub-id-type="doi">10.1007/s00330-023-10213-1</pub-id>
          <pub-id pub-id-type="medline">37794249</pub-id>
          <pub-id pub-id-type="pii">10.1007/s00330-023-10213-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Azaria</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT usage and limitations</article-title>
          <source>OSF Preprints</source>
          <comment>Preprint posted online December 27, 2022</comment>
          <pub-id pub-id-type="doi">10.31219/osf.io/5ue7n</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="web">
          <article-title>Powerful learning and clinical tools combined into one platform</article-title>
          <source>AMBOSS</source>
          <access-date>2023-03-05</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.amboss.com/">https://www.amboss.com/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>BT</given-names>
            </name>
            <name name-style="western">
              <surname>Bottini</surname>
              <given-names>AR</given-names>
            </name>
          </person-group>
          <source>Graefes Arch Clin Exp Ophthalmol</source>
          <year>2021</year>
          <month>07</month>
          <day>15</day>
          <volume>259</volume>
          <issue>8</issue>
          <fpage>2457</fpage>
          <lpage>2458</lpage>
          <pub-id pub-id-type="doi">10.1007/s00417-021-05094-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gilson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Safranek</surname>
              <given-names>CW</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Socrates</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Chi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Taylor</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Chartash</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>How does ChatGPT perform on the United States Medical Licensing Examination? The implications of large language models for medical education and knowledge assessment</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>02</month>
          <day>08</day>
          <volume>9</volume>
          <fpage>e45312</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023//e45312/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/45312</pub-id>
          <pub-id pub-id-type="medline">36753318</pub-id>
          <pub-id pub-id-type="pii">v9i1e45312</pub-id>
          <pub-id pub-id-type="pmcid">PMC9947764</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Taloni</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Borselli</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Scarsi</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Rossi</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Coco</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Scorcia</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Giannaccare</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Comparative performance of humans versus GPT-4.0 and GPT-3.5 in the self-assessment program of American Academy of Ophthalmology</article-title>
          <source>Sci Rep</source>
          <year>2023</year>
          <month>10</month>
          <day>29</day>
          <volume>13</volume>
          <issue>1</issue>
          <fpage>18562</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-023-45837-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-023-45837-2</pub-id>
          <pub-id pub-id-type="medline">37899405</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-023-45837-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC10613606</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huh</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Are ChatGPT’s knowledge and interpretation ability comparable to those of medical students in Korea for taking a parasitology examination?: a descriptive study</article-title>
          <source>J Educ Eval Health Prof</source>
          <year>2023</year>
          <volume>20</volume>
          <fpage>1</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36627845"/>
          </comment>
          <pub-id pub-id-type="doi">10.3352/jeehp.2023.20.1</pub-id>
          <pub-id pub-id-type="medline">36627845</pub-id>
          <pub-id pub-id-type="pii">jeehp.2023.20.1</pub-id>
          <pub-id pub-id-type="pmcid">PMC9905868</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fijačko</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Gosak</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Štiglic</surname>
              <given-names>Gregor</given-names>
            </name>
            <name name-style="western">
              <surname>Picard</surname>
              <given-names>CT</given-names>
            </name>
            <name name-style="western">
              <surname>John Douma</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Can ChatGPT pass the life support exams without entering the American heart association course?</article-title>
          <source>Resuscitation</source>
          <year>2023</year>
          <month>04</month>
          <volume>185</volume>
          <fpage>109732</fpage>
          <pub-id pub-id-type="doi">10.1016/j.resuscitation.2023.109732</pub-id>
          <pub-id pub-id-type="medline">36775020</pub-id>
          <pub-id pub-id-type="pii">S0300-9572(23)00045-X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mihalache</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Popovic</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Muni</surname>
              <given-names>RH</given-names>
            </name>
          </person-group>
          <article-title>Performance of an artificial intelligence chatbot in ophthalmic knowledge assessment</article-title>
          <source>JAMA Ophthalmol</source>
          <year>2023</year>
          <month>06</month>
          <day>01</day>
          <volume>141</volume>
          <issue>6</issue>
          <fpage>589</fpage>
          <lpage>597</lpage>
          <pub-id pub-id-type="doi">10.1001/jamaophthalmol.2023.1144</pub-id>
          <pub-id pub-id-type="medline">37103928</pub-id>
          <pub-id pub-id-type="pii">2804364</pub-id>
          <pub-id pub-id-type="pmcid">PMC10141269</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lim</surname>
              <given-names>ZW</given-names>
            </name>
            <name name-style="western">
              <surname>Pushpanathan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Yew</surname>
              <given-names>SME</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lam</surname>
              <given-names>JSH</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>DZ</given-names>
            </name>
            <name name-style="western">
              <surname>Goh</surname>
              <given-names>JHL</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>MCJ</given-names>
            </name>
            <name name-style="western">
              <surname>Sheng</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Koh</surname>
              <given-names>VTC</given-names>
            </name>
            <name name-style="western">
              <surname>Tham</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Benchmarking large language models' performances for myopia care: a comparative analysis of ChatGPT-3.5, ChatGPT-4.0, and Google Bard</article-title>
          <source>EBioMedicine</source>
          <year>2023</year>
          <month>09</month>
          <volume>95</volume>
          <fpage>104770</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2352-3964(23)00336-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.ebiom.2023.104770</pub-id>
          <pub-id pub-id-type="medline">37625267</pub-id>
          <pub-id pub-id-type="pii">S2352-3964(23)00336-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC10470220</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Antaki</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Touma</surname>
              <given-names>Samir</given-names>
            </name>
            <name name-style="western">
              <surname>Milad</surname>
              <given-names>Daniel</given-names>
            </name>
            <name name-style="western">
              <surname>El-Khoury</surname>
              <given-names>Jonathan</given-names>
            </name>
            <name name-style="western">
              <surname>Duval</surname>
              <given-names>Renaud</given-names>
            </name>
          </person-group>
          <article-title>Evaluating the performance of ChatGPT in ophthalmology: an analysis of its successes and shortcomings</article-title>
          <source>Ophthalmol Sci</source>
          <year>2023</year>
          <month>12</month>
          <volume>3</volume>
          <issue>4</issue>
          <fpage>100324</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2666-9145(23)00056-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.xops.2023.100324</pub-id>
          <pub-id pub-id-type="medline">37334036</pub-id>
          <pub-id pub-id-type="pii">S2666-9145(23)00056-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC10272508</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
