<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JME</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id>
      <journal-title>JMIR Medical Education</journal-title>
      <issn pub-type="epub">2369-3762</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v10i1e52818</article-id>
      <article-id pub-id-type="pmid">39042876</article-id>
      <article-id pub-id-type="doi">10.2196/52818</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Appraisal of ChatGPT’s Aptitude for Medical Education: Comparative Analysis With Third-Year Medical Students in a Pulmonology Examination</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Sapci</surname>
            <given-names>A. Hasan</given-names>
          </name>
          <degrees>MD</degrees>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Mondal</surname>
            <given-names>Himel</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zhu</surname>
            <given-names>Lingxuan</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes" equal-contrib="yes">
          <name name-style="western">
            <surname>Cherif</surname>
            <given-names>Hela</given-names>
          </name>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Faculté de Médecine de Tunis</institution>
            <institution>Université de Tunis El Manar</institution>
            <addr-line>15, Rue Djebel Lakhdhar – Bab Saadoun</addr-line>
            <addr-line>Tunis, 1007</addr-line>
            <country>Tunisia</country>
            <phone>216 50424534</phone>
            <email>hela.cherif@fmt.utm.tn</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2086-918X</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Moussa</surname>
            <given-names>Chirine</given-names>
          </name>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8123-9843</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Missaoui</surname>
            <given-names>Abdel Mouhaymen</given-names>
          </name>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2486-4597</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Salouage</surname>
            <given-names>Issam</given-names>
          </name>
          <degrees>Prof Dr</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0004-8658-9852</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Mokaddem</surname>
            <given-names>Salma</given-names>
          </name>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7775-3073</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Dhahri</surname>
            <given-names>Besma</given-names>
          </name>
          <degrees>Prof Dr</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0003-0789-940X</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Faculté de Médecine de Tunis</institution>
        <institution>Université de Tunis El Manar</institution>
        <addr-line>Tunis</addr-line>
        <country>Tunisia</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Hela Cherif <email>hela.cherif@fmt.utm.tn</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>23</day>
        <month>7</month>
        <year>2024</year>
      </pub-date>
      <volume>10</volume>
      <elocation-id>e52818</elocation-id>
      <history>
        <date date-type="received">
          <day>29</day>
          <month>9</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>20</day>
          <month>1</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>5</day>
          <month>2</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>26</day>
          <month>2</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Hela Cherif, Chirine Moussa, Abdel Mouhaymen Missaoui, Issam Salouage, Salma Mokaddem, Besma Dhahri. Originally published in JMIR Medical Education (https://mededu.jmir.org), 23.07.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on https://mededu.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://mededu.jmir.org/2024/1/e52818" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>The rapid evolution of ChatGPT has generated substantial interest and led to extensive discussions in both public and academic domains, particularly in the context of medical education.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to evaluate ChatGPT’s performance in a pulmonology examination through a comparative analysis with that of third-year medical students.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>In this cross-sectional study, we conducted a comparative analysis with 2 distinct groups. The first group comprised 244 third-year medical students who had previously taken our institution’s 2020 pulmonology examination, which was conducted in French. The second group involved ChatGPT-3.5 in 2 separate sets of conversations: without contextualization (V1) and with contextualization (V2). In both V1 and V2, ChatGPT received the same set of questions administered to the students.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>V1 demonstrated exceptional proficiency in radiology, microbiology, and thoracic surgery, surpassing the majority of medical students in these domains. However, it faced challenges in pathology, pharmacology, and clinical pneumology. In contrast, V2 consistently delivered more accurate responses across various question categories, regardless of the specialization. ChatGPT exhibited suboptimal performance in multiple choice questions compared to medical students. V2 excelled in responding to structured open-ended questions. Both ChatGPT conversations, particularly V2, outperformed students in addressing questions of low and intermediate difficulty. Interestingly, students showcased enhanced proficiency when confronted with highly challenging questions. V1 fell short of passing the examination. Conversely, V2 successfully achieved examination success, outperforming 139 (62.1%) medical students.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>While ChatGPT has access to a comprehensive web-based data set, its performance closely mirrors that of an average medical student. Outcomes are influenced by question format, item complexity, and contextual nuances. The model faces challenges in medical contexts requiring information synthesis, advanced analytical aptitude, and clinical judgment, as well as in non-English language assessments and when confronted with data outside mainstream internet sources.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>medical education</kwd>
        <kwd>ChatGPT</kwd>
        <kwd>GPT</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>natural language processing</kwd>
        <kwd>NLP</kwd>
        <kwd>pulmonary medicine</kwd>
        <kwd>pulmonary</kwd>
        <kwd>lung</kwd>
        <kwd>lungs</kwd>
        <kwd>respiratory</kwd>
        <kwd>respiration</kwd>
        <kwd>pneumology</kwd>
        <kwd>comparative analysis</kwd>
        <kwd>large language models</kwd>
        <kwd>LLMs</kwd>
        <kwd>LLM</kwd>
        <kwd>language model</kwd>
        <kwd>generative AI</kwd>
        <kwd>generative artificial intelligence</kwd>
        <kwd>generative</kwd>
        <kwd>exams</kwd>
        <kwd>exam</kwd>
        <kwd>examinations</kwd>
        <kwd>examination</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Artificial intelligence (AI) has emerged as a transformative force across various aspects of modern life. Within the realm of AI, natural language processing (NLP) has gained significant attention as it involves the use of devices to replicate human cognitive processes, encompassing learning, problem-solving, and practical application [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. An exemplary NLP model is ChatGPT, developed by OpenAI. This model uses deep learning algorithms trained on extensive data sets to generate responses simulating human-like interactions. This versatile dialogic agent holds promise in diverse applications, including customer service and chatbots [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>].</p>
      <p>Launched on November 30, 2022, ChatGPT quickly gained popularity, attracting a million users within its first week and achieving unprecedented growth. In June 2023 alone, the ChatGPT website received 1.66 billion visits, underscoring its widespread appeal and use [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>].</p>
      <p>While this rapid development of ChatGPT has generated both excitement and concern across various fields, the impact on medical education has been particularly intriguing [<xref ref-type="bibr" rid="ref7">7</xref>]. This chatbot technology may present opportunities to revolutionize medical education, offering enhanced efficiency, interactivity, and realism in training scenarios [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. However, these benefits come with significant challenges and uncertainties that need to be carefully addressed and navigated [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>].</p>
      <p>A paramount examination in the medical school curriculum is the pneumology examination. This pivotal assessment evaluates the comprehensive understanding of respiratory diseases and their management—a core competency for any medical practitioner.</p>
      <p>Our study aims to evaluate the performance of ChatGPT in the context of pneumology examinations through a comparative analysis with that of third-year medical students.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Study Design and Participants</title>
        <p>This research adopts a cross-sectional design and was conducted at the pneumology teaching section of the Faculty of Medicine of Tunis (FMT), Tunisia, in June 2023. The study uses a comparative approach, involving 2 distinct groups: ChatGPT and medical students.</p>
        <p>The first group comprises 244 third-year medical students registered at the FMT. These students had previously taken the pulmonology examination in January 2020. The second group consists of ChatGPT-3.5, a freely available version of ChatGPT, which undertook the same pneumology examination in June 2023.</p>
      </sec>
      <sec>
        <title>Pneumology Examination</title>
        <sec>
          <title>Question Selection</title>
          <p>The pneumology examination of FMT of 2020 is a 90-minute test comprising 50 questions, written in French. These questions underwent validation within the pneumology section of FMT to cover a diverse range of knowledge levels, including both fundamental and advanced concepts. The examination assesses candidates’ competency in various fields of pneumology, such as clinical pneumology, microbiology, respiratory radiology, pharmacology, pathology, and thoracic surgery.</p>
          <p>The administered version of the examination involved only 45 text-based questions to align with ChatGPT’s processing capabilities. Thus, 5 questions based on visual components (images, graphs, and illustrations) were excluded since ChatGPT lacks the ability to process this material within its conversational scope.</p>
          <p>A comprehensive mapping of assessment parameters for the administered pneumology examination is presented in <xref ref-type="table" rid="table1">Table 1</xref>. It encompasses a total of 9 multiple choice questions (MCQs), 13 short open-ended questions (SOEQs), and 7 clinical scenarios. Among the clinical scenarios, 2 were structured with MCQs, while the remaining 5 were constructed with SOEQs.</p>
          <table-wrap position="float" id="table1">
            <label>Table 1</label>
            <caption>
              <p>Assessment parameters and question distribution in pneumology examination.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="30"/>
              <col width="670"/>
              <col width="300"/>
              <thead>
                <tr valign="top">
                  <td colspan="2">Mapping of pulmonology examination</td>
                  <td>Findings</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td colspan="3">
                    <bold>Parameters</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Academic year</td>
                  <td>2020</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Target examinees</td>
                  <td>Third-year medical students</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Timing</td>
                  <td>90 minutes</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Grading scale</td>
                  <td>0-100</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Questions, n</td>
                  <td>45</td>
                </tr>
                <tr valign="top">
                  <td colspan="3">
                    <bold>Question topics, n (%)</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Clinical pneumology</td>
                  <td>27 (60)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Radiology</td>
                  <td>7 (16)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Pharmacology</td>
                  <td>5 (11)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Pathology</td>
                  <td>3 (7)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Microbiology</td>
                  <td>2 (4)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Thoracic surgery</td>
                  <td>1 (2)</td>
                </tr>
                <tr valign="top">
                  <td colspan="3">
                    <bold>Question formats, n (%)</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Independent MCQs<sup>a</sup></td>
                  <td>9 (20)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Independent SOEQs<sup>b</sup></td>
                  <td>13 (29)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>MCQ-structured clinical cases</td>
                  <td>7 (16)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SOEQ-structured clinical cases</td>
                  <td>16 (35)</td>
                </tr>
                <tr valign="top">
                  <td colspan="3">
                    <bold>Distribution by difficulty index, n (%)</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Low difficulty index items</td>
                  <td>12 (27)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Intermediate difficulty index items</td>
                  <td>25 (56)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>High difficulty index items</td>
                  <td>8 (18)</td>
                </tr>
                <tr valign="top">
                  <td colspan="3">
                    <bold>Distribution by discrimination index, n (%)</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Low discrimination index items</td>
                  <td>21 (47)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Intermediate discrimination index items</td>
                  <td>13 (29)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>High discrimination index items</td>
                  <td>11 (24)</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table1fn1">
                <p><sup>a</sup>MCQ: multiple choice question.</p>
              </fn>
              <fn id="table1fn2">
                <p><sup>b</sup>SOEQ: short answer open-ended question.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
        <sec>
          <title>Item Performance Indexes</title>
          <p>Item performance indexes are crucial statistical measures used to assess the effectiveness and quality of test questions, ensuring the reliability and validity of the assessment. These indexes provide valuable insights into the performance of each item concerning difficulty level, discrimination, and its ability to differentiate between high- and low-performing students. In this study, we used common item performance indexes, including the difficulty index (D1) and the discrimination index (D2) [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>].</p>
          <p>The D1 represents the proportion of students who answered an item correctly, calculated by dividing the number of correct responses by the total number of students attempting the item. While the optimal item difficulty may vary based on the specific test format and intended learning outcomes, a value within the 0.3 to 0.7 range is generally preferred [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>].</p>
          <p>On the other hand, the D2 measures an item’s capability to differentiate between high-performing and low-performing students. It is determined by comparing the performance of students who achieved high scores on the overall test with those who scored low on the same test for a particular item. D2 levels are classified as follows: high discrimination (D2&#62;0.7), intermediate discrimination (D2 values between 0.3 and 0.7), and low discrimination (D2&#60;0.3) [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>].</p>
        </sec>
      </sec>
      <sec>
        <title>Data Collection and Score System</title>
        <p>The database, containing the results and scores of medical students who took the pneumology examination in 2020, along with corresponding performance indexes, was accessible in the pneumology section and used in our comparative analysis.</p>
        <p>Two authors (HC and CM) conducted separate conversations with ChatGPT-3.5. In the first conversation, CM presented questions to the chatbot without contextualization (V1). In the second conversation, conducted by HC, suitable context was provided before posing the questions (V2). The questions were presented in exactly the same order as given to the students. <xref rid="figure1" ref-type="fig">Figures 1</xref> and <xref rid="figure2" ref-type="fig">2</xref> show illustrations of the dual chat conversations conducted by HC and CM and the respective responses from ChatGPT.</p>
        <p>The responses generated by both V1 and V2 were meticulously transcribed and stored in separate files. To ensure objectivity and independence, an impartial pneumology teacher, not involved in this study, conducted the evaluation. This teacher used the same grading scale specifically designed for evaluating student performance in the 2020 examination, ensuring an unbiased and rigorous assessment process.</p>
        <p>Each question is assigned 1 point. For MCQs, the grading scale was as follows: an incorrect response concealed a correct answer. The assigned grades were 0, 1, or 0.5, based on the nature of the answer provided. SOEQs were assessed as follows: 1 point is awarded for a correct response, 0 points for an incorrect response, and 0.5 points for an omission. For clarity, the global scores achieved by both third-year medical students, and ChatGPT were transformed into a score out of 100 (maximum score). To successfully pass the examination, candidates needed to achieve a global score of ≥50 points.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Illustration of the first version of the conversation with ChatGPT. The question was directly posed to ChatGPT without any context. ChatGPT responded by introducing the answer, emphasizing the necessity for a health care provider to address the case, and providing 4 probable diagnoses as demanded: sarcoidosis, idiopathic pulmonary fibrosis, mediastinal lymphoma, and atypical pulmonary infection. The bubbles in the figure represent the English translation of the conversations conducted in French.</p>
          </caption>
          <graphic xlink:href="mededu_v10i1e52818_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Illustration of the second version of the conversation with ChatGPT. The chatbot was asked the same question (in a separate chat session) but preceded by a briefing about the context of the question, which pertains to a pneumology examination for Tunisian medical students. A clear instruction on how to answer was provided. In light of this context, ChatGPT altered its response, explicitly stating pulmonary tuberculosis, likely due to the endemic nature of <italic>Mycobacterium tuberculosis</italic> in Tunisia and other African countries. The bubbles in the figure represent the English translation of the conversations conducted in French.</p>
          </caption>
          <graphic xlink:href="mededu_v10i1e52818_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Data Analysis</title>
        <p>The collected data underwent statistical analysis using SPSS (version 25.0; IBM Corp). Nominal values were expressed as frequencies and percentages, while continuous variables were represented using means and SDs for normally distributed data and medians and quartiles for non-normally distributed data.</p>
        <p>A comparative analysis was conducted, evaluating student scores alongside those of V1 and V2. This analysis encompassed various factors, including question formats (MCQs, SOEQs, and clinical scenarios), topics (clinical pneumology, microbiology, respiratory radiology, pharmacology, pathology, and thoracic surgery), and item performance indexes. To accurately portray the performance levels of each ChatGPT conversation, we presented results as percentages of the maximum scale attributed to each studied item, along with the ranking of ChatGPT scores among those of third-year medical students.</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>We have obtained approval from both the Medical Education Committee and the Ethics Committee of the Faculty of Medicine of Tunis to access the data (file number CE-FMT/2024/04/FSI/V2). This approval ensures confidentiality and restricts external use.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Performance of Students in the Pneumology Examination</title>
        <p>The median overall score achieved by medical students in the pulmonology examination was 48.9 out of 100 (IQR 40.0-54.7; <xref ref-type="table" rid="table2">Table 2</xref>). Among the participants (N=244), a modest cohort of 107 students reached the necessary threshold for successful completion of the examination, resulting in an overall success rate of 43.9%.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Pneumology examination performance comparison: medical students versus ChatGPT without (V1) and with (V2) contextualization.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="120"/>
            <col width="90"/>
            <col width="130"/>
            <col width="70"/>
            <col width="0"/>
            <col width="100"/>
            <col width="140"/>
            <col width="0"/>
            <col width="70"/>
            <col width="100"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Parameters and categories</td>
                <td>Maximum category score</td>
                <td>Medical students’ performance</td>
                <td colspan="5">V1 performance</td>
                <td colspan="3">V2 performance</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Score, median (IQR)</td>
                <td colspan="2">Score</td>
                <td>Percentage score</td>
                <td>Rank among students (percentile)</td>
                <td colspan="2">Score</td>
                <td>Percentage score</td>
                <td>Rank among students (percentile)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="12">
                  <bold>Examination topics</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Pathology</td>
                <td>3</td>
                <td>2.5 (2-3)</td>
                <td>2</td>
                <td colspan="2">66.7</td>
                <td>133 (40.6)</td>
                <td colspan="2">2.5</td>
                <td>83.3</td>
                <td>84 (62.5)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Pharmacology</td>
                <td>5</td>
                <td>3.5 (2.5-4)</td>
                <td>3</td>
                <td colspan="2">60</td>
                <td>137 (38.8)</td>
                <td colspan="2">3.5</td>
                <td>70</td>
                <td>96 (57.1)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Microbiology</td>
                <td>2</td>
                <td>1.5 (1-1.5)</td>
                <td>1.5</td>
                <td colspan="2">75</td>
                <td>48 (78.6)</td>
                <td colspan="2">1.5</td>
                <td>75</td>
                <td>48 (78.6)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Radiology</td>
                <td>7</td>
                <td>3.5 (2.1-4.5)</td>
                <td>3.5</td>
                <td colspan="2">50</td>
                <td>93 (58.5)</td>
                <td colspan="2">4</td>
                <td>57.1</td>
                <td>64 (71.4)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Thoracic surgery</td>
                <td>1</td>
                <td>0 (0-0)</td>
                <td>1</td>
                <td colspan="2">100</td>
                <td>1 (99.6)</td>
                <td colspan="2">0</td>
                <td>0</td>
                <td>29 (87.1)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Clinical pneumology</td>
                <td>27</td>
                <td>11 (9-13)</td>
                <td>10</td>
                <td colspan="2">37</td>
                <td>133 (40.6)</td>
                <td colspan="2">11.5</td>
                <td>42.6</td>
                <td>97 (56.7)</td>
              </tr>
              <tr valign="top">
                <td colspan="12">
                  <bold>Question formats</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Independent MCQs<sup>a</sup></td>
                <td>9</td>
                <td>4.5 (3.5-5.5)</td>
                <td>4</td>
                <td colspan="2">44.4</td>
                <td>138 (38.4)</td>
                <td colspan="2">3</td>
                <td>33.3</td>
                <td>191 (14.7)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Independent SOEQs<sup>b</sup></td>
                <td>13</td>
                <td>5 (3.5-6)</td>
                <td>4.5</td>
                <td colspan="2">34.6</td>
                <td>120 (46.4)</td>
                <td colspan="2">6.5</td>
                <td>50</td>
                <td>30 (86.6)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>MCQ-structured clinical cases</td>
                <td>7</td>
                <td>2.8 (2-3.5)</td>
                <td>1.5</td>
                <td colspan="2">21.4</td>
                <td>181 (19.2)</td>
                <td colspan="2">2</td>
                <td>28.6</td>
                <td>149 (33.5)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>SOEQ-structured clinical cases</td>
                <td>16</td>
                <td>9.5 (7.6-11)</td>
                <td>11</td>
                <td colspan="2">68.8</td>
                <td>51 (77.2)</td>
                <td colspan="2">11.5</td>
                <td>71.9</td>
                <td>36 (83.9)</td>
              </tr>
              <tr valign="top">
                <td colspan="2">Overall examination score</td>
                <td>100</td>
                <td>48.9 (40-54.4)</td>
                <td>46.7</td>
                <td colspan="2">46.7</td>
                <td>133 (40.6)</td>
                <td colspan="2">51.1</td>
                <td>51.1</td>
                <td>85 (62.1)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>MCQ: multiple choice question.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>SOEQ: short open-ended question.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>Significant variations in performance emerged across different question categories. Notably, students (N=244) demonstrated pronounced proficiency in the domains of pathology, pharmacology, and microbiology, with scores exceeding 50% in 88.5% (n=216), 77.5% (n=189), and 74.6% (n=182), respectively. A moderate level of accomplishment was observed in the field of radiology. In contrast, the weakest performances were evident in questions related to thoracic surgery and clinical pneumology, with only 11.5% (n=28) and 22.5% (n=55) of students surpassing the 50% threshold of the maximum score in these areas.</p>
        <p>The question format also appeared to significantly influence students’ performance. Candidates (N=244) excelled in SOEQ-structured clinical cases and independent MCQs, with 68.9% (n=168) and 56.1% (n=137), respectively, achieving marks exceeding 50% of the maximum achievable. Conversely, the performance in MCQ-structured clinical cases lagged, with only 31.1% (n=76) of candidates reaching scores beyond 50% of the highest attainable mark for this question format. The most challenging performance was observed in independent SOEQs, as only 19.3% (n=47) of students achieved marks surpassing the 50% threshold of the maximum attainable for this particular question format.</p>
        <p>Based on these students’ outcomes, item performance indexes were computed. A significant proportion of questions (25/45, 56%) exhibited moderate difficulty indexes, while only 18% (8/45) of the questions demonstrated elevated levels of difficulty. Additionally, a substantial fraction of the items (21/45, 47%) showed limited discriminatory power in contrast to 24% (11/45) that displayed a pronounced D2 (<xref ref-type="table" rid="table1">Table 1</xref>).</p>
      </sec>
      <sec>
        <title>Assessment of ChatGPT-3.5 Performance in the Pneumology Examination</title>
        <p>V1 performed well, achieving scores exceeding 50% in all question categories except for clinical pneumology. A similar trend emerged with V2, even though it faced challenges in reaching scores above 50% in thoracic surgery and clinical pneumology (<xref ref-type="table" rid="table2">Table 2</xref>).</p>
        <p>The question format significantly impacted ChatGPT’s performance. In cases where questions lacked contextualization, V1 fell short of reaching the 50% mark for the maximum score in all question formats, except for SOEQ-structured clinical cases. Similarly, in the responses generated by V2, even when provided with appropriate context, limitations were evident in both independent MCQs and MCQs integrated into clinical cases. Interestingly, V2 demonstrated a higher level of accuracy in SOEQ-structured clinical cases. Both conversations displayed improved performance in questions with higher D1 and D2 (<xref ref-type="table" rid="table3">Table 3</xref>).</p>
        <p>Considering the overall examination scores, V1 did not meet the passing threshold, achieving a total score of 46.7 out of 100. In contrast, V2 secured a global score of 51.1 out of 100, narrowly achieving success in this examination.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Achievement quotient of ChatGPT without (V1) and with (V2) contextualization in the pneumology examination by difficulty and discrimination indexes.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="480"/>
            <col width="270"/>
            <col width="250"/>
            <thead>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>V1 (%)</td>
                <td>V2 (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Low difficulty index items</td>
                <td>20.8</td>
                <td>16.7</td>
              </tr>
              <tr valign="top">
                <td>Intermediate difficulty index items</td>
                <td>54</td>
                <td>62</td>
              </tr>
              <tr valign="top">
                <td>High difficulty index items</td>
                <td>62.5</td>
                <td>68</td>
              </tr>
              <tr valign="top">
                <td>Low discrimination index items</td>
                <td>31</td>
                <td>38.1</td>
              </tr>
              <tr valign="top">
                <td>Intermediate discrimination index items</td>
                <td>61.5</td>
                <td>61.5</td>
              </tr>
              <tr valign="top">
                <td>High discrimination index items</td>
                <td>59.1</td>
                <td>63.6</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Comparative Analysis of ChatGPT Performance and Medical Students’ Performance</title>
        <sec>
          <title>Question Topic</title>
          <p>Comparing the performance of ChatGPT with that of medical students, distinct patterns emerge. V1 demonstrated heightened proficiency in specialized pneumology fields, especially radiology, microbiology, and thoracic surgery. Notably, V1 outperformed 131 (58.5%), 176 (78.6%), and 223 (99.6%) medical students in these respective domains. ChatGPT faced challenges in this conversation when addressing questions related to pathology, pharmacology, and clinical pneumology, achieving lower scores than most medical students. In contrast, V2 consistently provided more accurate responses than the majority of medical students across various question categories, regardless of their specialized fields. Noteworthy excellence was observed, particularly in microbiology and thoracic surgery.</p>
        </sec>
        <sec>
          <title>Question Format</title>
          <p>V1 demonstrated strong proficiency in SOEQ-structured clinical cases, surpassing the performance of 173 (77.2%) medical students. However, its performance weakened in independent MCQs and SOEQs, and it performed less optimally in MCQ-structured clinical cases compared to third-year medical students. In the case of V2, commendable performance was observed in responding to both independent and structured SOEQs within clinical cases. Yet, a notable deficiency emerged in accurately answering all formats of MCQs, ranking only above 33 (14.7%) and 75 (33.5%) students in independent MCQs and MCQ-structured clinical cases, respectively.</p>
        </sec>
        <sec>
          <title>Item Performance Indexes</title>
          <p>Both conversations with ChatGPT, particularly V2, performed better than students in handling questions of low and intermediate difficulty. Remarkably, students demonstrated stronger proficiency when tackling highly difficult questions. Regarding the D2, V1 showed similar performance to participants in accurately addressing questions with low and high D2 index values. Additionally, V1 slightly exceeded participants’ performance in questions with an intermediate D2 index. V2 consistently outperformed medical students across all question discrimination categories (<xref rid="figure3" ref-type="fig">Figure 3</xref>).</p>
          <p>In summary, V1 did not pass the examination, but its score surpassed that of 91 (40.6%) students. In contrast, V2 successfully passed the examination, outperforming 139 (62.1%) medical students.</p>
          <fig id="figure3" position="float">
            <label>Figure 3</label>
            <caption>
              <p>Percentile rank of ChatGPT without (ChatGPT-V1) and with (ChatGPT-V2) contextualization among medical students in the pneumology examination based on difficulty and discrimination indexes. Percentages represent the percentile rank of ChatGPT-V1 and ChatGPT-V2 among medical students.</p>
            </caption>
            <graphic xlink:href="mededu_v10i1e52818_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>The cognitive capabilities and knowledge processing of ChatGPT have generated significant discussions in both public and academic circles. This NLP tool has gained attention for its prompt and coherent responses across various subjects, showcasing an impressive capacity to generate essays and offer explanations. However, there is a lack of comprehensive investigations into ChatGPT’s performance in medical education and examinations. To address this, this study evaluates ChatGPT using a previously collected data set of pneumology examinations from FMT, enabling direct comparisons between ChatGPT’s performance and that of third-year medical students.</p>
        <p>Our findings highlight ChatGPT’s proficiency in handling diverse biomedical information and clinical data. Powered by a vast corpus of internet text data, ChatGPT demonstrates remarkable expertise in pneumology, particularly excelling in radiology and microbiology. It outperformed a significant proportion of medical students in these paraclinical specialties. Comparable high performance of AI-powered tools in paraclinical sciences has been documented previously. Rodriguez-Ruiz et al [<xref ref-type="bibr" rid="ref16">16</xref>], using data from 9 diverse data sets (2652 examinations), including 653 malignancies, found that their AI system exhibited cancer detection accuracy on par with the average breast radiologist, surpassing the performance of 61.4% of the radiologists in their retrospective analysis.</p>
        <p>Das et al [<xref ref-type="bibr" rid="ref17">17</xref>] assessed ChatGPT’s accuracy in addressing a test based on the competency-based medical education (CBME) curriculum for microbiology. ChatGPT showcased the ability to answer both first- and second-order knowledge questions related to microbiology. The model exhibited significant potential as an automated question-answering tool in the field of microbiology, achieving an accuracy rate of approximately 80%. In another investigation, ChatGPT demonstrated proficiency in medical biochemistry, another paraclinical specialty. It successfully responded to 200 random medical biochemistry reasoning questions from the CBME curriculum’s competency modules [<xref ref-type="bibr" rid="ref8">8</xref>].</p>
        <p>In fields like clinical pneumology that demand careful processing of medical data, ChatGPT shows some limitations when compared to medical students. However, these shortcomings can be improved through adequate contextualization, as seen in the enhanced proficiency of V2. Our findings about clinical pneumology align with previous studies that highlight ChatGPT’s challenges in similar medical disciplines requiring advanced judgment and nuanced clinical reasoning, such as neurology and traumatology. For instance, ChatGPT 3.5 achieved an overall accuracy rate of 57%, just below the 58% passing threshold set for the 2022 UK Specialty Certificate Neurology Examination [<xref ref-type="bibr" rid="ref18">18</xref>].</p>
        <p>Moreover, ChatGPT scored 35.8%, which is notably lower than the pass rate for the Fellowship of the Royal College of Surgeons examination in trauma surgery by 30%. This performance was also 8.2% below the average score of participants at all training levels [<xref ref-type="bibr" rid="ref19">19</xref>]. In a study conducted in India, ChatGPT demonstrated a limited ability to translate basic pharmacology knowledge into clear clinical concepts. It exhibited inconsistency in predicting and explaining common drug interactions [<xref ref-type="bibr" rid="ref20">20</xref>]. This observation aligns with ChatGPT’s modest accuracy in questions related to pharmacology applied to pneumology in our FMT examination.</p>
        <p>The way questions are presented greatly affects how well both medical students and AI tools like ChatGPT perform [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. ChatGPT struggled to match the performance of medical students in all question styles, except for SOEQs integrated into clinical scenarios. Even after contextualization, ChatGPT still had a hard time answering MCQs in pulmonology compared to medical students. Zhu et al [<xref ref-type="bibr" rid="ref23">23</xref>] addressed this concern, suggesting that ChatGPT may be more suitable for responding to open-ended questions than for being presented with a predefined set of options. Considering ChatGPT’s occasional inconsistency in providing identical responses for the same question, the authors recommended posing the question 3 times to ensure response stability.</p>
        <p>Other research generally shows good performance by ChatGPT when handling MCQs. For example, a 2023 study by Duong and Solomon [<xref ref-type="bibr" rid="ref24">24</xref>] revealed ChatGPT’s comparable performance to human beings in responding to MCQs on human genetics. ChatGPT also successfully passed the 2022 Italian Residency Admission National Exam, which consists solely of MCQs. Additionally, in the 2022 European Examination in Core Cardiology, ChatGPT answered over 60% of questions correctly, displaying consistency across various MCQs [<xref ref-type="bibr" rid="ref25">25</xref>]. In this study, the discrepancy in ChatGPT’s performance across question formats may be attributed to the high difficulty level of these questions, even for third-year medical students.</p>
        <p>ChatGPT clearly outperformed medical students in tasks that required detailed responses, particularly SOEQs integrated into clinical scenarios. This was supported by Qu et al [<xref ref-type="bibr" rid="ref26">26</xref>], who also emphasized the impressive capability of this NLP software in handling otorhinolaryngology clinical scenarios [<xref ref-type="bibr" rid="ref26">26</xref>]. Indeed, ChatGPT consistently provided accurate differential diagnoses and well-justified treatment strategies for recognized clinical conditions. It used specialized medical terminology and carefully curated relevant medical history, physical examination, radiological, and laboratory findings. This proficiency can be explained by the similarity between the scenarios in our pneumology examination and the writing style commonly found in textbooks, scientific literature, and other data sources used to train the AI model.</p>
        <p>Unlike third-year medical students, ChatGPT surprisingly exhibited limited performance on questions with a high difficulty index. These questions necessitate skills in navigating intricate concepts, synthesizing information, and using strategic analytical abilities. Bhayana et al [<xref ref-type="bibr" rid="ref27">27</xref>] subjected this chatbot to the Canadian Royal College and American Board of Radiology examinations and their conclusions match our findings. Although ChatGPT successfully passed these examinations, it faced difficulties with questions demanding higher order thinking, such as describing radiological findings, classification, and application of concepts [<xref ref-type="bibr" rid="ref27">27</xref>]. While certain questions can help tell the difference between students with different levels of ability or knowledge, this D2 might not apply directly to AI-powered models like ChatGPT. A noteworthy observation is ChatGPT’s enhanced performance when provided with adequate context, outperforming students irrespective of the theoretical item discrimination.</p>
        <p>Ultimately, the findings reveal unexpected limits in ChatGPT’s performance during our pneumology examination. It barely passed in the part with contextualized chats, giving an overall modest score of 51.1%. This is different from past research where ChatGPT consistently demonstrates strong performance in English-language medical assessments like the United States Medical Licensing Examination, CBME evaluations, and the European Examination in Core Cardiology [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref28">28</xref>]. It appears that its effectiveness diminishes when dealing with evaluations from non-Western institutions and non-English language examinations like our Tunisian examination, written in French. Similarly, this AI chatbot faced challenges in both the Taiwanese pharmacist licensing and Taiwanese family medicine board examinations [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. It also scored below the level of students in a Korean parasitology examination, the Japanese National Medical Licensing Examination, and the Chinese National Medical Licensing Examination [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>]. This discrepancy likely arises from ChatGPT’s limited ability to grasp linguistic nuances in non-English texts, exacerbated by the prevalence of Western-centric internet data. In certain contexts, these data may not fully apply to African and Asian populations, which exhibit slight variations in clinical presentations and disease epidemiology.</p>
      </sec>
      <sec>
        <title>Strengths and Limitations of the Study</title>
        <p>Our research constitutes the initial exploration of ChatGPT’s capabilities in French-language medical examinations, providing a valuable addition to the expanding body of research in medical AI assessment. A notable strength of this study lies in its comparative approach, effectively evaluating ChatGPT’s performance alongside that of medical students in a comprehensive pneumology examination. This examination covers various question formats and topics, offering a realistic assessment of the AI’s competencies.</p>
        <p>However, the study acknowledges several limitations. Conducted at a single institution with a highly homogeneous population concerning demographics, educational background, and medical curricula, there may be a potential selection bias that affects the external validity of the findings, particularly when extrapolating to more diverse student groups, even from other French-speaking medical universities. Additionally, focusing solely on the pneumology field may limit the generalizability of the findings to a broader academic context.</p>
        <p>ChatGPT’s inability to process visual elements also introduces an inherent selection bias concerning the administered questions, hindering a comprehensive evaluation of its proficiency in clinical scenarios where visual cues, radiology data, and histological images are significant. It is crucial to recognize that the specific findings related to ChatGPT-3.5 may not necessarily extend to other iterations of ChatGPT or alternative AI models. Furthermore, the absence of cultural adaptation and the scarcity of relevant data for non-Western contexts impeded a thorough exploration of ChatGPT’s capabilities, potentially introducing a cultural bias.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In summary, despite its access to a comprehensive web-based data set and quick response generation, ChatGPT performs similarly to an average medical student, with outcomes influenced by question format, item complexity, and contextual factors. Notably, ChatGPT struggles in specific medical contexts requiring information synthesis, advanced analytical skills, and nuanced clinical judgment. Its efficiency also diminishes in non–English language assessments and when confronted with data outside dominant internet sources. These findings suggest the need for further exploration and improvement in the application of AI tools like ChatGPT in medical education, training, and evaluation. It also emphasizes the importance of enhancing its performance across cultural and linguistic contexts.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">CBME</term>
          <def>
            <p>competency-based medical education</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">D1</term>
          <def>
            <p>difficulty index</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">D2</term>
          <def>
            <p>discrimination index</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">FMT</term>
          <def>
            <p>Faculty of Medicine of Tunis</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">MCQ</term>
          <def>
            <p>multiple choice question</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">SOEQ</term>
          <def>
            <p>short open-ended question</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>We extend our deep appreciation to Dr Toujani Sonia for her invaluable assistance in impartially and objectively evaluating the responses generated by V1 and V2, thereby ensuring an unbiased and rigorous assessment process. All authors declared that they had insufficient or no funding to support open access publication of this manuscript, including from affiliated organizations or institutions, funding agencies, or other organizations. JMIR Publications provided article processing fee (APF) support for the publication of this article.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The data sets generated and analyzed during this study are available from the corresponding author on reasonable request.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
      <fn fn-type="con">
        <p>HC and CM conceived the study and designed the methodology. HC and AMM conducted the literature review. HC and CM engaged in conversations with ChatGPT. HC and CM performed data collection and statistical analysis. HC, CM, and AMM jointly drafted the manuscript. SM and BD supervised the research progression. All authors approved the final version of the manuscript.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nadkarni</surname>
              <given-names>PM</given-names>
            </name>
            <name name-style="western">
              <surname>Ohno-Machado</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>WW</given-names>
            </name>
          </person-group>
          <article-title>Natural language processing: an introduction</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2011</year>
          <volume>18</volume>
          <issue>5</issue>
          <fpage>544</fpage>
          <lpage>551</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/21846786"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/amiajnl-2011-000464</pub-id>
          <pub-id pub-id-type="medline">21846786</pub-id>
          <pub-id pub-id-type="pii">amiajnl-2011-000464</pub-id>
          <pub-id pub-id-type="pmcid">PMC3168328</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hirschberg</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>CD</given-names>
            </name>
          </person-group>
          <article-title>Advances in natural language processing</article-title>
          <source>Science</source>
          <year>2015</year>
          <month>07</month>
          <day>17</day>
          <volume>349</volume>
          <issue>6245</issue>
          <fpage>261</fpage>
          <lpage>266</lpage>
          <pub-id pub-id-type="doi">10.1126/science.aaa8685</pub-id>
          <pub-id pub-id-type="medline">26185244</pub-id>
          <pub-id pub-id-type="pii">349/6245/261</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Deng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>The benefits and challenges of ChatGPT: an overview</article-title>
          <source>Front Comput Intell Syst</source>
          <year>2023</year>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>81</fpage>
          <lpage>83</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://drpress.org/ojs/index.php/fcis/article/view/4465"/>
          </comment>
          <pub-id pub-id-type="doi">10.54097/fcis.v2i2.4465</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="web">
          <article-title>Introducing ChatGPT</article-title>
          <source>OpenAi</source>
          <access-date>2023-08-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openai.com/blog/chatgpt">https://openai.com/blog/chatgpt</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Maheshwari</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Top AI statistics and trends</article-title>
          <source>Forbes Advisor INDIA</source>
          <year>2023</year>
          <access-date>2023-08-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.forbes.com/advisor/in/business/ai-statistics/">https://www.forbes.com/advisor/in/business/ai-statistics/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT sets record for fastest-growing user base—analyst note</article-title>
          <source>Reuters</source>
          <year>2023</year>
          <access-date>2023-08-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.reuters.com/technology/chatgpt-sets-record-fastest-growing-user-base-analyst-note-2023-02-01/">https://www.reuters.com/technology/chatgpt-sets-record-fastest-growing-user-base-analyst-note-2023-02-01/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Biswas</surname>
              <given-names>SS</given-names>
            </name>
          </person-group>
          <article-title>Role of chat GPT in public health</article-title>
          <source>Ann Biomed Eng</source>
          <year>2023</year>
          <volume>51</volume>
          <issue>5</issue>
          <fpage>868</fpage>
          <lpage>869</lpage>
          <pub-id pub-id-type="doi">10.1007/s10439-023-03172-7</pub-id>
          <pub-id pub-id-type="medline">36920578</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10439-023-03172-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ghosh</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bir</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Evaluating ChatGPT's ability to solve higher-order questions on the competency-based medical education curriculum in medical biochemistry</article-title>
          <source>Cureus</source>
          <year>2023</year>
          <month>04</month>
          <volume>15</volume>
          <issue>4</issue>
          <fpage>e37023</fpage>
          <lpage>e37066</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37143631"/>
          </comment>
          <pub-id pub-id-type="doi">10.7759/cureus.37023</pub-id>
          <pub-id pub-id-type="pmcid">PMC10152308</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gilson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Safranek</surname>
              <given-names>CW</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Socrates</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Chi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Taylor</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Chartash</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>How does ChatGPT perform on the United States Medical Licensing Examination (USMLE)? The implications of large language models for medical education and knowledge assessment</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>02</month>
          <day>08</day>
          <volume>9</volume>
          <fpage>e45312</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e45312/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/45312</pub-id>
          <pub-id pub-id-type="medline">36753318</pub-id>
          <pub-id pub-id-type="pii">v9i1e45312</pub-id>
          <pub-id pub-id-type="pmcid">PMC9947764</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sallam</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT utility in healthcare education, research, and practice: systematic review on the promising perspectives and valid concerns</article-title>
          <source>Healthcare (Basel)</source>
          <year>2023</year>
          <month>03</month>
          <day>19</day>
          <volume>11</volume>
          <issue>6</issue>
          <fpage>887</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=healthcare11060887"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/healthcare11060887</pub-id>
          <pub-id pub-id-type="medline">36981544</pub-id>
          <pub-id pub-id-type="pii">healthcare11060887</pub-id>
          <pub-id pub-id-type="pmcid">PMC10048148</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dave</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Athaluri</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT in medicine: an overview of its applications, advantages, limitations, future prospects, and ethical considerations</article-title>
          <source>Front Artif Intell</source>
          <year>2023</year>
          <volume>6</volume>
          <fpage>1169595</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37215063"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/frai.2023.1169595</pub-id>
          <pub-id pub-id-type="medline">37215063</pub-id>
          <pub-id pub-id-type="pmcid">PMC10192861</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Downing</surname>
              <given-names>SM</given-names>
            </name>
          </person-group>
          <article-title>Item response theory: applications of modern test theory in medical education</article-title>
          <source>Med Educ</source>
          <year>2003</year>
          <month>08</month>
          <volume>37</volume>
          <issue>8</issue>
          <fpage>739</fpage>
          <lpage>745</lpage>
          <pub-id pub-id-type="doi">10.1046/j.1365-2923.2003.01587.x</pub-id>
          <pub-id pub-id-type="medline">12945568</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thomas</surname>
              <given-names>ML</given-names>
            </name>
          </person-group>
          <article-title>The value of item response theory in clinical assessment: a review</article-title>
          <source>Assessment</source>
          <year>2011</year>
          <month>09</month>
          <volume>18</volume>
          <issue>3</issue>
          <fpage>291</fpage>
          <lpage>307</lpage>
          <pub-id pub-id-type="doi">10.1177/1073191110374797</pub-id>
          <pub-id pub-id-type="medline">20644081</pub-id>
          <pub-id pub-id-type="pii">1073191110374797</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Engelhardt</surname>
              <given-names>PV</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Henderson</surname>
              <given-names>CR</given-names>
            </name>
            <name name-style="western">
              <surname>Harper</surname>
              <given-names>KA</given-names>
            </name>
          </person-group>
          <article-title>An introduction to classical test theory as applied to conceptual multiple-choice tests</article-title>
          <source>Getting Started in PER</source>
          <year>2009</year>
          <month>04</month>
          <publisher-loc>College Park, MD</publisher-loc>
          <publisher-name>American Association of Physics Teachers</publisher-name>
          <fpage>1</fpage>
          <lpage>40</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>De Champlain</surname>
              <given-names>AF</given-names>
            </name>
          </person-group>
          <article-title>A primer on classical test theory and item response theory for assessments in medical education</article-title>
          <source>Med Educ</source>
          <year>2010</year>
          <month>01</month>
          <volume>44</volume>
          <issue>1</issue>
          <fpage>109</fpage>
          <lpage>117</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://asmepublications.onlinelibrary.wiley.com/doi/10.1111/j.1365-2923.2009.03425.x"/>
          </comment>
          <pub-id pub-id-type="doi">10.1111/j.1365-2923.2009.03425.x</pub-id>
          <pub-id pub-id-type="medline">20078762</pub-id>
          <pub-id pub-id-type="pii">MED3425</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rodriguez-Ruiz</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lång</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gubern-Merida</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Broeders</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gennaro</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Clauser</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Helbich</surname>
              <given-names>TH</given-names>
            </name>
            <name name-style="western">
              <surname>Chevalier</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Mertelmeier</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Wallis</surname>
              <given-names>MG</given-names>
            </name>
            <name name-style="western">
              <surname>Andersson</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Zackrisson</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mann</surname>
              <given-names>RM</given-names>
            </name>
            <name name-style="western">
              <surname>Sechopoulos</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Stand-alone artificial intelligence for breast cancer detection in mammography: comparison with 101 radiologists</article-title>
          <source>J Natl Cancer Inst</source>
          <year>2019</year>
          <month>09</month>
          <day>01</day>
          <volume>111</volume>
          <issue>9</issue>
          <fpage>916</fpage>
          <lpage>922</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/30834436"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jnci/djy222</pub-id>
          <pub-id pub-id-type="medline">30834436</pub-id>
          <pub-id pub-id-type="pii">5307077</pub-id>
          <pub-id pub-id-type="pmcid">PMC6748773</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Das</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kumar</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Longjam</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Sinha</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Deb Roy</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mondal</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Assessing the capability of ChatGPT in answering first- and second-order knowledge questions on microbiology as per competency-based medical education curriculum</article-title>
          <source>Cureus</source>
          <year>2023</year>
          <month>03</month>
          <volume>15</volume>
          <issue>3</issue>
          <fpage>e36034</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37056538"/>
          </comment>
          <pub-id pub-id-type="doi">10.7759/cureus.36034</pub-id>
          <pub-id pub-id-type="medline">37056538</pub-id>
          <pub-id pub-id-type="pmcid">PMC10086829</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Giannos</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Evaluating the limits of AI in medical specialisation: ChatGPT's performance on the UK Neurology Specialty Certificate Examination</article-title>
          <source>BMJ Neurol Open</source>
          <year>2023</year>
          <volume>5</volume>
          <issue>1</issue>
          <fpage>e000451</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37337531"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmjno-2023-000451</pub-id>
          <pub-id pub-id-type="medline">37337531</pub-id>
          <pub-id pub-id-type="pii">bmjno-2023-000451</pub-id>
          <pub-id pub-id-type="pmcid">PMC10277081</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cuthbert</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Simpson</surname>
              <given-names>AI</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence in orthopaedics: can Chat Generative Pre-trained Transformer (ChatGPT) pass Section 1 of the Fellowship of the Royal College of Surgeons (Trauma and Orthopaedics) examination?</article-title>
          <source>Postgrad Med J</source>
          <year>2023</year>
          <month>09</month>
          <day>21</day>
          <volume>99</volume>
          <issue>1176</issue>
          <fpage>1110</fpage>
          <lpage>1114</lpage>
          <pub-id pub-id-type="doi">10.1093/postmj/qgad053</pub-id>
          <pub-id pub-id-type="medline">37410674</pub-id>
          <pub-id pub-id-type="pii">7220358</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Juhi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Pipil</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Santra</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mondal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Behera</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mondal</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>The capability of ChatGPT in predicting and explaining common drug-drug interactions</article-title>
          <source>Cureus</source>
          <year>2023</year>
          <month>03</month>
          <volume>15</volume>
          <issue>3</issue>
          <fpage>e36272</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37073184"/>
          </comment>
          <pub-id pub-id-type="doi">10.7759/cureus.36272</pub-id>
          <pub-id pub-id-type="medline">37073184</pub-id>
          <pub-id pub-id-type="pmcid">PMC10105894</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Medina</surname>
              <given-names>MS</given-names>
            </name>
          </person-group>
          <article-title>Relationship between case question prompt format and the quality of responses</article-title>
          <source>Am J Pharm Educ</source>
          <year>2010</year>
          <month>03</month>
          <day>10</day>
          <volume>74</volume>
          <issue>2</issue>
          <fpage>29</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/20414442"/>
          </comment>
          <pub-id pub-id-type="doi">10.5688/aj740229</pub-id>
          <pub-id pub-id-type="medline">20414442</pub-id>
          <pub-id pub-id-type="pii">S0002-9459(23)03534-9</pub-id>
          <pub-id pub-id-type="pmcid">PMC2856418</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hift</surname>
              <given-names>RJ</given-names>
            </name>
          </person-group>
          <article-title>Should essays and other "open-ended"-type questions retain a place in written summative assessment in clinical medicine?</article-title>
          <source>BMC Med Educ</source>
          <year>2014</year>
          <month>11</month>
          <day>28</day>
          <volume>14</volume>
          <fpage>249</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmededuc.biomedcentral.com/articles/10.1186/s12909-014-0249-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12909-014-0249-2</pub-id>
          <pub-id pub-id-type="medline">25431359</pub-id>
          <pub-id pub-id-type="pii">s12909-014-0249-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC4275935</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Mou</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT can pass the AHA exams: Open-ended questions outperform multiple-choice format</article-title>
          <source>Resuscitation</source>
          <year>2023</year>
          <month>07</month>
          <volume>188</volume>
          <fpage>109783</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.resuscitationjournal.com/article/S0300-9572(23)00096-5/fulltext"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.resuscitation.2023.109783</pub-id>
          <pub-id pub-id-type="medline">37349064</pub-id>
          <pub-id pub-id-type="pii">S0300-9572(23)00096-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Duong</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Solomon</surname>
              <given-names>BD</given-names>
            </name>
          </person-group>
          <article-title>Analysis of large-language model versus human performance for genetics questions</article-title>
          <source>Eur J Hum Genet</source>
          <year>2023</year>
          <month>05</month>
          <day>29</day>
          <fpage>466</fpage>
          <lpage>468</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.nature.com/articles/s41431-023-01396-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41431-023-01396-8</pub-id>
          <pub-id pub-id-type="medline">37246194</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41431-023-01396-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Skalidis</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Cagnina</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Luangphiphat</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Mahendiran</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Muller</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Abbe</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Fournier</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT takes on the European Exam in Core Cardiology: an artificial intelligence success story?</article-title>
          <source>Eur Heart J Digit Health</source>
          <year>2023</year>
          <month>05</month>
          <volume>4</volume>
          <issue>3</issue>
          <fpage>279</fpage>
          <lpage>281</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37265864"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/ehjdh/ztad029</pub-id>
          <pub-id pub-id-type="medline">37265864</pub-id>
          <pub-id pub-id-type="pii">ztad029</pub-id>
          <pub-id pub-id-type="pmcid">PMC10232281</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Qu</surname>
              <given-names>RW</given-names>
            </name>
            <name name-style="western">
              <surname>Qureshi</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Petersen</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>SC</given-names>
            </name>
          </person-group>
          <article-title>Diagnostic and management applications of ChatGPT in structured otolaryngology clinical scenarios</article-title>
          <source>OTO Open</source>
          <year>2023</year>
          <volume>7</volume>
          <issue>3</issue>
          <fpage>e67</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.sagepub.com/doi/10.1002/oto2.67?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub%20%200pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1002/oto2.67</pub-id>
          <pub-id pub-id-type="medline">37614494</pub-id>
          <pub-id pub-id-type="pii">OTO267</pub-id>
          <pub-id pub-id-type="pmcid">PMC10442607</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bhayana</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Krishna</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Bleakney</surname>
              <given-names>RR</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on a radiology board-style examination: insights into current strengths and limitations</article-title>
          <source>Radiology</source>
          <year>2023</year>
          <month>06</month>
          <volume>307</volume>
          <issue>5</issue>
          <fpage>e230582</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.230582</pub-id>
          <pub-id pub-id-type="medline">37191485</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kung</surname>
              <given-names>TH</given-names>
            </name>
            <name name-style="western">
              <surname>Cheatham</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Medenilla</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sillos</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>De Leon</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Elepaño</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Madriaga</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aggabao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Diaz-Candido</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Maningo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tseng</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title>
          <source>PLOS Digit Health</source>
          <year>2023</year>
          <month>02</month>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>e0000198</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36812645"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id>
          <pub-id pub-id-type="medline">36812645</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-22-00371</pub-id>
          <pub-id pub-id-type="pmcid">PMC9931230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>YM</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>HW</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>TJ</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on the pharmacist licensing examination in Taiwan</article-title>
          <source>J Chin Med Assoc</source>
          <year>2023</year>
          <month>07</month>
          <day>01</day>
          <volume>86</volume>
          <issue>7</issue>
          <fpage>653</fpage>
          <lpage>658</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.lww.com/jcma/fulltext/2023/07000/performance_of_chatgpt_on_the_pharmacist_licensing.7.aspx"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/JCMA.0000000000000942</pub-id>
          <pub-id pub-id-type="medline">37227901</pub-id>
          <pub-id pub-id-type="pii">02118582-202307000-00007</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>TL</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>YM</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Hwang</surname>
              <given-names>SJ</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT failed Taiwan's Family Medicine Board Exam</article-title>
          <source>J Chin Med Assoc</source>
          <year>2023</year>
          <month>08</month>
          <day>01</day>
          <volume>86</volume>
          <issue>8</issue>
          <fpage>762</fpage>
          <lpage>766</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.lww.com/jcma/fulltext/2023/08000/chatgpt_failed_taiwan_s_family_medicine_board_exam.9.aspx"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/JCMA.0000000000000946</pub-id>
          <pub-id pub-id-type="medline">37294147</pub-id>
          <pub-id pub-id-type="pii">02118582-990000000-00224</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Gong</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Jia</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Fan</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT performs on the Chinese National Medical Licensing Examination</article-title>
          <source>J Med Syst</source>
          <year>2023</year>
          <month>08</month>
          <day>15</day>
          <volume>47</volume>
          <issue>1</issue>
          <fpage>86</fpage>
          <pub-id pub-id-type="doi">10.1007/s10916-023-01961-0</pub-id>
          <pub-id pub-id-type="medline">37581690</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10916-023-01961-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Takagi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Watari</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Erabi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sakaguchi</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Performance of GPT-3.5 and GPT-4 on the Japanese Medical Licensing Examination: comparison study</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>06</month>
          <day>29</day>
          <volume>9</volume>
          <fpage>e48002</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e48002/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48002</pub-id>
          <pub-id pub-id-type="medline">37384388</pub-id>
          <pub-id pub-id-type="pii">v9i1e48002</pub-id>
          <pub-id pub-id-type="pmcid">PMC10365615</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
