<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JME</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id>
      <journal-title>JMIR Medical Education</journal-title>
      <issn pub-type="epub">2369-3762</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v10i1e50965</article-id>
      <article-id pub-id-type="pmid">38329802</article-id>
      <article-id pub-id-type="doi">10.2196/50965</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Comparison of the Performance of GPT-3.5 and GPT-4 With That of Medical Students on the Written German Medical Licensing Examination: Observational Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Venkatesh</surname>
            <given-names>Kaushik</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Thirunavukarasu</surname>
            <given-names>Arun</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Alshawaf</surname>
            <given-names>Hamza</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Brown</surname>
            <given-names>Martin</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Li</surname>
            <given-names>XiaoYang</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Albalawi</surname>
            <given-names>Ibrahim</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Meyer</surname>
            <given-names>Annika</given-names>
          </name>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Institute for Clinical Chemistry</institution>
            <institution>University Hospital Cologne</institution>
            <addr-line>Kerpener Str 62</addr-line>
            <addr-line>Cologne, 50937</addr-line>
            <country>Germany</country>
            <email>annika.meyer1@uk-koeln.de</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8411-8799</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Riese</surname>
            <given-names>Janik</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0701-060X</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Streichert</surname>
            <given-names>Thomas</given-names>
          </name>
          <degrees>Prof Dr</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6588-720X</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Institute for Clinical Chemistry</institution>
        <institution>University Hospital Cologne</institution>
        <addr-line>Cologne</addr-line>
        <country>Germany</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of General Surgery, Visceral, Thoracic and Vascular Surgery</institution>
        <institution>University Hospital Greifswald</institution>
        <addr-line>Greifswald</addr-line>
        <country>Germany</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Annika Meyer <email>annika.meyer1@uk-koeln.de</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>8</day>
        <month>2</month>
        <year>2024</year>
      </pub-date>
      <volume>10</volume>
      <elocation-id>e50965</elocation-id>
      <history>
        <date date-type="received">
          <day>18</day>
          <month>7</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>8</day>
          <month>11</month>
          <year>2023</year>
        </date>
        <date date-type="rev-recd">
          <day>14</day>
          <month>11</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>11</day>
          <month>12</month>
          <year>2023</year>
        </date>
      </history>
      <copyright-statement>©Annika Meyer, Janik Riese, Thomas Streichert. Originally published in JMIR Medical Education (https://mededu.jmir.org), 08.02.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on https://mededu.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://mededu.jmir.org/2024/1/e50965" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>The potential of artificial intelligence (AI)–based large language models, such as ChatGPT, has gained significant attention in the medical field. This enthusiasm is driven not only by recent breakthroughs and improved accessibility, but also by the prospect of democratizing medical knowledge and promoting equitable health care. However, the performance of ChatGPT is substantially influenced by the input language, and given the growing public trust in this AI tool compared to that in traditional sources of information, investigating its medical accuracy across different languages is of particular importance.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to compare the performance of GPT-3.5 and GPT-4 with that of medical students on the written German medical licensing examination.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>To assess GPT-3.5’s and GPT-4’s medical proficiency, we used 937 original multiple-choice questions from 3 written German medical licensing examinations in October 2021, April 2022, and October 2022.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>GPT-4 achieved an average score of 85% and ranked in the 92.8th, 99.5th, and 92.6th percentiles among medical students who took the same examinations in October 2021, April 2022, and October 2022, respectively. This represents a substantial improvement of 27% compared to GPT-3.5, which only passed 1 out of the 3 examinations. While GPT-3.5 performed well in psychiatry questions, GPT-4 exhibited strengths in internal medicine and surgery but showed weakness in academic research.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>The study results highlight ChatGPT’s remarkable improvement from moderate (GPT-3.5) to high competency (GPT-4) in answering medical licensing examination questions in German. While GPT-4’s predecessor (GPT-3.5) was imprecise and inconsistent, GPT-4 demonstrates considerable potential to improve medical education and patient care, provided that medically trained users critically evaluate its results. As the replacement of search engines by AI tools seems possible in the future, further studies with nonprofessional questions are needed to assess the safety and accuracy of ChatGPT for the general population.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>ChatGPT</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>large language model</kwd>
        <kwd>medical exams</kwd>
        <kwd>medical examinations</kwd>
        <kwd>medical education</kwd>
        <kwd>LLM</kwd>
        <kwd>public trust</kwd>
        <kwd>trust</kwd>
        <kwd>medical accuracy</kwd>
        <kwd>licensing exam</kwd>
        <kwd>licensing examination</kwd>
        <kwd>improvement</kwd>
        <kwd>patient care</kwd>
        <kwd>general population</kwd>
        <kwd>licensure examination</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Rapid advancements in large language models (LLMs) have sparked considerable excitement regarding their potential applications in the medical field [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. One LLM-based application that has garnered worldwide attention is ChatGPT, developed by the research and deployment company OpenAI, due to its easy accessibility and potential to democratize knowledge [<xref ref-type="bibr" rid="ref3">3</xref>]. The freely available version is based on the artificial intelligence (AI)–based tool GPT-3.5, which encompasses billions of parameters and has been trained on approximately 570 GB of text from the internet [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>].</p>
      <p>ChatGPT’s GPT-3.5 iteration has already shown promise in several routine medical tasks and medical research [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref7">7</xref>], even raising ethical concerns in the literature [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. The prompt and interactive nature of this AI’s responses might even revolutionize search engines, while also revealing shortcomings in medical education [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref11">11</xref>]. However, despite the introduction of the more advanced iteration GPT-4, concerns about the lack of transparency regarding this AI’s model parameters, training process, and underlying data structure remain unaddressed [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. These concerns cast doubt on the medical proficiency of these LLMs, as neither was primarily trained on medical data and both are the first to admit that passing a medical examination is outside their skillset as language AI models (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Still, with assistance and adaptations, GPT-3.5 nearly passed the United States Medical Licensing Examination [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>], and GPT-4 passed a Japanese medical examination [<xref ref-type="bibr" rid="ref15">15</xref>]. Considering the variable performance of multilingual LLMs across different input languages [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>], it is imperative to evaluate these models in various other linguistic contexts as well as on large data sets of original medical examination questions.</p>
      <p>The primary objective of this study is to evaluate the medical proficiency of both ChatGPT iterations (GPT-3.5 and -4) in comparison to medical students by testing them on 937 original questions from the written German medical licensing examination (Zweites Staatsexamen), providing further data for a possible future integration. While the German medical licensing examination covers various medical subdisciplines in 320 multiple-choice questions [<xref ref-type="bibr" rid="ref18">18</xref>], it has a high interexamination reliability of over 0.9 [<xref ref-type="bibr" rid="ref19">19</xref>]. Despite using the same third-party client for question retrieval as earlier studies, the German approach of publicly releasing the examination questions enables the third-party client to guarantee the originality of the test items derived directly from the examination itself [<xref ref-type="bibr" rid="ref20">20</xref>]. Additionally, to the best of our knowledge, we have tested both ChatGPT versions on the largest data set of medical licensing examination questions not included in their training data set. Furthermore, we did not exclude all image-based questions a priori. Instead, we evaluated the relevance of the images for each question and compared the results both with and without images.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Collection</title>
        <p>To ensure that any observed performance was not influenced by changes in ChatGPT’s training data, we specifically chose the 3 most recent examinations (October 2021, April 2022, and October 2022) after the AI’s knowledge cutoff date [<xref ref-type="bibr" rid="ref17">17</xref>]. Thus, we were able to obtain 937 multiple-choice questions, each with 5 possible answers from the third-party client Amboss, a web-based learning platform that provides the original questions from the Institut für Medizinische und Pharmazeutische Prüfungsfragen (IMPP). To maintain the original examination format, we presented all obtained questions and answer options in the same order as they appeared in the examination. No specific training code was used while submitting the questions. Due to AI’s inability to analyze visual content, answerability based on question text alone was defined as the primary inclusion criterion, resulting in the exclusion of 102 questions. The questions were submitted through ChatGPT’s interface of the GPT-3.5 (January 30, 2023) and GPT-4 (March 14, 2023) versions. ChatGPT’s answers were then compared to the official correct answers and evaluated. If ChatGPT selected none or more than 1 of the multiple-choice answers, the question was repeated in its original format up to 4 times or until a conclusive response could be obtained from ChatGPT (<xref rid="figure1" ref-type="fig">Figure 1</xref>).</p>
        <p>We recorded additional data, such as answer length, content warnings, and recommendations for further diagnosis, and categorized the questioning methodology. To assess the readability of a question, we used the Simple Measure of Gobbledygook (SMOG) as it has shown acceptable interrater reliability for patient education materials in the literature [<xref ref-type="bibr" rid="ref21">21</xref>].</p>
        <p>Examination statistics provided by the “MEDI-LEARN” portal were also used, including the number of correct student answers and the specialization of each question. The “Blueprint” published by the IMPP outlines the distribution of subspecialties within the written state examinations [<xref ref-type="bibr" rid="ref18">18</xref>].</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Flowchart of the study design for the evaluation of ChatGPT’s (GPT-3.5 and GPT-4) accuracy in the written German medical licensing examination (2021-2022). The flowchart presents the criteria for question selection, including both the inclusion and exclusion criteria.</p>
          </caption>
          <graphic xlink:href="mededu_v10i1e50965_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Statistical Analysis</title>
        <p>To perform our data analysis, we used several packages [<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref37">37</xref>] in addition to the R programming language [<xref ref-type="bibr" rid="ref38">38</xref>].</p>
        <p>While continuous variables were reported as arithmetic mean (SD) values, categorical variables were reported as frequencies and percentages. The Kolmogorov-Smirnov test, Shapiro-Wilk test, and QQ plots were used to confirm the normal distribution of continuous data statistically and graphically. To determine significant differences, we used unpaired <italic>t</italic> test or ANOVA for continuous variables and chi-square test or Wilcoxon rank-sum test for categorical variables. <italic>P</italic> values of &#60;.05 were deemed significant. Univariate and multivariate regression analyses were additionally performed to provide information on probabilities and predictors.</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>Ethics approval was not required as data were collected from publicly available sources on the internet or were generated using AI-based methods. No personally identifiable information was used in the data collection, and all data were handled in accordance with applicable data privacy laws and regulations.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>Overall, GPT-4 demonstrated superior performance with an average score of 796 out of 937 (85%), surpassing GPT-3.5’s score of 548 out of 937 (58%), which previously fell below the general passing threshold of 60% (<xref rid="figure2" ref-type="fig">Figure 2</xref>A) [<xref ref-type="bibr" rid="ref37">37</xref>-<xref ref-type="bibr" rid="ref39">39</xref>]. For the April 2022 examination, GPT-3.5 and GPT-4 achieved their highest scores (GPT-3.5: 195/319, 61%; GPT-4: 287/315, 91%), while the proportion of students who answered correctly remained constant across the 3 examinations (mean 76%, SD 18%; <italic>P</italic>=.86; <xref rid="figure2" ref-type="fig">Figure 2</xref>B and <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>).</p>
      <p>Thus, GPT-4 passed all tested examinations, whereas GPT-3.5 could only pass 1 of the 3 examinations. Although the examinations varied in several aspects, we also observed a significant difference in the number of images (<italic>P</italic>=.02; <xref rid="figure2" ref-type="fig">Figure 2</xref>C and <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). As GPT-3.5 and GPT-4 could, at the time of the study, not process these, we further investigated the potential image-related discrepancy between the examinations by excluding from subsequent analyses any questions that required image-dependent responses. The exclusion of these questions did not significantly alter examination difficulty, as evidenced by similar student scores (<xref rid="figure2" ref-type="fig">Figure 2</xref>D).</p>
      <p>Moreover, no differences were observed in the parameters collected on student accuracy, questions, or answer characteristics in relation to the performance of GPT-4 and GPT-3.5 in the excluded cases (<xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). Upon excluding image-based questions, GPT-4 continued to outperform GPT-3.5, with scores approaching 91.44%. However, GPT-3.5 exceeded expectations by achieving passing scores on all 3 examinations (October 2021: 60.22%; April 2022: 63.36%; October 2022: 60.07%; <xref rid="figure2" ref-type="fig">Figure 2</xref>E and <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>). GPT-3.5’s accuracy (<italic>P</italic>=.66), the number of images (<italic>P</italic>=.07), and students’ accuracy (<italic>P</italic>=.77) remained constant throughout the examinations, whereas GPT-4’s accuracy (<italic>P</italic>=.02), the specialties (<italic>P</italic>&#60;.001), and question type (<italic>P</italic>=.04) varied (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref> and <xref rid="figure2" ref-type="fig">Figures 2</xref>A, 2B, and 2E). The details of the included questions and their respective categorizations are provided in <xref ref-type="table" rid="table1">Table 1</xref>.</p>
      <fig id="figure2" position="float">
        <label>Figure 2</label>
        <caption>
          <p>Bar plots of ChatGPT’s (GPT-3.5 and GPT-4) and box plots of students’ accuracy in the written German medical licensing examination (2021-2022). Bar graphs and box plots of (A) the relative number of correct answers provided by ChatGPT (GPT-3.5 and GPT-4), (B) correct answers provided by students, and (C) image-based questions for the different examinations. (D and E) The relative number of correct answers by ChatGPT (GPT-3.5 and GPT-4) and students, comparing all questions with the included text-based questions. The 60% pass mark is presented as a red line in (A) and (E) to provide context for the performance of ChatGPT (GPT-3.5 and GPT-4). In addition, (E) displays the percentile achieved by ChatGPT (GPT-3.5 and GPT-4) for each year's examination, based on the percentile limits published by the Institut für Medizinische und Pharmazeutische Prüfungsfragen [<xref ref-type="bibr" rid="ref37">37</xref>-<xref ref-type="bibr" rid="ref39">39</xref>].</p>
        </caption>
        <graphic xlink:href="mededu_v10i1e50965_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <table-wrap position="float" id="table1">
        <label>Table 1</label>
        <caption>
          <p>Summary statistics for ChatGPT's (GPT-3.5 and GPT-4) accuracy during the written German medical licensing examination, 2021-2022.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="30"/>
          <col width="270"/>
          <col width="100"/>
          <col width="100"/>
          <col width="100"/>
          <col width="0"/>
          <col width="100"/>
          <col width="0"/>
          <col width="100"/>
          <col width="0"/>
          <col width="100"/>
          <col width="0"/>
          <col width="100"/>
          <thead>
            <tr valign="top">
              <td colspan="2">Characteristic</td>
              <td>Overall (N=834)</td>
              <td colspan="5">Accuracy of GPT-3.5</td>
              <td colspan="5">Accuracy of GPT-4</td>
            </tr>
            <tr valign="top">
              <td colspan="2"> </td>
              <td> </td>
              <td>False (n=323)</td>
              <td>True (n=511)</td>
              <td colspan="2"><italic>P</italic> value</td>
              <td colspan="2">False (n=105)</td>
              <td colspan="2">True (n=729)</td>
              <td colspan="2"><italic>P</italic> value</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td colspan="2">Students' correct response rate (%), mean (SD)</td>
              <td>77 (18)</td>
              <td>71 (18)</td>
              <td>80 (16)</td>
              <td colspan="2">&#60;.001<sup>a</sup></td>
              <td colspan="2">70 (18)</td>
              <td colspan="2">78 (17)</td>
              <td colspan="2">&#60;.001<sup>a</sup></td>
            </tr>
            <tr valign="top">
              <td colspan="2">Accuracy of GPT-3.5, n (%)</td>
              <td>511 (61)</td>
              <td>N/A<sup>b</sup></td>
              <td>N/A</td>
              <td colspan="2">N/A</td>
              <td colspan="2">38 (36)</td>
              <td colspan="2">473 (65)</td>
              <td colspan="2">&#60;.001<sup>c</sup></td>
            </tr>
            <tr valign="top">
              <td colspan="2">Accuracy of GPT-4, n (%)</td>
              <td>729 (87)</td>
              <td>256 (79)</td>
              <td>473 (93)</td>
              <td colspan="2">&#60;.001<sup>c</sup></td>
              <td colspan="2">N/A</td>
              <td colspan="2">N/A</td>
              <td colspan="2">N/A</td>
            </tr>
            <tr valign="top">
              <td colspan="2">Readability score of the question, mean (SD)</td>
              <td>14.96 (1.89)</td>
              <td>14.93 (1.87)</td>
              <td>14.98 (1.90)</td>
              <td colspan="2">.65<sup>a</sup></td>
              <td colspan="2">14.91 (2.26)</td>
              <td colspan="2">14.97 (1.84)</td>
              <td colspan="2">.21<sup>a</sup></td>
            </tr>
            <tr valign="top">
              <td colspan="6">
                <bold>Question type, n (%)</bold>
              </td>
              <td colspan="2">.76<sup>c</sup></td>
              <td colspan="2">N/A</td>
              <td colspan="2">N/A</td>
              <td>.009<sup>c</sup></td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Connected (key feature)</td>
              <td>532 (64)</td>
              <td>204 (63)</td>
              <td>328 (64)</td>
              <td colspan="2"> </td>
              <td colspan="2">79 (75)</td>
              <td colspan="2">453 (62)</td>
              <td colspan="2"> </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Single question</td>
              <td>302 (36)</td>
              <td>119 (37)</td>
              <td>183 (36)</td>
              <td colspan="2"> </td>
              <td colspan="2">26 (25)</td>
              <td colspan="2">276 (38)</td>
              <td colspan="2"> </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Images referenced in questions</td>
              <td>84 (10)</td>
              <td>23 (7.1)</td>
              <td>61 (12)</td>
              <td colspan="2">.02<sup>c</sup></td>
              <td colspan="2">17 (16)</td>
              <td colspan="2">67 (9.2)</td>
              <td colspan="2">.03<sup>c</sup></td>
            </tr>
            <tr valign="top">
              <td colspan="6">
                <bold>Specialty, n (%)</bold>
              </td>
              <td colspan="2">.02<sup>c</sup></td>
              <td colspan="2">N/A</td>
              <td colspan="2">N/A</td>
              <td>.07<sup>c</sup></td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Gynecology</td>
              <td>43 (5.2)</td>
              <td>12 (3.7)</td>
              <td>31 (6.1)</td>
              <td colspan="2">
                <break/>
              </td>
              <td colspan="2">7 (6.7)</td>
              <td colspan="2">36 (4.9)</td>
              <td colspan="2"> </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Infectiology</td>
              <td>74 (8.9)</td>
              <td>24 (7.4)</td>
              <td>50 (9.8)</td>
              <td colspan="2">
                <break/>
              </td>
              <td colspan="2">6 (5.7)</td>
              <td colspan="2">68 (9.3)</td>
              <td colspan="2"> </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Internal medicine</td>
              <td>176 (21)</td>
              <td>71 (22)</td>
              <td>105 (21)</td>
              <td colspan="2">
                <break/>
              </td>
              <td colspan="2">15 (14)</td>
              <td colspan="2">161 (22)</td>
              <td colspan="2"> </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Neurology</td>
              <td>112 (13)</td>
              <td>51 (16)</td>
              <td>61 (12)</td>
              <td colspan="2">
                <break/>
              </td>
              <td colspan="2">12 (11)</td>
              <td colspan="2">100 (14)</td>
              <td colspan="2"> </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Others</td>
              <td>269 (32)</td>
              <td>106 (33)</td>
              <td>163 (32)</td>
              <td colspan="2">
                <break/>
              </td>
              <td colspan="2">46 (44)</td>
              <td colspan="2">223 (31)</td>
              <td colspan="2"> </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Pediatrics</td>
              <td>62 (7.4)</td>
              <td>26 (8.0)</td>
              <td>36 (7.0)</td>
              <td colspan="2">
                <break/>
              </td>
              <td colspan="2">11 (10)</td>
              <td colspan="2">51 (7.0)</td>
              <td colspan="2"> </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Psychiatry</td>
              <td>54 (6.5)</td>
              <td>11 (3.4)</td>
              <td>43 (8.4)</td>
              <td colspan="2">
                <break/>
              </td>
              <td colspan="2">5 (4.8)</td>
              <td colspan="2">49 (6.7)</td>
              <td colspan="2"> </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Surgery</td>
              <td>44 (5.3)</td>
              <td>22 (6.8)</td>
              <td>22 (4.3)</td>
              <td colspan="2"> </td>
              <td colspan="2">3 (2.9)</td>
              <td colspan="2">41 (5.6)</td>
              <td colspan="2"> </td>
            </tr>
            <tr valign="top">
              <td colspan="6">
                <bold>Expertise, n (%)</bold>
              </td>
              <td colspan="2">.64<sup>c</sup></td>
              <td colspan="2">N/A</td>
              <td colspan="2">N/A</td>
              <td>.34<sup>c</sup></td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Background knowledge</td>
              <td>103 (12)</td>
              <td>32 (9.9)</td>
              <td>71 (14)</td>
              <td colspan="2">
                <break/>
              </td>
              <td colspan="2">13 (12)</td>
              <td colspan="2">90 (12)</td>
              <td colspan="2"> </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Complications</td>
              <td>49 (5.9)</td>
              <td>19 (5.9)</td>
              <td>30 (5.9)</td>
              <td colspan="2">
                <break/>
              </td>
              <td colspan="2">4 (3.8)</td>
              <td colspan="2">45 (6.2)</td>
              <td colspan="2"> </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Diagnostic competence</td>
              <td>466 (56)</td>
              <td>184 (57)</td>
              <td>282 (55)</td>
              <td colspan="2">
                <break/>
              </td>
              <td colspan="2">54 (51)</td>
              <td colspan="2">412 (57)</td>
              <td colspan="2"> </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Prevention competence</td>
              <td>36 (4.3)</td>
              <td>13 (4.0)</td>
              <td>23 (4.5)</td>
              <td colspan="2">
                <break/>
              </td>
              <td colspan="2">6 (5.7)</td>
              <td colspan="2">30 (4.1)</td>
              <td colspan="2"> </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Scientific practice</td>
              <td>34 (4.1)</td>
              <td>14 (4.3)</td>
              <td>20 (3.9)</td>
              <td colspan="2">
                <break/>
              </td>
              <td colspan="2">8 (7.6)</td>
              <td colspan="2">26 (3.6)</td>
              <td colspan="2"> </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Therapeutic competence</td>
              <td>146 (18)</td>
              <td>61 (19)</td>
              <td>85 (17)</td>
              <td colspan="2">
                <break/>
              </td>
              <td colspan="2">20 (19)</td>
              <td colspan="2">126 (17)</td>
              <td colspan="2"> </td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table1fn1">
            <p><sup>a</sup>Wilcoxon rank-sum test.</p>
          </fn>
          <fn id="table1fn2">
            <p><sup>b</sup>N/A: not applicable.</p>
          </fn>
          <fn id="table1fn3">
            <p><sup>c</sup>Pearson chi-square test.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <p>After controlling for all other variables, correct student responses (GPT-3.5: β 0.01, 95% CI 0.00-0.01, <italic>P</italic>&#60;.001; GPT-4: β 0.00, 95% CI 0.00-0.00, <italic>P</italic>=.003) and questions with images (GPT-3.5: β 0.19, 95% CI 0.08-0.30, <italic>P</italic>&#60;.001; GPT-4: β –0.09, 95% CI –0.16 to –0.01, <italic>P</italic>=.02) emerged as significant predictors of GPT-3.5’s and GPT-4’s accuracy, regardless of the version. Furthermore, our analysis revealed that only questions pertaining to psychiatry were significant predictors of correct GPT-3.5 responses (β 0.19, 95% CI 0.02-0.36, <italic>P</italic>=.03). In contrast, questions related to internal medicine (β 0.10, 95% CI 0.00-0.19, <italic>P</italic>=.04) and surgery (β 0.12, 95% CI 0.00-0.25, <italic>P</italic>=.049) were the only medical subspecialties significantly predicting accurate responses of GPT-4. Conversely, questions concerning scientific practice (β –0.14, 95% CI –0.29 to 0.00, <italic>P</italic>=.05) were less likely to be answered correctly by GPT-4 (<xref ref-type="table" rid="table2">Table 2</xref> and <xref rid="figure3" ref-type="fig">Figure 3</xref>). The question SMOG readability score, however, did not significantly impact ChatGPT’s accuracy.</p>
      <table-wrap position="float" id="table2">
        <label>Table 2</label>
        <caption>
          <p>Regression analysis to compare ChatGPT's (GPT-3.5 and GPT-4) accuracy during the written German medical licensing examination (2021-2022; N=833).</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="180"/>
          <col width="70"/>
          <col width="70"/>
          <col width="70"/>
          <col width="0"/>
          <col width="60"/>
          <col width="70"/>
          <col width="70"/>
          <col width="0"/>
          <col width="0"/>
          <col width="70"/>
          <col width="70"/>
          <col width="70"/>
          <col width="0"/>
          <col width="60"/>
          <col width="70"/>
          <col width="70"/>
          <thead>
            <tr valign="top">
              <td>Characteristic</td>
              <td colspan="9">GPT-3.5</td>
              <td colspan="7">GPT-4</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="4">Univariate</td>
              <td colspan="4">Multivariate</td>
              <td colspan="5">Univariate</td>
              <td colspan="3">Multivariate</td>
            </tr>
            <tr valign="top">
              <td> </td>
              <td>Odds ratio</td>
              <td>95% CI</td>
              <td><italic>P</italic> value</td>
              <td colspan="2">β</td>
              <td>95% CI</td>
              <td><italic>P</italic> value</td>
              <td colspan="3">Odds ratio</td>
              <td>95% CI</td>
              <td><italic>P</italic> value</td>
              <td colspan="2">β</td>
              <td>95% CI</td>
              <td><italic>P</italic> value</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Students’ correct response rate</td>
              <td>1.03</td>
              <td>1.02 to 1.04</td>
              <td>&#60;.001</td>
              <td colspan="2">.01</td>
              <td>0.00 to 0.01</td>
              <td>&#60;.001</td>
              <td colspan="3">1.02</td>
              <td>1.01 to 1.03</td>
              <td>&#60;.001</td>
              <td colspan="2">.00</td>
              <td>0.00 to 0.00</td>
              <td>.003</td>
            </tr>
            <tr valign="top">
              <td>Accuracy of GPT-4</td>
              <td>3.25</td>
              <td>2.13 to 5.02</td>
              <td>&#60;.001</td>
              <td colspan="2">.26</td>
              <td>0.16 to 0.36</td>
              <td>&#60;.001</td>
              <td colspan="3">N/A<sup>a</sup> </td>
              <td>N/A</td>
              <td>N/A</td>
              <td colspan="2">N/A</td>
              <td>N/A</td>
              <td>N/A</td>
            </tr>
            <tr valign="top">
              <td>Accuracy of GPT-3.5</td>
              <td>N/A</td>
              <td>N/A</td>
              <td>N/A</td>
              <td colspan="2">N/A</td>
              <td>N/A</td>
              <td>N/A</td>
              <td colspan="3">3.25</td>
              <td>2.13 to 5.02</td>
              <td>&#60;.001</td>
              <td colspan="2">.12</td>
              <td>0.08 to 0.17</td>
              <td>&#60;.001</td>
            </tr>
            <tr valign="top">
              <td>October 2021 examination</td>
              <td>0.94</td>
              <td>0.70 to 1.27</td>
              <td>.68</td>
              <td colspan="2">.00</td>
              <td>–0.08 to 0.08</td>
              <td>.94</td>
              <td colspan="3">0.90</td>
              <td>0.59 to 1.40</td>
              <td>.64</td>
              <td colspan="2">.02</td>
              <td>–0.04 to 0.07</td>
              <td>.55</td>
            </tr>
            <tr valign="top">
              <td>April 2022 examination</td>
              <td>1.15</td>
              <td>0.86 to 1.54</td>
              <td>.35</td>
              <td colspan="2">.03</td>
              <td>–0.05 to 0.11</td>
              <td>.47</td>
              <td colspan="3">1.85</td>
              <td>1.17 to 3.03</td>
              <td>.01</td>
              <td colspan="2">.06</td>
              <td>0.01 to 0.11</td>
              <td>.03</td>
            </tr>
            <tr valign="top">
              <td>October 2022 examination</td>
              <td>0.92</td>
              <td>0.69 to 1.24</td>
              <td>.59</td>
              <td colspan="2">N/A</td>
              <td>N/A</td>
              <td>N/A</td>
              <td colspan="3">0.63</td>
              <td>0.42 to 0.96</td>
              <td>.03</td>
              <td colspan="2">N/A</td>
              <td>N/A</td>
              <td>N/A</td>
            </tr>
            <tr valign="top">
              <td>Question type</td>
              <td>0.96</td>
              <td>0.72 to 1.28</td>
              <td>.78</td>
              <td colspan="2">–.03</td>
              <td>–0.10 to 0.04</td>
              <td>.39</td>
              <td colspan="3">1.86</td>
              <td>1.18 to 3.01</td>
              <td>.01</td>
              <td colspan="2">.06</td>
              <td>0.02 to 0.11</td>
              <td>.007</td>
            </tr>
            <tr valign="top">
              <td>Images referenced in questions</td>
              <td>1.77</td>
              <td>1.09 to 2.98</td>
              <td>.03</td>
              <td colspan="2">.19</td>
              <td>0.08 to 0.30</td>
              <td>&#60;.001</td>
              <td colspan="3">0.52</td>
              <td>0.30 to 0.96</td>
              <td>.03</td>
              <td colspan="2">–.09</td>
              <td>–0.16 to –0.01</td>
              <td>.02</td>
            </tr>
            <tr valign="top">
              <td>Other specialty</td>
              <td>0.96</td>
              <td>0.71 to 1.30</td>
              <td>.80</td>
              <td colspan="2">.00</td>
              <td>–0.13 to 0.14</td>
              <td>.94</td>
              <td colspan="3">0.57</td>
              <td>0.37 to 0.86</td>
              <td>.007</td>
              <td colspan="2">.02</td>
              <td>–0.07 to 0.11</td>
              <td>.73</td>
            </tr>
            <tr valign="top">
              <td>Gynecology and obstetrics</td>
              <td>1.62</td>
              <td>0.84 to 3.33</td>
              <td>.17</td>
              <td colspan="2">.12</td>
              <td>–0.06 to 0.31</td>
              <td>.19</td>
              <td colspan="3">0.71</td>
              <td>0.32 to 1.78</td>
              <td>.42</td>
              <td colspan="2">.01</td>
              <td>–0.12 to 0.14</td>
              <td>.88</td>
            </tr>
            <tr valign="top">
              <td>Surgery</td>
              <td>0.62</td>
              <td>0.33 to 1.14</td>
              <td>.12</td>
              <td colspan="2">–.12</td>
              <td>–0.30 to 0.06</td>
              <td>.18</td>
              <td colspan="3">2.03</td>
              <td>0.72 to 8.49</td>
              <td>.24</td>
              <td colspan="2">.12</td>
              <td>0.00 to 0.25</td>
              <td>.049</td>
            </tr>
            <tr valign="top">
              <td>Internal medicine</td>
              <td>0.92</td>
              <td>0.66 to 1.30</td>
              <td>.63</td>
              <td colspan="2">–.02</td>
              <td>–0.15 to 0.12</td>
              <td>.81</td>
              <td colspan="3">1.7</td>
              <td>0.99 to 3.14</td>
              <td>.07</td>
              <td colspan="2">.10</td>
              <td>0.00 to 0.19</td>
              <td>.043</td>
            </tr>
            <tr valign="top">
              <td>Infectious diseases</td>
              <td>1.35</td>
              <td>0.82 to 2.28</td>
              <td>.24</td>
              <td colspan="2">.06</td>
              <td>–0.10 to 0.22</td>
              <td>.48</td>
              <td colspan="3">1.7</td>
              <td>0.78 to 4.48</td>
              <td>.23</td>
              <td colspan="2">.09</td>
              <td>–0.02 to 0.20</td>
              <td>.11</td>
            </tr>
            <tr valign="top">
              <td>Psychiatry</td>
              <td>2.61</td>
              <td>1.37 to 5.40</td>
              <td>.005</td>
              <td colspan="2">.19</td>
              <td>0.02 to 0.36</td>
              <td>.03</td>
              <td colspan="3">1.44</td>
              <td>0.62 to 4.23</td>
              <td>.45</td>
              <td colspan="2">.03</td>
              <td>–0.09 to 0.15</td>
              <td>.61</td>
            </tr>
            <tr valign="top">
              <td>Neurology</td>
              <td>0.72</td>
              <td>0.49 to 1.08</td>
              <td>.12</td>
              <td colspan="2">–.04</td>
              <td>–0.18 to 0.11</td>
              <td>.61</td>
              <td colspan="3">1.23</td>
              <td>0.68 to 2.45</td>
              <td>.52</td>
              <td colspan="2">.08</td>
              <td>–0.02 to 0.18</td>
              <td>.11</td>
            </tr>
            <tr valign="top">
              <td>Pediatrics</td>
              <td>0.87</td>
              <td>0.52 to 1.48</td>
              <td>.60</td>
              <td colspan="2">N/A</td>
              <td>N/A</td>
              <td>N/A</td>
              <td colspan="3">0.64</td>
              <td>0.34 to 1.34</td>
              <td>.21</td>
              <td colspan="2">N/A</td>
              <td>N/A</td>
              <td>N/A</td>
            </tr>
            <tr valign="top">
              <td>Diagnostic competence</td>
              <td>0.93</td>
              <td>0.70 to 1.23</td>
              <td>.60</td>
              <td colspan="2">–.03</td>
              <td>–0.17 to 0.11</td>
              <td>.67</td>
              <td colspan="3">1.22</td>
              <td>0.81 to 1.85</td>
              <td>.33</td>
              <td colspan="2">–.05</td>
              <td>–0.14 to 0.05</td>
              <td>.34</td>
            </tr>
            <tr valign="top">
              <td>Therapeutic competence</td>
              <td>0.86</td>
              <td>0.60 to 1.24</td>
              <td>.41</td>
              <td colspan="2">–.04</td>
              <td>–0.19 to 0.12</td>
              <td>.65</td>
              <td colspan="3">0.89</td>
              <td>0.54 to 1.54</td>
              <td>.66</td>
              <td colspan="2">–.06</td>
              <td>–0.16 to 0.05</td>
              <td>.28</td>
            </tr>
            <tr valign="top">
              <td>Background knowledge</td>
              <td>1.47</td>
              <td>0.95 to 2.32</td>
              <td>.09</td>
              <td colspan="2">.08</td>
              <td>–0.09 to 0.24</td>
              <td>.36</td>
              <td colspan="3">1.00</td>
              <td>0.55 to 1.94</td>
              <td>&#62;.99</td>
              <td colspan="2">–.05</td>
              <td>–0.16 to 0.06</td>
              <td>.36</td>
            </tr>
            <tr valign="top">
              <td>Prevention competence</td>
              <td>1.13</td>
              <td>0.57 to 2.32</td>
              <td>.74</td>
              <td colspan="2">.00</td>
              <td>–0.20 to 0.20</td>
              <td>&#62;.99</td>
              <td colspan="3">0.71</td>
              <td>0.31 to 1.93</td>
              <td>.45</td>
              <td colspan="2">–.11</td>
              <td>–0.25 to 0.03</td>
              <td>.11</td>
            </tr>
            <tr valign="top">
              <td>Scientific practice</td>
              <td>0.90</td>
              <td>0.45 to 1.85</td>
              <td>.77</td>
              <td colspan="2">.01</td>
              <td>–0.20 to 0.22</td>
              <td>.95</td>
              <td colspan="3">0.45</td>
              <td>0.21 to 1.09</td>
              <td>.06</td>
              <td colspan="2">–.14</td>
              <td>–0.29 to 0.00</td>
              <td>.05</td>
            </tr>
            <tr valign="top">
              <td>Complications</td>
              <td>1.00</td>
              <td>0.56 to 1.84</td>
              <td>&#62;.99</td>
              <td colspan="2">N/A </td>
              <td>N/A</td>
              <td>N/A</td>
              <td colspan="3">1.66</td>
              <td>0.66 to 5.61</td>
              <td>.34</td>
              <td colspan="2">N/A</td>
              <td>N/A</td>
              <td>N/A</td>
            </tr>
            <tr valign="top">
              <td>Readability score of the question</td>
              <td>1.01</td>
              <td>0.94 to 1.09</td>
              <td>.70</td>
              <td colspan="2">.01</td>
              <td>–0.01 to 0.03</td>
              <td>.24</td>
              <td colspan="3">1.02</td>
              <td>0.91 to 1.14</td>
              <td>.76</td>
              <td colspan="2">.00</td>
              <td>–0.01 to 0.01</td>
              <td>.98</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table2fn1">
            <p><sup>a</sup>N/A: not applicable.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <fig id="figure3" position="float">
        <label>Figure 3</label>
        <caption>
          <p>Comparison of ChatGPT's (GPT-3.5 and GPT-4) and students’ relative accuracy in relation to the tested specialties and methodology in the written German medical licensing examination (2021-2022). The bar graph displays the percentage of correct answers provided by ChatGPT (GPT-3.5 and GPT-4) and students in (A) each specialty and (B) methodology, while the blue line demonstrates a 60% pass mark.</p>
        </caption>
        <graphic xlink:href="mededu_v10i1e50965_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>With the introduction of ChatGPT’s GPT-3.5 and GPT-4 iterations, the potential application for AI in research, patient care, and medical education is gaining recognition [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref40">40</xref>]. By improving the users’ experience and facilitating more efficient information retrieval, ChatGPT might even revolutionize the future of search engines and shift the focus of medical education from memorization to practical application [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>].</p>
        <p>Under this premise, the nearly passing scores of the freely available GPT-3.5 iteration, along with the exceptional scores of GPT-4, are highly relevant. Even with the varying scores of 51%-67% of GPT-3.5 across various input languages [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref42">42</xref>], both models consistently outperform most prominent general and domain-specific LLMs, such as InstructGPT (53%), GPT-3 (25%), and BioMedLM (50%) [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]. Despite these improvements, GPT-3.5’s or GPT-4’s performance still fell short in comparison to that of medical students in a Japanese medical examination according to the study by Takagi et al [<xref ref-type="bibr" rid="ref15">15</xref>]. In comparison to the German medical students, however, GPT-3.5 scored in the 8.6th percentile, while GPT-4 ranked in the 92.8th, 99.5th, and 92.6th percentiles in the October 2021, April 2022, and October 2022 examinations [<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref46">46</xref>]. The observed variations in the AI's accuracy across input languages may partially reflect the language composition of their data sets, as LLMs tend to favor languages that are more represented in their training data [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. Since ChatGPT appears to perform optimally with English inputs, language emerges as a limiting factor for its accuracy, suggesting that globally consistent application is dependent upon users' proficiency in English.</p>
        <p>Moreover, the nearly 30% performance increase from GPT-3.5 to GPT-4, as indicated in this study and supported by a Japanese study, suggests a similar language distribution within the GPT-3.5 and GPT-4 data sets [<xref ref-type="bibr" rid="ref15">15</xref>]. GPT-4, unlike GPT-3.5, also did not answer questions containing images on repetition, showing an improvement in the previously incorrect content produced by GPT-4’s predecessor [<xref ref-type="bibr" rid="ref17">17</xref>].</p>
        <p>Thus, health care professionals could potentially benefit, especially from GPT-4’s conclusive and often nonobvious insights to multiple-choice questions, as these users have the ability to verify crucial details [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref41">41</xref>]. For instance, there is potential for using GPT-3.5 and GPT-4 in a medical education tutoring environment, as evidenced by its successful application in anatomy [<xref ref-type="bibr" rid="ref47">47</xref>]. However, when using either GPT-3.5 or GPT-4 for medical applications, its differing accuracy across specialties must also be taken into account [<xref ref-type="bibr" rid="ref48">48</xref>]. GPT-3.5 initially displayed a high degree of accuracy within the field of psychiatry, while GPT-4 demonstrated its strength in internal medicine and surgery. Considering the rising prevalence of psychiatric disorders and concomitant challenges in providing care, it seemed likely that nonprofessionals would also turn to the chatbot for mental health issues at the time of GPT-3.5’s release [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref50">50</xref>]. Hence, it is conceivable that GPT-3.5’s training data set includes not only a substantial and reliable portion of psychiatric data, but also its developers might have first fine-tuned ChatGPT specifically in this domain in anticipation of its high demand [<xref ref-type="bibr" rid="ref51">51</xref>-<xref ref-type="bibr" rid="ref53">53</xref>]. Thus, the developers might have also fine-tuned GPT-4 specifically in internal medicine and surgery, possibly reacting to a high demand in this area from users of its predecessor. GPT-4’s impressive performance is not limited to the medical field, as it demonstrated comparable percentile scores in the Uniform Bar Exam, showcasing its potential as a versatile tool across diverse academic disciplines [<xref ref-type="bibr" rid="ref17">17</xref>]. However, assessing the possible reasons for the performance differences between GPT-3.5 and GPT-4 is complicated by the confidential architecture of GPT-4 [<xref ref-type="bibr" rid="ref54">54</xref>], posing challenges for research on future applications.</p>
        <p>In turn, GPT-4’s excellent achievements shed light on the limitations of current testing paradigms in medical education that often favor rote memorization over a critical and context-aware approach. They also highlight the inadequacy of multiple-choice questions as a means of assessing medical knowledge, as they tend to encourage binary thinking as “true” and “false,” which often fails to capture the complex reality of the medical practice [<xref ref-type="bibr" rid="ref11">11</xref>]. Although GPT-3.5 and GPT-4 allow the simple and fast retrieval of medical information from any internet-capable device that fits in one's pocket [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>], neither GPT-3.5 nor GPT-4 verifies the information they provide. Thus, ChatGPT's output needs to be approached with a critical mindset, recognizing that misinformation may be more difficult to detect than in the output of other search engines that offer multiple sources in response to a query and take login credentials into account [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref55">55</xref>]. To navigate these changing informational landscapes, a basic understanding in data science seems necessary alongside traditional medical expertise [<xref ref-type="bibr" rid="ref56">56</xref>]. It may even be beneficial for future iterations of AI tools to include references to the sources underlying each search in order to increase transparency and allow users to assess the reliability of the information they receive.</p>
        <p>Considering that 59% of participants in a previous study by Nov et al [<xref ref-type="bibr" rid="ref57">57</xref>] trusted chatbots more than traditional search engines, it must be noted that GPT-3.5 and GPT-4 have only been tested on medical examination questions and not on questions by nonprofessionals, limiting general recommendations for unsupervised patient education or use by the general population. It seems evident that GPT-4 has been benchmarked against medical licensing examinations, explaining not only GPT-4’s excellent scores but also exceeding achievements in internal medicine and surgery, which, for instance, have been overrepresented in the medical examinations assessed in this study [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref17">17</xref>].</p>
        <p>Since GPT-3.5 failed the German medical licensing examination by a narrow margin, its use for answering medical questions is generally not advisable. Moreover, the remarkable performance of GPT-4 in the German Medical State Examination may not be universally applicable outside a medical examination setting, especially considering that GPT-4 was presumably benchmarked on academic and professional examinations [<xref ref-type="bibr" rid="ref17">17</xref>].</p>
        <p>As literature on ChatGPT is scarce, and it can be difficult to detect incorrect output from this AI tool, the content it generates must be carefully assessed. Nevertheless, medical professionals may still be able to benefit from GPT-3.5’s and GPT-4’s explanations and, in some cases, gain new nonobvious insights. With the release of GPT-4’s ability to handle pictures on the horizon, the potential for further applications of GPT-3.5 and GPT-4 to improve the medical workflow or medical education seems eminent, emphasizing the need for continued research into AI.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>This study’s findings on GPT-3.5’s and GPT-4’s medical proficiencies are limited to multiple-choice questions from the German medical licensing examination, which may not be representative of other types of examinations or contexts. However, it is worth noting that GPT-3.5 and GPT-4 have demonstrated similar performances in examinations in other countries and languages, which suggests some degree of generalizability.</p>
        <p>In addition, the sample size of 937 questions and the exclusion of image-based questions may not capture the full range of difficulty levels or content areas. Although the collected parameters did not differ in terms of GPT-3.5’s and GPT-4’s accuracy in the excluded cases, the decision to exclude image-based questions may have introduced a sampling bias. By testing for differences, efforts were made to minimize this bias and maintain the integrity of the results.</p>
        <p>As GPT-3.5’s and GPT-4’s performances were compared to those of German medical students using the MEDI-LEARN service, a selection bias might have been introduced. However, the high correlation between the MEDI-LEARN statistics and the IMPP statistics indicates at best a weak expression of this selection bias [<xref ref-type="bibr" rid="ref58">58</xref>].</p>
        <p>It should also be noted that a replication of this study might not yield the exact same results, as the literature suggests that GPT-3.5 is inconsistent in answering 15% of medical questions [<xref ref-type="bibr" rid="ref59">59</xref>]. However, the trends observed in this study appear to be consistent with those reported in other published and preprint studies on GPT-3.5’s and GPT-4’s performance.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In conclusion, the results of this study indicate that only GPT-4 consistently passed all 3 medical examinations, ranking in the 92.8th to 99.5th percentile in comparison to medical students. These findings highlight the strengths and limitations of ChatGPT in the context of medical examinations and raise questions about the future of medical education.</p>
        <p>Although GPT-3.5’s and GPT-4’s accuracy in medical examinations seems consistent across different countries and languages, its inconsistencies, potential biases, and number of incorrect answers restrain a recommendation for its use by the general population for medical purposes. However, its elaborate explanations and potential to yield nonobvious insights may benefit medical professionals in training.</p>
        <p>While this study hints at a moderate accuracy of GPT-3.5 and a stellar performance of GPT-4 in answering medical examination questions, further research is necessary to gain deeper insights, explore future applications, and ensure safe use of ChatGPT for end users.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Responses of (A) GPT-3.5 and (B) GPT-4 to the queries on their ability to pass a medical exam, 2023.</p>
        <media xlink:href="mededu_v10i1e50965_app1.docx" xlink:title="DOCX File , 592 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Summary statistics for all questions regarding exam time and ChatGPT’s (GPT-3.5 and GPT-4) accuracy in the German medical licensing exam, 2021-2022.</p>
        <media xlink:href="mededu_v10i1e50965_app2.docx" xlink:title="DOCX File , 21 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Summary statistics for excluded questions regarding ChatGPT’s (GPT-3.5 and GPT-4) accuracy in the German medical licensing exam, 2021-2022.</p>
        <media xlink:href="mededu_v10i1e50965_app3.docx" xlink:title="DOCX File , 20 KB"/>
      </supplementary-material>
      <supplementary-material id="app4">
        <label>Multimedia Appendix 4</label>
        <p>Summary statistics for included questions regarding exam time in the German medical licensing exam, 2021-2022.</p>
        <media xlink:href="mededu_v10i1e50965_app4.docx" xlink:title="DOCX File , 17 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">IMPP</term>
          <def>
            <p>Institut für Medizinische und Pharmazeutische Prüfungsfragen</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">SMOG</term>
          <def>
            <p>Simple Measure of Gobbledygook</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors thank Dorothee Meyer, Linea Luise Fuchs, Ari Soleman, GPT-3.5, and GPT-4 for proofreading this manuscript. In this study, we used ChatGPT for several purposes: to translate our manuscript into English, to refine its linguistic presentation, to evaluate and improve our methodological approach, and to scrutinize the R code underlying our statistical analysis, with a particular focus on identifying and resolving any error warnings generated. Subsequently, all outputs provided by ChatGPT were rigorously reviewed and critically appraised by the authors to ensure accuracy and reliability.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Heacock</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Elias</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hentel</surname>
              <given-names>KD</given-names>
            </name>
            <name name-style="western">
              <surname>Reig</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Shih</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Moy</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT and other large language models are double-edged swords</article-title>
          <source>Radiology</source>
          <year>2023</year>
          <month>04</month>
          <volume>307</volume>
          <issue>2</issue>
          <fpage>e230163</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.230163</pub-id>
          <pub-id pub-id-type="medline">36700838</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>The Lancet Digital Health</collab>
          </person-group>
          <article-title>ChatGPT: friend or foe?</article-title>
          <source>Lancet Digit Health</source>
          <year>2023</year>
          <month>03</month>
          <volume>5</volume>
          <issue>3</issue>
          <fpage>e102</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2589-7500(23)00023-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/S2589-7500(23)00023-7</pub-id>
          <pub-id pub-id-type="medline">36754723</pub-id>
          <pub-id pub-id-type="pii">S2589-7500(23)00023-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liebrenz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Schleifer</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Buadze</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bhugra</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Generating scholarly content with ChatGPT: ethical challenges for medical publishing</article-title>
          <source>Lancet Digit Health</source>
          <year>2023</year>
          <month>03</month>
          <volume>5</volume>
          <issue>3</issue>
          <fpage>e105</fpage>
          <lpage>e106</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://boris.unibe.ch/id/eprint/178562"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/S2589-7500(23)00019-5</pub-id>
          <pub-id pub-id-type="medline">36754725</pub-id>
          <pub-id pub-id-type="pii">S2589-7500(23)00019-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>SB</given-names>
            </name>
            <name name-style="western">
              <surname>Lam</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT: the future of discharge summaries?</article-title>
          <source>Lancet Digit Health</source>
          <year>2023</year>
          <month>03</month>
          <volume>5</volume>
          <issue>3</issue>
          <fpage>e107</fpage>
          <lpage>e108</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2589-7500(23)00021-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/S2589-7500(23)00021-3</pub-id>
          <pub-id pub-id-type="medline">36754724</pub-id>
          <pub-id pub-id-type="pii">S2589-7500(23)00021-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jeblick</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Schachtner</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Dexl</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mittermeier</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Stüber</surname>
              <given-names>AT</given-names>
            </name>
            <name name-style="western">
              <surname>Topalis</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Weber</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Wesp</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Sabel</surname>
              <given-names>BO</given-names>
            </name>
            <name name-style="western">
              <surname>Ricke</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ingrisch</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT makes medicine easy to swallow: an exploratory case study on simplified radiology reports</article-title>
          <source>Eur Radiol</source>
          <year>2023</year>
          <month>10</month>
          <day>05</day>
          <pub-id pub-id-type="doi">10.1007/s00330-023-10213-1</pub-id>
          <pub-id pub-id-type="medline">37794249</pub-id>
          <pub-id pub-id-type="pii">10.1007/s00330-023-10213-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Macdonald</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Adeloye</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Sheikh</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Rudan</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Can ChatGPT draft a research article? An example of population-level vaccine effectiveness analysis</article-title>
          <source>J Glob Health</source>
          <year>2023</year>
          <month>02</month>
          <day>17</day>
          <volume>13</volume>
          <fpage>01003</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36798998"/>
          </comment>
          <pub-id pub-id-type="doi">10.7189/jogh.13.01003</pub-id>
          <pub-id pub-id-type="medline">36798998</pub-id>
          <pub-id pub-id-type="pmcid">PMC9936200</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Howard</surname>
              <given-names>FM</given-names>
            </name>
            <name name-style="western">
              <surname>Markov</surname>
              <given-names>NS</given-names>
            </name>
            <name name-style="western">
              <surname>Dyer</surname>
              <given-names>EC</given-names>
            </name>
            <name name-style="western">
              <surname>Ramesh</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Pearson</surname>
              <given-names>AT</given-names>
            </name>
          </person-group>
          <article-title>Comparing scientific abstracts generated by ChatGPT to real abstracts with detectors and blinded human reviewers</article-title>
          <source>NPJ Digit Med</source>
          <year>2023</year>
          <month>04</month>
          <day>26</day>
          <volume>6</volume>
          <issue>1</issue>
          <fpage>75</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-023-00819-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-023-00819-6</pub-id>
          <pub-id pub-id-type="medline">37100871</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-023-00819-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC10133283</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kurz</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lau</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Martin</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT: Noch kein Allheilmittel</article-title>
          <source>Dtsch Arztebl International</source>
          <year>2023</year>
          <volume>120</volume>
          <issue>6</issue>
          <fpage>A-230</fpage>
          <lpage>B-202</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ahn</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Exploring ChatGPT for information of cardiopulmonary resuscitation</article-title>
          <source>Resuscitation</source>
          <year>2023</year>
          <month>04</month>
          <volume>185</volume>
          <fpage>109729</fpage>
          <pub-id pub-id-type="doi">10.1016/j.resuscitation.2023.109729</pub-id>
          <pub-id pub-id-type="medline">36773836</pub-id>
          <pub-id pub-id-type="pii">S0300-9572(23)00042-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Aljanabi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ghazi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ali</surname>
              <given-names>AH</given-names>
            </name>
            <name name-style="western">
              <surname>Abed</surname>
              <given-names>SA</given-names>
            </name>
            <collab>ChatGpt</collab>
          </person-group>
          <article-title>ChatGpt: open possibilities</article-title>
          <source>Iraqi J Comp Sci Math</source>
          <year>2023</year>
          <month>01</month>
          <day>18</day>
          <fpage>62</fpage>
          <lpage>64</lpage>
          <pub-id pub-id-type="doi">10.52866/ijcsm.2023.01.01.0018</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mbakwe</surname>
              <given-names>AB</given-names>
            </name>
            <name name-style="western">
              <surname>Lourentzou</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Mechanic</surname>
              <given-names>OJ</given-names>
            </name>
            <name name-style="western">
              <surname>Dagan</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT passing USMLE shines a spotlight on the flaws of medical education</article-title>
          <source>PLOS Digit Health</source>
          <year>2023</year>
          <month>02</month>
          <day>09</day>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>e0000205</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36812618"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000205</pub-id>
          <pub-id pub-id-type="medline">36812618</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-23-00027</pub-id>
          <pub-id pub-id-type="pmcid">PMC9931307</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sanderson</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>GPT-4 is here: what scientists think</article-title>
          <source>Nature</source>
          <year>2023</year>
          <month>03</month>
          <volume>615</volume>
          <issue>7954</issue>
          <fpage>773</fpage>
          <pub-id pub-id-type="doi">10.1038/d41586-023-00816-5</pub-id>
          <pub-id pub-id-type="medline">36928404</pub-id>
          <pub-id pub-id-type="pii">10.1038/d41586-023-00816-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gilson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Safranek</surname>
              <given-names>CW</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Socrates</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Chi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Taylor</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Chartash</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>How does ChatGPT perform on the United States Medical Licensing Examination? The implications of large language models for medical education and knowledge assessment</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>02</month>
          <day>08</day>
          <volume>9</volume>
          <fpage>e45312</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e45312/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/45312</pub-id>
          <pub-id pub-id-type="medline">36753318</pub-id>
          <pub-id pub-id-type="pii">v9i1e45312</pub-id>
          <pub-id pub-id-type="pmcid">PMC9947764</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kung</surname>
              <given-names>TH</given-names>
            </name>
            <name name-style="western">
              <surname>Cheatham</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Medenilla</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sillos</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>De Leon</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Elepaño</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Madriaga</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aggabao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Diaz-Candido</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Maningo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tseng</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title>
          <source>PLOS Digit Health</source>
          <year>2023</year>
          <month>02</month>
          <day>09</day>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>e0000198</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36812645"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id>
          <pub-id pub-id-type="medline">36812645</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-22-00371</pub-id>
          <pub-id pub-id-type="pmcid">PMC9931230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Takagi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Watari</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Erabi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sakaguchi</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Performance of GPT-3.5 and GPT-4 on the Japanese Medical Licensing Examination: comparison study</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>06</month>
          <day>29</day>
          <volume>9</volume>
          <fpage>e48002</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e48002/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48002</pub-id>
          <pub-id pub-id-type="medline">37384388</pub-id>
          <pub-id pub-id-type="pii">v9i1e48002</pub-id>
          <pub-id pub-id-type="pmcid">PMC10365615</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gabriel</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Bhatia</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Lost in Translation: Large Language Models in Non-English Content Analysis</article-title>
          <source>Center for Democracy &#38; Technology</source>
          <year>2023</year>
          <access-date>2024-01-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cdt.org/insights/lost-in-translation-large-language-models-in-non-english-content-analysis/">https://cdt.org/insights/lost-in-translation-large-language-models-in-non-english-content-analysis/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>OpenAI</collab>
          </person-group>
          <source>GPT-4 Technical Report</source>
          <access-date>2024-01-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cdn.openai.com/papers/gpt-4.pdf">https://cdn.openai.com/papers/gpt-4.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>Institut für medizinische und pharmazeutische Prüfungsfragen</collab>
          </person-group>
          <article-title>Zusammenstellung der Prüfungsinhalte für den Zweiten Abschnitt der Ärztlichen Prüfung („Blueprint“) nach derzeit gültiger ÄApprO 2002</article-title>
          <source>IMPP</source>
          <access-date>2023-11-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.impp.de/blueprint-m2-examen.html?file=fi-">https://www.impp.de/blueprint-m2-examen.html?file=fi-</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jünger</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Kompetenzorientiert prüfen im Staatsexamen Medizin [Competence-based assessment in the national licensing examination in Germany]</article-title>
          <source>Bundesgesundheitsblatt Gesundheitsforschung Gesundheitsschutz</source>
          <year>2018</year>
          <month>02</month>
          <day>11</day>
          <volume>61</volume>
          <issue>2</issue>
          <fpage>171</fpage>
          <lpage>177</lpage>
          <pub-id pub-id-type="doi">10.1007/s00103-017-2668-9</pub-id>
          <pub-id pub-id-type="medline">29230515</pub-id>
          <pub-id pub-id-type="pii">10.1007/s00103-017-2668-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="web">
          <article-title>Examen (M2/M3) No.1 in der Examensvorbereitung</article-title>
          <source>AMBOSS</source>
          <access-date>2023-07-15</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.amboss.com/de/examen-m2-m3">https://www.amboss.com/de/examen-m2-m3</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Grabeel</surname>
              <given-names>KL</given-names>
            </name>
            <name name-style="western">
              <surname>Russomanno</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Oelschlegel</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tester</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Heidel</surname>
              <given-names>RE</given-names>
            </name>
          </person-group>
          <article-title>Computerized versus hand-scored health literacy tools: a comparison of Simple Measure of Gobbledygook (SMOG) and Flesch-Kincaid in printed patient education materials</article-title>
          <source>J Med Libr Assoc</source>
          <year>2018</year>
          <month>01</month>
          <volume>106</volume>
          <issue>1</issue>
          <fpage>38</fpage>
          <lpage>45</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/29339932"/>
          </comment>
          <pub-id pub-id-type="doi">10.5195/jmla.2018.262</pub-id>
          <pub-id pub-id-type="medline">29339932</pub-id>
          <pub-id pub-id-type="pii">jmla-106-38</pub-id>
          <pub-id pub-id-type="pmcid">PMC5764592</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Müller</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>here: A Simpler Way to Find Your Files</article-title>
          <source>here</source>
          <access-date>2024-01-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://here.r-lib.org/">https://here.r-lib.org/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
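            <name name-style="western">
              <surname>Chan</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Leeper</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Becker</surname>
              <given-names>J</given-names>
            </name>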
          </person-group>
          <source>rio: A Swiss-Army Knife for Data I/O</source>
          <access-date>2024-01-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cran.r-project.org/web/packages/rio/readme/README.html">https://cran.r-project.org/web/packages/rio/readme/README.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wickham</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Easily Install and Load the Tidyverse</article-title>
          <source>tidyverse</source>
          <access-date>2024-01-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://tidyverse.tidyverse.org/">https://tidyverse.tidyverse.org/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wickham</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Averick</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bryan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>McGowan</surname>
              <given-names>LD</given-names>
            </name>
            <name name-style="western">
              <surname>François</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Grolemund</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Hayes</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Henry</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Hester</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kuhn</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pedersen</surname>
              <given-names>TL</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Bache</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Müller</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Ooms</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Robinson</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Seidel</surname>
              <given-names>DP</given-names>
            </name>
            <name name-style="western">
              <surname>Spinu</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Takahashi</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Vaughan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Wilke</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Woo</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Yutani</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Welcome to the Tidyverse</article-title>
          <source>JOSS</source>
          <year>2019</year>
          <volume>4</volume>
          <issue>43</issue>
          <fpage>1686</fpage>
          <pub-id pub-id-type="doi">10.21105/joss.01686</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Robinson</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Hayes</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Couch</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>broom: Convert Statistical Objects into Tidy Tibbles</article-title>
          <source>broom</source>
          <year>2023</year>
          <access-date>2024-01-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://broom.tidymodels.org/">https://broom.tidymodels.org/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Larmarange</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>labelled: Manipulating Labelled Data</article-title>
          <source>labelled</source>
          <access-date>2024-01-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://larmarange.github.io/labelled/">https://larmarange.github.io/labelled/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sjoberg</surname>
              <given-names>DD</given-names>
            </name>
            <name name-style="western">
              <surname>Whiting</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Curry</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lavery</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Larmarange</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Reproducible Summary Tables with the gtsummary Package</article-title>
          <source>The R Journal</source>
          <year>2021</year>
          <volume>13</volume>
          <issue>1</issue>
          <fpage>570</fpage>
          <lpage>580</lpage>
          <pub-id pub-id-type="doi">10.32614/RJ-2021-053</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sjoberg</surname>
              <given-names>DD</given-names>
            </name>
            <name name-style="western">
              <surname>Whiting</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Curry</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lavery</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Larmarange</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Reproducible Summary Tables with the gtsummary Package</article-title>
          <source>The R Journal</source>
          <year>2021</year>
          <volume>13</volume>
          <issue>1</issue>
          <fpage>570</fpage>
          <pub-id pub-id-type="doi">10.32614/rj-2021-053</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kassambara</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>ggpubr: ‘ggplot2’ Based Publication Ready Plots</article-title>
          <source>ggpubr</source>
          <access-date>2024-01-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://rpkgs.datanovia.com/ggpubr/">https://rpkgs.datanovia.com/ggpubr/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Robin</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Turck</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Hainard</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Tiberti</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Lisacek</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Sanchez</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Müller</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>pROC: an open-source package for R and S+ to analyze and compare ROC curves</article-title>
          <source>BMC Bioinformatics</source>
          <year>2011</year>
          <month>03</month>
          <day>17</day>
          <volume>12</volume>
          <issue>1</issue>
          <fpage>77</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-77"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1471-2105-12-77</pub-id>
          <pub-id pub-id-type="medline">21414208</pub-id>
          <pub-id pub-id-type="pii">1471-2105-12-77</pub-id>
          <pub-id pub-id-type="pmcid">PMC3068975</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Robin</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Turck</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Hainard</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Tiberti</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Lisacek</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Sanchez</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Müller</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>pROC: an open-source package for R and S+ to analyze and compare ROC curves</article-title>
          <source>BMC Bioinformatics</source>
          <year>2011</year>
          <month>03</month>
          <day>17</day>
          <volume>12</volume>
          <issue>1</issue>
          <fpage>77</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-77"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1471-2105-12-77</pub-id>
          <pub-id pub-id-type="medline">21414208</pub-id>
          <pub-id pub-id-type="pii">1471-2105-12-77</pub-id>
          <pub-id pub-id-type="pmcid">PMC3068975</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wickham</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <source>ggplot2: Elegant Graphics for Data Analysis</source>
          <year>2016</year>
          <publisher-loc>New York, NY</publisher-loc>
          <publisher-name>Springer</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wickham</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Henry</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Pedersen</surname>
              <given-names>TL</given-names>
            </name>
            <name name-style="western">
              <surname>Takahashi</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Wilke</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Woo</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Yutani</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Dunnington</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>ggplot2: Create Elegant Data Visualisations Using the Grammar of Graphics</article-title>
          <source>ggplot2</source>
          <access-date>2024-01-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ggplot2.tidyverse.org/reference/ggplot2-package.html">https://ggplot2.tidyverse.org/reference/ggplot2-package.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wilke</surname>
              <given-names>CO</given-names>
            </name>
          </person-group>
          <article-title>cowplot – Streamlined plot theme and plot annotations for ggplot2</article-title>
          <source>cowplot</source>
          <access-date>2024-01-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://wilkelab.org/cowplot/">https://wilkelab.org/cowplot/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lüdecke</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>sjPlot - Data Visualization for Statistics in Social Science</article-title>
          <source>sjPlot</source>
          <access-date>2024-01-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://strengejacke.github.io/sjPlot/">https://strengejacke.github.io/sjPlot/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dietrich</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Leoncio</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>citation: Software Citation Tools</article-title>
          <source>Zenodo</source>
          <access-date>2024-01-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://zenodo.org/records/3909438">https://zenodo.org/records/3909438</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <collab>The R Development Core Team</collab>
          </person-group>
          <source>R: A Language and Environment for Statistical Computing</source>
          <year>2008</year>
          <publisher-loc>Vienna, Austria</publisher-loc>
          <publisher-name>R Foundation for Statistical Computing</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="web">
          <article-title>Herbst 2021 - Ergebnisinformation</article-title>
          <source>Institut für medizinische und pharmazeutische Prüfungsfragen</source>
          <access-date>2024-01-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.impp.de/pruefungen/medizin/archiv-medizin.html?file=files/PDF/Pr%C3%BCfungsergebnisse/Pr%C3%BCfungsergebnisse/ErgMedM2H2021APPO2012.pdf">https://www.impp.de/pruefungen/medizin/archiv-medizin.html?file=files/PDF/Pr%C3%BCfungsergebnisse/Pr%C3%BCfungsergebnisse/ErgMedM2H2021APPO2012.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>van Dis</surname>
              <given-names>EAM</given-names>
            </name>
            <name name-style="western">
              <surname>Bollen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zuidema</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>van Rooij</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bockting</surname>
              <given-names>CL</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT: five priorities for research</article-title>
          <source>Nature</source>
          <year>2023</year>
          <month>02</month>
          <volume>614</volume>
          <issue>7947</issue>
          <fpage>224</fpage>
          <lpage>226</lpage>
          <pub-id pub-id-type="doi">10.1038/d41586-023-00288-7</pub-id>
          <pub-id pub-id-type="medline">36737653</pub-id>
          <pub-id pub-id-type="pii">10.1038/d41586-023-00288-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huh</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Are ChatGPT’s knowledge and interpretation ability comparable to those of medical students in Korea for taking a parasitology examination?: a descriptive study</article-title>
          <source>J Educ Eval Health Prof</source>
          <year>2023</year>
          <volume>20</volume>
          <fpage>1</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36627845"/>
          </comment>
          <pub-id pub-id-type="doi">10.3352/jeehp.2023.20.1</pub-id>
          <pub-id pub-id-type="medline">36627845</pub-id>
          <pub-id pub-id-type="pii">jeehp.2023.20.1</pub-id>
          <pub-id pub-id-type="pmcid">PMC9905868</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jung</surname>
              <given-names>LB</given-names>
            </name>
            <name name-style="western">
              <surname>Gudera</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Wiegand</surname>
              <given-names>TLT</given-names>
            </name>
            <name name-style="western">
              <surname>Allmendinger</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dimitriadis</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Koerte</surname>
              <given-names>IK</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT besteht schriftliche medizinische Staatsexamina nach Ausschluss der Bildfragen</article-title>
          <source>Dtsch Arztebl International</source>
          <year>2023</year>
          <volume>120</volume>
          <fpage>373</fpage>
          <lpage>374</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Venigalla</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Frankle</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Carbin</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>BioMedLM: a Domain-Specific Large Language Model for Biomedical Text</article-title>
          <source>Mosaic ML</source>
          <year>2023</year>
          <access-date>2024-01-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mosaicml.com/blog/introducing-pubmed-gpt">https://www.mosaicml.com/blog/introducing-pubmed-gpt</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Pan</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Oufattole</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Fang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Szolovits</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>What disease does this patient have? A large-scale open domain question answering dataset from medical exams</article-title>
          <source>Applied Sciences</source>
          <year>2021</year>
          <month>07</month>
          <day>12</day>
          <volume>11</volume>
          <issue>14</issue>
          <fpage>6421</fpage>
          <pub-id pub-id-type="doi">10.3390/app11146421</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="web">
          <article-title>Frühjahr 2022 - Ergebnisinformation</article-title>
          <source>Institut für medizinische und pharmazeutische Prüfungsfragen</source>
          <access-date>2024-01-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.impp.de/pruefungen/medizin/archiv-medizin.html?file=files/PDF/Pr%C3%BCfungsergebnisse/Pr%C3%BCfungsergebnisse/ErgMedM2F2022APPO2012.pdf">https://www.impp.de/pruefungen/medizin/archiv-medizin.html?file=files/PDF/Pr%C3%BCfungsergebnisse/Pr%C3%BCfungsergebnisse/ErgMedM2F2022APPO2012.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="web">
          <article-title>Herbst 2022 - Ergebnisinformation</article-title>
          <source>Institut für medizinische und pharmazeutische Prüfungsfragen</source>
          <access-date>2024-01-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.impp.de/pruefungen/medizin/archiv-medizin.html?file=files/PDF/Pr%C3%BCfungsergebnisse/Pr%C3%BCfungsergebnisse/ErgMedM2H2022.pdf">https://www.impp.de/pruefungen/medizin/archiv-medizin.html?file=files/PDF/Pr%C3%BCfungsergebnisse/Pr%C3%BCfungsergebnisse/ErgMedM2H2022.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mogali</surname>
              <given-names>SR</given-names>
            </name>
          </person-group>
          <article-title>Initial impressions of ChatGPT for anatomy education</article-title>
          <source>Anat Sci Educ</source>
          <year>2023</year>
          <month>02</month>
          <day>07</day>
          <fpage>n/a</fpage>
          <pub-id pub-id-type="doi">10.1002/ase.2261</pub-id>
          <pub-id pub-id-type="medline">36749034</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thirunavukarasu</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Hassan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Mahmood</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sanghera</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Barzangi</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>El Mukashfi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Trialling a large language model (ChatGPT) in general practice with the applied knowledge test: observational study demonstrating opportunities and limitations in primary care</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>04</month>
          <day>21</day>
          <volume>9</volume>
          <fpage>e46599</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e46599/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/46599</pub-id>
          <pub-id pub-id-type="medline">37083633</pub-id>
          <pub-id pub-id-type="pii">v9i1e46599</pub-id>
          <pub-id pub-id-type="pmcid">PMC10163403</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>GBD 2019 Mental Disorders Collaborators</collab>
          </person-group>
          <article-title>Global, regional, and national burden of 12 mental disorders in 204 countries and territories, 1990-2019: a systematic analysis for the Global Burden of Disease Study 2019</article-title>
          <source>Lancet Psychiatry</source>
          <year>2022</year>
          <month>02</month>
          <volume>9</volume>
          <issue>2</issue>
          <fpage>137</fpage>
          <lpage>150</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2215-0366(21)00395-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/S2215-0366(21)00395-3</pub-id>
          <pub-id pub-id-type="medline">35026139</pub-id>
          <pub-id pub-id-type="pii">S2215-0366(21)00395-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC8776563</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lau</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>KI-Chatbot könnte Therapiegespräche empathischer machen</article-title>
          <source>aerzteblatt.de</source>
          <access-date>2023-03-14</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aerzteblatt.de/nachrichten/140445/KI-Chatbot-koennte-Therapiegespraeche-empathischer-machen">https://www.aerzteblatt.de/nachrichten/140445/KI-Chatbot-koennte-Therapiegespraeche-empathischer-machen</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Budler</surname>
              <given-names>LC</given-names>
            </name>
            <name name-style="western">
              <surname>Gosak</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Stiglic</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Review of artificial intelligence‐based question‐answering systems in healthcare</article-title>
          <source>WIREs Data Mining and Knowledge Discovery</source>
          <year>2023</year>
          <month>01</month>
          <day>10</day>
          <volume>13</volume>
          <issue>2</issue>
          <fpage>e1487</fpage>
          <pub-id pub-id-type="doi">10.1002/widm.1487</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Perlis</surname>
              <given-names>RH</given-names>
            </name>
            <name name-style="western">
              <surname>Iosifescu</surname>
              <given-names>DV</given-names>
            </name>
            <name name-style="western">
              <surname>Castro</surname>
              <given-names>VM</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>SN</given-names>
            </name>
            <name name-style="western">
              <surname>Gainer</surname>
              <given-names>VS</given-names>
            </name>
            <name name-style="western">
              <surname>Minnier</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Goryachev</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zeng</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Gallagher</surname>
              <given-names>PJ</given-names>
            </name>
            <name name-style="western">
              <surname>Fava</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Weilburg</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Churchill</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Kohane</surname>
              <given-names>IS</given-names>
            </name>
            <name name-style="western">
              <surname>Smoller</surname>
              <given-names>JW</given-names>
            </name>
          </person-group>
          <article-title>Using electronic medical records to enable large-scale studies in psychiatry: treatment resistant depression as a model</article-title>
          <source>Psychol Med</source>
          <year>2011</year>
          <month>06</month>
          <day>20</day>
          <volume>42</volume>
          <issue>1</issue>
          <fpage>41</fpage>
          <lpage>50</lpage>
          <pub-id pub-id-type="doi">10.1017/s0033291711000997</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Van Le</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Montgomery</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kirkby</surname>
              <given-names>KC</given-names>
            </name>
            <name name-style="western">
              <surname>Scanlan</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Risk prediction using natural language processing of electronic mental health records in an inpatient forensic psychiatry setting</article-title>
          <source>J Biomed Inform</source>
          <year>2018</year>
          <month>10</month>
          <volume>86</volume>
          <fpage>49</fpage>
          <lpage>58</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(18)30162-X"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2018.08.007</pub-id>
          <pub-id pub-id-type="medline">30118855</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(18)30162-X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thirunavukarasu</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ting</surname>
              <given-names>DSJ</given-names>
            </name>
            <name name-style="western">
              <surname>Elangovan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gutierrez</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>TF</given-names>
            </name>
            <name name-style="western">
              <surname>Ting</surname>
              <given-names>DSW</given-names>
            </name>
          </person-group>
          <article-title>Large language models in medicine</article-title>
          <source>Nat Med</source>
          <year>2023</year>
          <month>08</month>
          <day>17</day>
          <volume>29</volume>
          <issue>8</issue>
          <fpage>1930</fpage>
          <lpage>1940</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id>
          <pub-id pub-id-type="medline">37460753</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-023-02448-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Epstein</surname>
              <given-names>RH</given-names>
            </name>
            <name name-style="western">
              <surname>Dexter</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Variability in large language models' responses to medical licensing and certification examinations. Comment on "How Does ChatGPT Perform on the United States Medical Licensing Examination? The Implications of Large Language Models for Medical Education and Knowledge Assessment"</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>07</month>
          <day>13</day>
          <volume>9</volume>
          <fpage>e48305</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e48305/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48305</pub-id>
          <pub-id pub-id-type="medline">37440293</pub-id>
          <pub-id pub-id-type="pii">v9i1e48305</pub-id>
          <pub-id pub-id-type="pmcid">PMC10375390</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Seth</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Hueppchen</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>SD</given-names>
            </name>
            <name name-style="western">
              <surname>Rudzicz</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Ding</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Parakh</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Record</surname>
              <given-names>JD</given-names>
            </name>
          </person-group>
          <article-title>Data science as a core competency in undergraduate medical education in the age of artificial intelligence in health care</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>07</month>
          <day>11</day>
          <volume>9</volume>
          <fpage>e46344</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e46344/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/46344</pub-id>
          <pub-id pub-id-type="medline">37432728</pub-id>
          <pub-id pub-id-type="pii">v9i1e46344</pub-id>
          <pub-id pub-id-type="pmcid">PMC10369309</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nov</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Mann</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Putting ChatGPT's medical advice to the (Turing) test: survey study</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>07</month>
          <day>10</day>
          <volume>9</volume>
          <fpage>e46939</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e46939/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/46939</pub-id>
          <pub-id pub-id-type="medline">37428540</pub-id>
          <pub-id pub-id-type="pii">v9i1e46939</pub-id>
          <pub-id pub-id-type="pmcid">PMC10366957</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref58">
        <label>58</label>
        <nlm-citation citation-type="web">
          <article-title>FAQ: Häufig gefragte Fragen</article-title>
          <source>MEDI-LEARN</source>
          <access-date>2024-01-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mlmr.de/unis/faq/#faq1">https://www.mlmr.de/unis/faq/#faq1</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref59">
        <label>59</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Duong</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Solomon</surname>
              <given-names>BD</given-names>
            </name>
          </person-group>
          <article-title>Analysis of large-language model versus human performance for genetics questions</article-title>
          <source>Eur J Hum Genet</source>
          <year>2023</year>
          <month>05</month>
          <day>29</day>
          <fpage>2023</fpage>
          <pub-id pub-id-type="doi">10.1038/s41431-023-01396-8</pub-id>
          <pub-id pub-id-type="medline">37246194</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41431-023-01396-8</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
