<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JME</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id>
      <journal-title>JMIR Medical Education</journal-title>
      <issn pub-type="epub">2369-3762</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v10i1e51391</article-id>
      <article-id pub-id-type="pmid">38349725</article-id>
      <article-id pub-id-type="doi">10.2196/51391</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Learning to Make Rare and Complex Diagnoses With Generative AI Assistance: Qualitative Study of Popular Large Language Models</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Venkatesh</surname>
            <given-names>Kaushik</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Kamel Boulos</surname>
            <given-names>Maged N.</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Modersohn</surname>
            <given-names>Luise</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Ghanvatkar</surname>
            <given-names>Suparna</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Abdullahi</surname>
            <given-names>Tassallah</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6572-2707</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Singh</surname>
            <given-names>Ritambhara</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7523-160X</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Eickhoff</surname>
            <given-names>Carsten</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <address>
            <institution>School of Medicine</institution>
            <institution>University of Tübingen</institution>
            <addr-line>Schaffhausenstr, 77</addr-line>
            <addr-line>Tübingen, 72072</addr-line>
            <country>Germany</country>
            <phone>49 7071 29 843</phone>
            <email>carsten.eickhoff@uni-tuebingen.de</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9895-4061</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Computer Science</institution>
        <institution>Brown University</institution>
        <addr-line>Providence, RI</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Center for Computational Molecular Biology</institution>
        <institution>Brown University</institution>
        <addr-line>Providence, RI</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>School of Medicine</institution>
        <institution>University of Tübingen</institution>
        <addr-line>Tübingen</addr-line>
        <country>Germany</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Carsten Eickhoff <email>carsten.eickhoff@uni-tuebingen.de</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>13</day>
        <month>2</month>
        <year>2024</year>
      </pub-date>
      <volume>10</volume>
      <elocation-id>e51391</elocation-id>
      <history>
        <date date-type="received">
          <day>30</day>
          <month>7</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>20</day>
          <month>10</month>
          <year>2023</year>
        </date>
        <date date-type="rev-recd">
          <day>7</day>
          <month>11</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>11</day>
          <month>12</month>
          <year>2023</year>
        </date>
      </history>
      <copyright-statement>©Tassallah Abdullahi, Ritambhara Singh, Carsten Eickhoff. Originally published in JMIR Medical Education (https://mededu.jmir.org), 13.02.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on https://mededu.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://mededu.jmir.org/2024/1/e51391" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Patients with rare and complex diseases often experience delayed diagnoses and misdiagnoses because comprehensive knowledge about these diseases is limited to only a few medical experts. In this context, large language models (LLMs) have emerged as powerful knowledge aggregation tools with applications in clinical decision support and education domains.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to explore the potential of 3 popular LLMs, namely Bard (Google LLC), ChatGPT-3.5 (OpenAI), and GPT-4 (OpenAI), in medical education to enhance the diagnosis of rare and complex diseases while investigating the impact of prompt engineering on their performance.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We conducted experiments on publicly available complex and rare cases to achieve these objectives. We implemented various prompt strategies to evaluate the performance of these models using both open-ended and multiple-choice prompts. In addition, we used a majority voting strategy to leverage diverse reasoning paths within language models, aiming to enhance their reliability. Furthermore, we compared their performance with the performance of human respondents and MedAlpaca, a generative LLM specifically designed for medical tasks.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Notably, all LLMs outperformed the average human consensus and MedAlpaca, with a minimum margin of 5% and 13%, respectively, across all 30 cases from the diagnostic case challenge collection. On the frequently misdiagnosed cases category, Bard tied with MedAlpaca but surpassed the human average consensus by 14%, whereas GPT-4 and ChatGPT-3.5 outperformed MedAlpaca and the human respondents on the moderately often misdiagnosed cases category with minimum accuracy scores of 28% and 11%, respectively. The majority voting strategy, particularly with GPT-4, demonstrated the highest overall score across all cases from the diagnostic complex case collection, surpassing that of other LLMs. On the Medical Information Mart for Intensive Care-III data sets, Bard and GPT-4 achieved the highest diagnostic accuracy scores, with multiple-choice prompts scoring 93%, whereas ChatGPT-3.5 and MedAlpaca scored 73% and 47%, respectively. Furthermore, our results demonstrate that there is no one-size-fits-all prompting approach for improving the performance of LLMs and that a single strategy does not universally apply to all LLMs.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Our findings shed light on the diagnostic capabilities of LLMs and the challenges associated with identifying an optimal prompting strategy that aligns with each language model’s characteristics and specific task requirements. The significance of prompt engineering is highlighted, providing valuable insights for researchers and practitioners who use these language models for medical training. Furthermore, this study represents a crucial step toward understanding how LLMs can enhance diagnostic reasoning in rare and complex medical cases, paving the way for developing effective educational tools and accurate diagnostic aids to improve patient care and outcomes.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>clinical decision support</kwd>
        <kwd>rare diseases</kwd>
        <kwd>complex diseases</kwd>
        <kwd>prompt engineering</kwd>
        <kwd>reliability</kwd>
        <kwd>consistency</kwd>
        <kwd>natural language processing</kwd>
        <kwd>language model</kwd>
        <kwd>Bard</kwd>
        <kwd>ChatGPT 3.5</kwd>
        <kwd>GPT-4</kwd>
        <kwd>MedAlpaca</kwd>
        <kwd>medical education</kwd>
        <kwd>complex diagnosis</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>AI assistance</kwd>
        <kwd>medical training</kwd>
        <kwd>prediction model</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Natural language processing has witnessed remarkable advances with the introduction of generative large language models (LLMs). In November 2022, OpenAI released ChatGPT-3.5 (OpenAI), a large natural language processing chatbot trained on a large corpus collected from the internet to generate humanlike text in response to user queries. ChatGPT-3.5 has seen massive popularity, and users have praised its creativity and language comprehension for several tasks, such as text summarization and writing computer programs [<xref ref-type="bibr" rid="ref1">1</xref>]. In March 2023, OpenAI responded to the success of ChatGPT-3.5 by introducing an enhanced iteration called GPT-4, specifically designed to address intricate queries and nuanced directives more effectively. Shortly thereafter, Google released their comparable model, Bard (Google LLC), which joined the league of impressive LLMs. What sets Bard apart is its real-time access to and use of internet information, enriching its response generation with up-to-date information [<xref ref-type="bibr" rid="ref2">2</xref>]. In contrast, GPT-4 possesses multimodal capabilities, including image inputs, albeit not publicly available during the study [<xref ref-type="bibr" rid="ref3">3</xref>].</p>
        <p>These LLMs were not originally designed for medical applications. However, several studies [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>] have shown their extraordinary capabilities in excelling in various medical examinations, such as the Self-Assessment in Neurological Surgery examination and the USMLE (United States Medical Licensing Examination). Their results demonstrated the ability of these models to handle clinical information and complex counterfactuals. Furthermore, numerous investigations [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref8">8</xref>] have revealed the remarkable advantages of harnessing the power of LLMs in diverse medical scenarios. Notably, Lee et al [<xref ref-type="bibr" rid="ref8">8</xref>] demonstrated using LLMs as a reliable conversational agent to collect patient information to assist in medical notetaking, whereas Patel and Lam [<xref ref-type="bibr" rid="ref9">9</xref>] delved into using LLMs as a valuable tool for generating comprehensive patient discharge summaries. The ability of LLMs to process and generate medical text has unlocked new opportunities to enhance diagnostic reasoning, particularly in tackling rare and complex medical cases.</p>
        <p>Rare diseases are characterized by their low prevalence in the general population, whereas complex diseases are conditions with overlapping factors and multiple comorbidities that are often difficult to diagnose [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. Sometimes, a condition can be rare and complex if it is infrequent and challenging to diagnose accurately [<xref ref-type="bibr" rid="ref11">11</xref>]. Rare and complex diagnoses present significant challenges across various medical levels and often require extensive medical knowledge or expertise for accurate diagnosis and management [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. This may be because, during their education, physicians are trained to prioritize ruling out common diagnoses before considering rare ones during patient evaluation [<xref ref-type="bibr" rid="ref12">12</xref>]. In addition, most medical education programs rarely cover some complex conditions, and guidance for practicing clinicians is often outdated and inappropriate [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. As a result, most physicians perceive their knowledge of rare diseases as insufficient or very poor, and only a few feel adequately prepared to care for patients with these conditions [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. This knowledge gap increases the risk of misdiagnosis among individuals with rare and complex conditions. Furthermore, the scarcity of available data and the relatively small number of affected individuals create a complicated diagnostic landscape, even for experienced and specialized clinicians [<xref ref-type="bibr" rid="ref10">10</xref>]. Consequently, patients often endure a prolonged and arduous diagnostic process. Therefore, there is a pressing need for comprehensive educational tools and accurate diagnostic aids to fill the knowledge gap and address these challenges effectively.</p>
        <p>This study aims to explore the potential of 3 LLMs, namely Bard, GPT-4, and ChatGPT-3.5, as continuing medical education (CME) systems to enhance the diagnoses of rare and complex conditions. Although these models have demonstrated impressive success in standardized medical examinations [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>], it is important to acknowledge that most examinations reflect general clinical situations, which may not fully capture the intricacies encountered in real-world diagnostic scenarios. Furthermore, these standardized tests often feature questions that can be answered through memorization [<xref ref-type="bibr" rid="ref16">16</xref>]. In contrast, real-world complex diagnostic scenarios that physicians face involve dynamic, multifaceted patient cases with numerous variables and uncertainties. Although previous studies by Liu et al [<xref ref-type="bibr" rid="ref17">17</xref>] and Cascella et al [<xref ref-type="bibr" rid="ref18">18</xref>] have highlighted the ability of LLMs to support health care professionals in real-world scenarios, their effectiveness in diagnosing rare and complex conditions remains an area of exploration. Despite the promising use of LLMs in medical applications, studies have reported that their responses to user queries are often nondeterministic (ie, depending on the query format) and exhibit significant variance [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. This attribute may pose challenges in clinical decision support scenarios because the dependability of a system is uncertain when its behavior cannot be accurately predicted. However, no investigation has been conducted to show how different input formats (prompts) affect LLM responses in the medical context.</p>
        <p>Prompt engineering is a technique for carefully designing queries (inputs) to improve the performance of generative language models [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. We can guide LLMs to generate more accurate and reliable responses by carefully crafting effective prompts. Our study investigated effective prompting strategies to improve the accuracy and reliability of LLMs in diagnosing rare and complex conditions within an educational context. We evaluated the performance of LLMs by comparing their responses to those of human respondents and the responses of MedAlpaca [<xref ref-type="bibr" rid="ref22">22</xref>], an open-source generative LLM designed for medical tasks. Given the documented advantages of using LLMs as a complementary tool rather than a substitute for clinicians [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>], our study incorporated LLMs with the understanding that clinicians may use them beyond real-time diagnostic scenarios. Although our premise is based on a clinician having established an initial diagnostic hypothesis and seeking further assistance to refine the precise diagnosis, we acknowledge the broader utility of LLMs. They can be valuable in real-time decision support and retrospective use during leisure or documentation, allowing physicians to experiment with and enhance their understanding of rare and complex diseases. This approach recognizes the inherent uncertainty in diagnosis and harnesses the capabilities of LLMs to assist clinicians in various aspects of their diagnostic processes. In the context of CME, our study highlights the possibility of integrating LLMs as a valuable addition. By providing further assistance in refining complex and rare diagnoses, these LLMs could support evidence-based decision-making among health care professionals for improved patient outcomes.</p>
      </sec>
      <sec>
        <title>Objectives</title>
        <p>Our study has 2 main objectives: first, to examine the potential of LLMs as a CME tool for diagnosing rare and complex conditions, and second, to highlight the impact of prompt formatting on the performance of LLMs. Understanding these aspects could significantly contribute to advancing diagnostic practices and effectively using LLMs to improve patient care.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Sets</title>
        <p>We used 2 data sets to examine the capacity of LLMs to diagnose rare and complex conditions as follows:</p>
        <list list-type="order">
          <list-item>
            <p>Diagnostic case challenge collection (DC3) [<xref ref-type="bibr" rid="ref11">11</xref>] comprises 30 complex diagnostic cases curated by medical experts in the <italic>New England Journal of Medicine</italic> web-based case challenges. The original cases contained text and image descriptions of patients’ medical history, diagnostic imaging, and laboratory results; however, we used only textual information to form prompts (queries). The web-based polls recorded an average of 5850 (SD 2522.84) respondents per case, many of whom were health care professionals. The participants were required to identify the correct diagnosis from a list of differential diagnoses. Case difficulty was categorized based on the percentage of correct responses received from the respondents on the web-based survey. The case categories were: “rarely misdiagnosed cases” (with ≥21/30, 70% correct responses), “moderately misdiagnosed cases” (with &gt;9/30, 30% and &lt;21/30, 70% correct responses), and “frequently misdiagnosed cases” (with ≤9/30, 30% correct responses). Furthermore, the final diagnoses determined by the treating physicians of the cases were provided alongside the poll results, enabling the comparison of the performance of human respondents with that of the targeted LLMs.</p>
          </list-item>
          <list-item>
            <p>Medical Information Mart for Intensive Care-III (MIMIC-III) [<xref ref-type="bibr" rid="ref23">23</xref>] comprises deidentified electronic health record data from approximately 50,000 Boston Beth Israel Deaconess Medical Center intensive care unit patients. We focused on discharge summaries containing the accumulated patient information from admission to discharge. Similar to previous work on clinical outcome prediction by van Aken et al [<xref ref-type="bibr" rid="ref24">24</xref>] and Abdullahi et al [<xref ref-type="bibr" rid="ref25">25</xref>], we filtered document sections unrelated to admissions, such as discharge information or hospital course and retained sections related to admissions, such as chief complaint, history of illness or present illness, medical history, admission medications, allergies, physical examination, family history, and social history. Each discharge summary had a discharge diagnosis section that indicated the patient’s final diagnosis for that admission. We reviewed the discharge summaries to identify rare diseases and referred to the Orphanet website [<xref ref-type="bibr" rid="ref26">26</xref>]. In this study, we randomly selected 15 unique, rare conditions as our target. These cases were selected as pilot studies for a focused and in-depth analysis.</p>
          </list-item>
        </list>
      </sec>
      <sec>
        <title>Models</title>
        <p>In this study, we conducted experiments using LLMs designed for conversational context. Specifically, we used the July 6, 2023, version of Bard; the July 4, 2023, versions of GPT-4 and ChatGPT-3.5; and the publicly available version of MedAlpaca 7b [<xref ref-type="bibr" rid="ref22">22</xref>]. We entered prompts individually through the chat interface to evaluate Bard, GPT-4, and ChatGPT-3.5, treating each prompt as a distinct conversation. MedAlpaca differs from Bard, ChatGPT-3.5, and GPT-4 in that it requires users to submit queries or prompts through a Python (Python Software Foundation) script. Consequently, we used a single Python script for each prompt strategy to submit queries for each data set. It is worth noting that Bard has certain limitations compared with ChatGPT-3.5 and GPT-4. Bard has a restricted capacity to handle lengthy queries. Moreover, Bard is more sensitive to noisy input and specific characters. For example, the MIMIC-III data set contained deidentified patients’ notes filled with special characters such as “[**Hospital 18654**]” and laboratory results written in shorthand, for example, <italic>* Hgb-9.6* Hct-29.7* MCV-77* MCH-24.9*.</italic> Consequently, to work effectively with Bard, we preprocessed the text by removing special characters and retaining only alphanumeric characters.</p>
      </sec>
      <sec>
        <title>Prompting Strategies</title>
        <p>Direct (standard prompting) and iterative prompting (chain of thought prompting) [<xref ref-type="bibr" rid="ref27">27</xref>] are the 2 major prompting methods. Iterative prompting is a promising method for improving LLM performance on specialized tasks; however, it requires a predefined set of manually annotated reasoning steps, which can be time consuming and difficult to create, especially for specialized domains. Most users opt for a direct prompt method to save time and obtain an immediate response. Therefore, to analyze the effect of prompt formats on LLM performance, we assessed each model’s performance for every case using the 3 distinct direct prompt strategies outlined in <xref ref-type="table" rid="table1">Table 1</xref>. These strategies varied from open-ended to multiple-choice formats.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Prompt strategies.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="150"/>
            <col width="410"/>
            <col width="440"/>
            <thead>
              <tr valign="top">
                <td>Approach</td>
                <td>Prompt strategy description</td>
                <td>Prompt sample</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Approach 1 (open-ended prompt)</td>
                <td>In this approach, prompts were formatted in an open-ended fashion. Formatting a prompt using this method allows the model to formulate a hypothesis for the case and explain why and what it thinks is the diagnosis. Here, we scored a model based on its ability to provide the correct diagnosis without additional assistance.</td>
                <td>“What is the diagnosis? The case is: A 32-year-old man was evaluated in the emergency department of this hospital for the abrupt onset of postprandial chest pain...”</td>
              </tr>
              <tr valign="top">
                <td>Approach 2 (multiple-choice prompt)</td>
                <td>We formatted prompts as multiple-choice questions, and the LLMs<sup>a</sup> were expected to select a single diagnosis from a list of options. The models were assigned a positive score in this task if they selected the correct diagnosis from the options.</td>
                <td>“Choose the most likely diagnosis from the following: Option I: Cholecystitis, Option II: Acute coronary syndrome, Option III: Pericarditis, Option IV: Budd-Chiari syndrome. The case is: A 32-year-old man was evaluated in the emergency department of this hospital for the abrupt onset of postprandial chest pain...”</td>
              </tr>
              <tr valign="top">
                <td>Approach 3 (ranking prompt)</td>
                <td>The prompts were presented as a case and a list of diagnoses to be ranked by the LLMs. Models were assigned a positive score if the correct diagnosis was ranked first in this format.</td>
                <td>“Rank the following diagnoses according to the most likely. Option I: Cholecystitis, Option II: Acute coronary syndrome, Option III: Pericarditis, Option IV: Budd-Chiari syndrome. The case is: A 32-year-old man was evaluated in the emergency department of this hospital for the abrupt onset of postprandial chest pain...”</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>LLM: large language model.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>Building upon prior research by Wang et al [<xref ref-type="bibr" rid="ref28">28</xref>] and Li et al [<xref ref-type="bibr" rid="ref29">29</xref>], we hypothesized that using a diverse range of prompts can reveal distinct reasoning paths while maintaining consistency in the correct responses regardless of the variations. When using multiple-choice prompts for the DC3 cases, we presented the same options available in the original web-based polls to the models, but on the MIMIC-III data set, we generated random wrong answers that were closely related to the correct diagnosis. We evaluated each LLM by assigning a positive or negative score (binary score) based on their responses. A positive score was assigned only if the models correctly selected the diagnosis for either data set. Conversely, we omitted the options for open-ended prompts, expecting the models to generate the correct diagnosis independently. Positive scores were awarded only if the models accurately provided the correct diagnosis.</p>
      </sec>
      <sec>
        <title>Prompt Ensemble: Majority Voting</title>
        <p>To safely use imperfect language models, users must determine when to trust their predictions, particularly in critical situations, such as clinical decision support. Therefore, we used a majority voting (prompt ensembling) strategy to enhance the reliability of LLMs’ responses. The majority voting approach involves aggregating multiple responses and selecting the most common answer. By applying this approach to responses generated by different LLMs, we can observe the level of agreement and infer the consistency in their outputs for a given prompt. Specifically, we hypothesized that using a majority voting approach from the ensemble of prompt responses would boost the reliability of language models, minimizing potential errors, variations, and biases associated with individual prompting approaches. To achieve this, in independent chats, we prompted the LLM with 3 distinct prompt formats per case, as presented in <xref ref-type="table" rid="table1">Table 1</xref>. Subsequently, we collected the responses of each model and applied majority voting to aggregate its predictions, as presented in <xref rid="figure1" ref-type="fig">Figure 1</xref>. In majority voting, each prompt produced a response from the language model, and the majority response was chosen as the final response. In a scenario where all prompt strategies resulted in different responses, we assumed that the model was unsure of that question and scored the final response as a failure case. We limited the number of prompts in the ensemble to 3 because studies by Wang et al [<xref ref-type="bibr" rid="ref28">28</xref>] and Li et al [<xref ref-type="bibr" rid="ref29">29</xref>] have shown that we obtain diminishing returns as we increase the overall number of prompts in an ensemble.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Our proposed method contains the following steps: (1) prompt a language model using a distinct set of prompts, (2) obtain diverse responses, and (3) choose the most consistent response as the final answer (majority voting).</p>
          </caption>
          <graphic xlink:href="mededu_v10i1e51391_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>No ethics approval was pursued for this research, given that the data were publicly accessible and deidentified. This aligns with the guidelines outlined in the National Institutes of Health investigator manual for human subjects research [<xref ref-type="bibr" rid="ref30">30</xref>].</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Performance Across Prompt Strategies</title>
        <p><xref rid="figure2" ref-type="fig">Figure 2</xref> reveals the performance of LLMs across different prompts on the DC3 data set. Overall, approach 2 (multiple-choice prompt) yielded the highest score for all 30 cases, with GPT-4 and Bard achieving an accuracy score of 47% (14/30) and ChatGPT-3.5 obtaining a score of 43% (13/30). However, when considering case difficulty, the results varied. On the frequently misdiagnosed cases category, GPT-4 and ChatGPT-3.5 performed better with open-ended prompts (approach 1), scoring 30% (3/10) and 20% (2/10), respectively. In contrast, Bard demonstrated superior performance with multiple-choice prompts for selection and ranking (approaches 2 and 3), achieving a score of 30% (3/10). ChatGPT-3.5 and Bard performed equally well on the rarely misdiagnosed cases category using approaches 2 and 3, achieving a perfect score of 100% (2/2). Furthermore, GPT-4 attained a score of 100% (2/2) but only with approach 2. For the moderately misdiagnosed cases category, all LLMs achieved their best performance with approach 2, scoring 67% (12/18), 56% (10/18), and 50% (9/18) for GPT-4, ChatGPT-3.5, and Bard, respectively. Table S1 in the <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> presents the inconsistencies in the correct responses across the approaches for different cases. For example, Bard could only diagnose milk alkali syndrome using approach 1 but failed to use other prompt approaches. ChatGPT-3.5 correctly diagnosed primary adrenal insufficiency (Addison disease) with only approach 2, whereas GPT-4 was able to diagnose acute hepatitis E virus infection with only approach 1. These results indicate that no universal prompt approach is optimal for all LLMs when dealing with complex cases.</p>
        <p>Results on the MIMIC-III data set in <xref rid="figure3" ref-type="fig">Figure 3</xref> showed that the LLMs also performed best using approach 2 (multiple-choice prompt), with Bard and GPT-4 obtaining scores of 93% (14/15) each and ChatGPT-3.5 obtaining 73% (11/15). Using approach 3 (ranking prompt) resulted in a slight drop in performance for GPT-4 and Bard, with a 6% decrease, whereas the performance of ChatGPT-3.5 dropped by 26%. Approach 1 (open-ended prompt) proved challenging for the LLMs, with scores of 47% (7/15), 60% (9/15), and 27% (4/15) for Bard, GPT-4, and ChatGPT-3.5, respectively. Table S2 in the <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> illustrates that approach 1 was only beneficial to GPT-4 in diagnosing amyloidosis, whereas it was consistently never the sole correct approach for Bard and ChatGPT-3.5. These results aligned with the findings from the DC3 data set and emphasized the varying performances of different models and prompt approaches across tasks.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Results of the diagnostic case challenge collection data set comparing prompt strategies. OpenAI GPT-4 outperformed all other models, achieving the highest score in all 30 cases using the majority voting approach. Furthermore, all large language models except MedAlpaca outperformed the human consensus (denoted by a black dashed line) across all cases, regardless of the difficulty, using at least 1 prompt approach. GPT-4: generative pretrained transformer-4.</p>
          </caption>
          <graphic xlink:href="mededu_v10i1e51391_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Results of the Medical Information Mart for Intensive Care-III data set across prompt strategies. Approach 1 (open-ended prompt) proved challenging for all the large language models compared with approach 2 (multiple-choice prompt) and approach 3 (ranking prompt).</p>
          </caption>
          <graphic xlink:href="mededu_v10i1e51391_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Performance With Majority Voting</title>
        <p>Previous experiments have demonstrated that there is no perfect prompting strategy because LLM users may not know beforehand which prompt will produce a correct response. We used the majority voting approach to estimate consistency, maximize the benefits of different prompt strategies, and enhance the reliability of the LLMs’ responses. <xref rid="figure2" ref-type="fig">Figure 2</xref> illustrates the results for all DC3 cases. Majority voting improved the overall performance of GPT-4 from 47% to 50%, whereas the performance of ChatGPT-3.5 remained at 43% because majority voting did not decrease its performance compared with that of approach 2. In contrast, the performance of Bard decreased from 47% to 43% compared with that of approach 2. Summarizing the overall performance based on query difficulty, majority voting resulted in a perfect score of 100% for the rarely misdiagnosed cases category across all the LLMs. For the frequently misdiagnosed cases category in DC3, Bard achieved the highest score with majority voting and multiple-choice prompts, whereas GPT-4 performed best for the moderately misdiagnosed cases category with majority voting and approach 2. In addition, GPT-4 outperformed all other LLMs across all DC3 cases using the majority voting approach, regardless of the case difficulty. This score surpassed the performance of the individual prompt approaches in all cases.</p>
        <p>Results on the MIMIC-III data set in <xref rid="figure3" ref-type="fig">Figure 3</xref> showed that the scores with majority voting were 87% (13/15) for GPT-4 and Bard each and 53% (8/15) for ChatGPT-3.5. These results indicate that the ensemble method did not substantially improve their performance compared with their best individual approach. It is worth noting that although the majority voting approach did not consistently outperform individual approaches in terms of the highest number of correct responses, it did provide a means to consolidate predictions and mitigate potential errors and biases from single approaches.</p>
      </sec>
      <sec>
        <title>Comparison With Human Respondents</title>
        <p>In the DC3 cases, although the human respondents had the advantage of accessing supporting patient information such as image scans and magnetic resonance imaging, the LLMs consistently outperformed the average human consensus. As shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>, using the majority voting approach, all LLMs achieved a higher performance than the human consensus (denoted by a black dashed line), with a minimum margin of 5% across all 30 cases. Specifically, when considering query difficulty, the LLMs demonstrated even greater superiority. In the rarely misdiagnosed cases category, all LLMs surpassed the average human consensus by a substantial margin of 26%. For the moderately misdiagnosed cases category, GPT-4 and ChatGPT-3.5 maintained their advantage over human respondents, achieving a minimum margin of 11% with the majority voting approach. In contrast, only Bard outperformed the human average consensus on the frequently misdiagnosed cases category, with a margin of 14%.</p>
        <p>We conducted a Spearman rank correlation test to analyze the pattern in the responses between each LLM and the human respondents. This involved correlating the average percentage of correct responses for each LLM across the prompt strategies with that of correct human responses. The results of the Spearman correlation test revealed that Bard had a relatively weak correlation coefficient of 0.30, whereas GPT-4 and ChatGPT-3.5 exhibited moderate positive correlations of 0.51 and 0.50, respectively. This suggested that the diagnostic performance patterns of GPT-4 and ChatGPT-3.5 aligned moderately with those of the human respondents. The observed correlation in answering patterns between human respondents and LLMs may stem from the inherent data bias present in the training data sets. The LLMs learn from vast amounts of data, and if the training data are biased toward certain diagnostic or decision-making patterns commonly expressed by human physicians, the model is likely to replicate those patterns. Although the correlation suggested that the LLMs have the potential to be valuable tools in medical education, it is important to note that neither their correlation with human physicians nor their overall performance necessarily means that they are as good as human physicians in diagnosing and treating diseases.</p>
        <p>We could not directly compare the performance of human respondents on the MIMIC-III data sets because of the unavailability of data. Overall, the results indicated that the LLMs consistently outperformed the average human consensus in diagnosing medical cases, showcasing their potential as a tool to complement and enhance care quality and education for complex diagnostic cases.</p>
      </sec>
      <sec>
        <title>Comparison With MedAlpaca</title>
        <p>On the DC3 data sets, Bard, GPT-4, and ChatGPT-3.5 outperformed MedAlpaca across all cases using the majority voting approach by a minimum margin of 13%. MedAlpaca also displayed the worst performance in the open-ended prompts, irrespective of query difficulty. However, when multiple-choice options were provided, MedAlpaca outperformed the other LLMs in the frequently misdiagnosed cases category. Similar to the DC3 data set, MedAlpaca consistently demonstrated its best performance using the ranking prompt on the MIMIC-III data sets. However, its overall performance was significantly poorer than the other LLMs, with each LLM outperforming the model by at least 26% using the majority voting approach. In contrast to the general-purpose LLMs (eg, Bard, GPT-4, and ChatGPT-3.5), the MedAlpaca model was finetuned using diverse medical tasks and assessed using multiple-choice medical examinations. This tailored training approach likely contributed to its notable performance, particularly excelling in DC3 cases (frequently misdiagnosed instances) and demonstrating optimal results in multiple-choice queries.</p>
      </sec>
      <sec>
        <title>Qualitative Analysis</title>
        <p>In our experiments, we manually observed the responses of each LLM to all our prompts and noted that each LLM consistently justified its diagnosis choice except for MedAlpaca. Specifically, each LLM offered a logical explanation for its chosen response regardless of the prompting strategy. For further investigation, we analyzed each LLM’s responses in 3 scenarios: (1) when presented with multiple-choice options containing the true diagnosis and they responded accurately, (2) when their response was incorrect, and (3) when given only incorrect multiple-choice options to pick from. In the first scenario, as presented in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, all LLMs (eg, Bard, GPT-4, and ChatGPT-3.5) mentioned that their rationale for diagnosing <italic>miliary tuberculosis</italic> was owing to relevant symptoms presented in the case, such as a <italic>history of respiratory illness and the presence of mesenteric lymph nodes and numerous tiny nodules throughout both lungs distributed in a miliary pattern.</italic> This pattern of offering insightful reasons for the likelihood of a diagnosis and explaining why other diagnostic options are less probable is valuable for educational purposes. In the second scenario, we observed that there was a notable disparity in the accuracy of human respondents. Only 6% (217/3624) of the human participants provided the correct response, with most votes (1232/3624, 34%) favoring <italic>ulcerative colitis</italic>, whereas 23% (833/3624) of the human responses opted for <italic>salmonellosis</italic>. Notably, Bard and GPT-4 displayed similar behavior by selecting salmonellosis, whereas ChatGPT-3.5 and MedAlpaca chose <italic>ulcerative colitis</italic>.</p>
        <p>Another notable finding occurred in the responses of GPT-4 and ChatGPT-3.5. Regardless of the correctness of their chosen diagnoses, these models consistently recommended further tests to confirm their responses. This behavior suggested a general tendency toward advocating additional examinations to validate their diagnoses, potentially reflecting a cautious approach. In contrast, Bard adopted a different approach. Instead of recommending further tests, Bard highlighted that the provided query information supported the diagnosis without suggesting additional confirmatory measures. In the scenario where only incorrect options were given, Bard, ChatGPT-3.5, and MedAlpaca made choices and justified their responses. In contrast, GPT-4 explicitly mentioned that none of the provided options matched the case presentation. Furthermore, GPT-4 suggested a more probable diagnosis and recommended additional testing to explore its feasibility.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Previous studies [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>] have presented the impressive success of LLMs in standardized medical examinations. We conducted experiments to assess the potential of LLMs as a CME system for rare and complex diagnoses, and our findings demonstrated that LLMs have the potential to be a valuable tool for rare disease education and differential diagnosis. Although LLMs demonstrated superior performance compared with the average human consensus in diagnosing complex diseases, it is essential to note that this does not imply their superiority over physicians. Numerous unknown factors, including the level of respondents’ expertise, may influence the outcome of web-based polls. Furthermore, we examined the knowledge capacity of LLMs through open-ended and multiple-choice prompts and found that LLMs, including MedAlpaca, performed better with multiple-choice prompts. This improvement can be attributed to the options provided, which narrowed the search space for potential diagnoses from thousands to a few likely possibilities. Consequently, we surmise that LLMs are not yet ready to be used as stand-alone tools, which aligns with the findings of previous studies [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. Our observations revealed the consistent outperformance of general-purpose LLMs over MedAlpaca in various experiments. Their superior ability to provide valuable justifications for making diagnoses was particularly noteworthy, a strength not matched by MedAlpaca. This difference may stem from MedAlpaca’s exclusive finetuning and assessment for multiple-choice medical examinations, which slightly differ in format from the clinical cases in our experiments.</p>
        <p>A notable finding in the response of LLMs to queries was their consistent provision of coherent and reasoned explanations, regardless of the query format. For instance, when diagnosing <italic>miliary tuberculosis</italic>, all 3 LLMs emphasized that the patient’s systemic symptoms, exposure risks, chest radiograph, computed tomography scan findings, and the suspected compromised immune state collectively support the diagnosis of <italic>miliary tuberculosis.</italic> Furthermore, Bard and GPT-4 ruled out other diagnoses presented in the multiple-choice prompt by highlighting their less typical presentations and lack of certain associated symptoms or risk factors. In addition, the conversational nature of LLMs allows users to ask follow-up questions for further context. These attributes hold great potential for educating users and offering them insights. However, we observed that LLMs provided logical explanations, even when their diagnoses were incorrect. ChatGPT-3.5 and GPT-4 may suggest additional testing to validate their selected diagnosis or use cautious terms like “potential diagnosis.” However, it remains unclear whether these recommendations stem from the models’ internal confidence or whether there are features intentionally designed by the developers for cautious use. The absence of explicit information regarding the level of uncertainty of LLMs for a specific case is concerning as it could potentially mislead clinicians. The ability to quantify uncertainty is crucial in medical decision-making, in which accurate diagnoses and treatment recommendations are paramount. Clinicians heavily rely on confidence levels and probability assessments to make informed judgments [<xref ref-type="bibr" rid="ref29">29</xref>]. Without an indication of uncertainty, there is a risk that clinicians may trust the logical explanations provided by the LLMs even when they are incorrect, leading to misdiagnoses or inappropriate treatment plans.</p>
        <p>Considering the delicate role of clinical decision support, it is essential to address validity and reliability as crucial aspects of uncertainty. Moreover, a reliable system is of paramount importance for medical education. However, the stochastic nature of LLMs introduces doubts among clinicians regarding their reliability. Although a specific metric to quantitatively assess the reliability of the LLMs used in this study is currently lacking, we acknowledge the significance of consistency in achieving reliability. To address this, we used different prompting strategies and implemented a majority voting approach to select the most consistent response from each LLM. After examining the individual prompt strategies, we anticipated consistent responses across strategies for a specific case. However, our findings revealed that the responses of LLMs were sensitive to concrete prompt formats, particularly in complex diagnoses. For instance, ChatGPT-3.5 and GPT-4 performed better with the open-ended prompt (approach 1) in the frequently misdiagnosed cases category of DC3 cases but struggled with similar cases using multiple-choice and ranking prompts (approaches 2 and 3). In contrast, Bard performed better with multiple-choice prompts. These results highlighted that there is no one-size-fits-all prompting approach nor does a single strategy apply universally to all LLMs. Although the majority voting strategy did not yield optimal results for all models across data sets, it served as a means to consolidate responses from multiple prompts and provided a starting point for incorporating reliability.</p>
        <p>Several studies [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>] have emphasized the significance of enhancing the education of clinicians at all levels to provide better support for rare and complex diagnoses. In this pursuit, the studies by Lee et al [<xref ref-type="bibr" rid="ref8">8</xref>] and Decherchi et al [<xref ref-type="bibr" rid="ref31">31</xref>] have highlighted the potential advantages of artificial intelligence (AI) systems, whereas the studies by Abdullahi et al [<xref ref-type="bibr" rid="ref25">25</xref>] and Sutton et al [<xref ref-type="bibr" rid="ref32">32</xref>] have reported a lack of acceptance of AI tools among clinicians. For instance, younger medical students and residents appeared more receptive to integrating technology [<xref ref-type="bibr" rid="ref33">33</xref>]. One notable reason for this lack of acceptance is that conventional AI systems typically require training before clinicians can effectively use them, which can be burdensome and time consuming [<xref ref-type="bibr" rid="ref32">32</xref>]. In contrast, conversational LLMs, such as ChatGPT-3.5, Bard, and GPT-4, offer a distinct advantage with their simple interface and dialogue-based nature. These conversational LLMs eliminate the need for extensive training, increasing their potential for high acceptance across all levels of medical practice. Although the exciting ease of use, conversational nature, impressive display of knowledge, and logical explanations of LLMs have the potential for user education and insights, their current limitations in reliability and expressing uncertainty must be addressed to ensure their effective and responsible use in critical domains, such as health care.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>First, the knowledge of ChatGPT-3.5 and GPT-4 is limited to health care (or medical) data up to 2021; this lack of access to the latest trends and updates poses the risk of potentially incomplete information and hampers the effectiveness of the models as a CME tool, especially when addressing emerging diseases. In contrast, although continuous updates to Bard are advantageous for keeping the model up-to-date, this attribute may impact the reproducibility of our study. Second, it is notable that our experiments had a limited scope owing to a small sample size consisting of only 30 diseases from the DC3 data set and 15 cases from the MIMIC-III data set. In addition, although we took precautions to preprocess the MIMIC-III notes to prevent leakage of the final diagnosis, the discharge summaries may still contain nuanced information that could make the diagnosis obvious. Furthermore, the closed nature of the LLMs used in this study restricted our technique for measuring reliability to a majority voting approach, which consolidated responses from diverse prompts. Although majority voting can help to mitigate the variability of LLM output, it is notable that LLMs may still generate different responses for the same prompt. This variability should be considered when interpreting the results of this study. However, when these LLMs are released with an enhanced iteration that allows for finetuning and calibration, future work should incorporate more effective mechanisms to estimate and communicate uncertainty. An example of such an approach could involve assigning a confidence score to the probability score of their responses. This methodology could allow clinicians to make informed decisions regarding whether to accept or reject responses that fall within a desired threshold.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In this study, we conducted experiments to assess the potential of LLMs, including ChatGPT-3.5, GPT-4, and Bard, as a CME system for rare and complex diagnoses. First, we evaluated their diagnostic capability specifically for rare and complex cases. Subsequently, we explored the impact of prompt formatting on their performance. Our results revealed that these LLMs possessed potential diagnostic capacities for rare and complex medical cases, surpassing the average crowd consensus on the DC3 cases. For selected rare cases from the MIMIC-III data set, Bard and GPT-4 achieved a diagnostic accuracy of 93%, whereas ChatGPT-3.5 achieved an accuracy of 73%. Our findings highlighted that users might discover an approach that yields favorable results for various queries by exploring different prompt formats. In contrast, using majority voting of responses from multiple prompt strategies offers the benefit of a robust and reliable model, instilling confidence in the generated responses. However, determining the best prompt strategy versus relying on the majority voting approach involves a tradeoff between exploration and exploitation. Although prompt engineering research is continuing, we hope that future studies will yield better solutions to enhance the reliability and consistency of the responses of LLMs. Overall, our study’s results and conclusions provide a benchmark for the performance of LLMs and shed light on their strengths and limitations in generating responses, expressing uncertainty, and providing diagnostic recommendations. The insights gained from this study can serve as a foundation for further exploration and research on using LLMs as medical education tools to enhance their performance and capabilities as conversational language models.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Comprehensive tables detailing the performance of each model across data sets, with included examples of prompts and responses for each model.</p>
        <media xlink:href="mededu_v10i1e51391_app1.docx" xlink:title="DOCX File , 46 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">CME</term>
          <def>
            <p>continuing medical education</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">DC3</term>
          <def>
            <p>diagnostic case challenge collection</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">MIMIC-III</term>
          <def>
            <p>Medical Information Mart for Intensive Care-III</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">USMLE</term>
          <def>
            <p>United States Medical Licensing Examination</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>We acknowledge support from the Open Access Publication Fund of the University of Tübingen.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The URLs for the diagnostic case challenge collection data set can be obtained via A Diagnostic Case Challenge Collection [<xref ref-type="bibr" rid="ref34">34</xref>]. The Medical Information Mart for Intensive Care data sets can be accessed via the database, Medical Information Mart for Intensive Care-III Clinical Database v1.4 [<xref ref-type="bibr" rid="ref35">35</xref>], after obtaining permission from Physionet.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="web">
          <article-title>Introducing ChatGPT</article-title>
          <source>OpenAI</source>
          <access-date>2023-03-23</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openai.com/blog/chatgpt/">https://openai.com/blog/chatgpt/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Manyika</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hsiao</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>An overview of Bard: an early experiment with generative AI</article-title>
          <source>Google</source>
          <access-date>2024-01-26</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ai.google/static/documents/google-about-bard.pdf">https://ai.google/static/documents/google-about-bard.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Achiam</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Adler</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ahmad</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Akkaya</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Aleman</surname>
              <given-names>FL</given-names>
            </name>
            <name name-style="western">
              <surname>Almeida</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Altenschmidt</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Altman</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Anadkat</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Avila</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Babuschkin</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Balaji</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Balcom</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Baltescu</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bao</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Bavarian</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Belgum</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bello</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Berdine</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bernadett-Shapiro</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Berner</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Bogdonoff</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Boiko</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Boyd</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Brakman</surname>
              <given-names>AL</given-names>
            </name>
            <name name-style="western">
              <surname>Brockman</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Brooks</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Brundage</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Button</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Campbell</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Cann</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Carey</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Carlson</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Carmichael</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Chan</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chantzis</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chess</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chung</surname>
              <given-names>HW</given-names>
            </name>
            <name name-style="western">
              <surname>Cummings</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Currier</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dai</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Decareaux</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Degry</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Deutsch</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Deville</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Dhar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dohan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Dowling</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dunning</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ecoffet</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Eleti</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Eloundou</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Farhi</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Fedus</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Felix</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Fishman</surname>
              <given-names>SP</given-names>
            </name>
            <name name-style="western">
              <surname>Forte</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Fulford</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Georges</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Gibson</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Goel</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Gogineni</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Goh</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Gontijo-Lopes</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Gordon</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Grafstein</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gray</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Greene</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Gross</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Hallacy</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Harris</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Heaton</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Heidecke</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hesse</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hickey</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hickey</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Hoeschele</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Houghton</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Hsu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Huizinga</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jain</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jain</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>GPT-4 technical report</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online March 15, 2023. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2303.08774"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Resnick</surname>
              <given-names>DK</given-names>
            </name>
          </person-group>
          <article-title>Commentary: performance of ChatGPT, GPT-4, and Google Bard on a neurosurgery oral boards preparation question bank</article-title>
          <source>Neurosurgery</source>
          <year>2023</year>
          <month>07</month>
          <day>19</day>
          <comment>(forthcoming)</comment>
          <pub-id pub-id-type="doi">10.1227/neu.0000000000002618</pub-id>
          <pub-id pub-id-type="medline">37466324</pub-id>
          <pub-id pub-id-type="pii">00006123-990000000-00814</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kung</surname>
              <given-names>TH</given-names>
            </name>
            <name name-style="western">
              <surname>Cheatham</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Medenilla</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sillos</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>De Leon</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Elepaño</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Madriaga</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aggabao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Diaz-Candido</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Maningo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tseng</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title>
          <source>PLOS Digit Health</source>
          <year>2023</year>
          <month>02</month>
          <day>09</day>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>e0000198</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36812645"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id>
          <pub-id pub-id-type="medline">36812645</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-22-00371</pub-id>
          <pub-id pub-id-type="pmcid">PMC9931230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Howard</surname>
              <given-names>FM</given-names>
            </name>
            <name name-style="western">
              <surname>Markov</surname>
              <given-names>NS</given-names>
            </name>
            <name name-style="western">
              <surname>Dyer</surname>
              <given-names>EC</given-names>
            </name>
            <name name-style="western">
              <surname>Ramesh</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Pearson</surname>
              <given-names>AT</given-names>
            </name>
          </person-group>
          <article-title>Comparing scientific abstracts generated by ChatGPT to real abstracts with detectors and blinded human reviewers</article-title>
          <source>NPJ Digit Med</source>
          <year>2023</year>
          <month>04</month>
          <day>26</day>
          <volume>6</volume>
          <issue>1</issue>
          <fpage>75</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-023-00819-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-023-00819-6</pub-id>
          <pub-id pub-id-type="medline">37100871</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-023-00819-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC10133283</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dave</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Athaluri</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT in medicine: an overview of its applications, advantages, limitations, future prospects, and ethical considerations</article-title>
          <source>Front Artif Intell</source>
          <year>2023</year>
          <month>05</month>
          <day>04</day>
          <volume>6</volume>
          <fpage>1169595</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37215063"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/frai.2023.1169595</pub-id>
          <pub-id pub-id-type="medline">37215063</pub-id>
          <pub-id pub-id-type="pmcid">PMC10192861</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bubeck</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Petro</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Benefits, limits, and risks of GPT-4 as an AI chatbot for medicine</article-title>
          <source>N Engl J Med</source>
          <year>2023</year>
          <month>03</month>
          <day>30</day>
          <volume>388</volume>
          <issue>13</issue>
          <fpage>1233</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.1056/nejmsr2214184</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>SB</given-names>
            </name>
            <name name-style="western">
              <surname>Lam</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT: the future of discharge summaries?</article-title>
          <source>Lancet Digit Health</source>
          <year>2023</year>
          <month>03</month>
          <volume>5</volume>
          <issue>3</issue>
          <fpage>e107</fpage>
          <lpage>8</lpage>
          <pub-id pub-id-type="doi">10.1016/s2589-7500(23)00021-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mitani</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>Haneuse</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Small data challenges of studying rare diseases</article-title>
          <source>JAMA Netw Open</source>
          <year>2020</year>
          <month>03</month>
          <day>02</day>
          <volume>3</volume>
          <issue>3</issue>
          <fpage>e201965</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://jamanetwork.com/journals/jamanetworkopen/fullarticle/10.1001/jamanetworkopen.2020.1965"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2020.1965</pub-id>
          <pub-id pub-id-type="medline">32202640</pub-id>
          <pub-id pub-id-type="pii">2763223</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eickhoff</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Gmehlin</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>AV</given-names>
            </name>
            <name name-style="western">
              <surname>Boullier</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Fraser</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>DC3 -- a diagnostic case challenge collection for clinical decision support</article-title>
          <source>Proceedings of the 2019 ACM SIGIR International Conference on Theory of Information Retrieval</source>
          <year>2019</year>
          <conf-name>ICTIR '19</conf-name>
          <conf-date>October 2-5, 2019</conf-date>
          <conf-loc>Santa Clara, CA</conf-loc>
          <pub-id pub-id-type="doi">10.1145/3341981.3344239</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Walkowiak</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Domaradzki</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Are rare diseases overlooked by medical education? Awareness of rare diseases among physicians in Poland: an explanatory study</article-title>
          <source>Orphanet J Rare Dis</source>
          <year>2021</year>
          <month>09</month>
          <day>28</day>
          <volume>16</volume>
          <issue>1</issue>
          <fpage>400</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ojrd.biomedcentral.com/articles/10.1186/s13023-021-02023-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13023-021-02023-9</pub-id>
          <pub-id pub-id-type="medline">34583737</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13023-021-02023-9</pub-id>
          <pub-id pub-id-type="pmcid">PMC8479904</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sartorius</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Comorbidity of mental and physical diseases: a main challenge for medicine of the 21st century</article-title>
          <source>Shanghai Arch Psychiatry</source>
          <year>2013</year>
          <month>04</month>
          <volume>25</volume>
          <issue>2</issue>
          <fpage>68</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/24991137"/>
          </comment>
          <pub-id pub-id-type="doi">10.3969/j.issn.1002-0829.2013.02.002</pub-id>
          <pub-id pub-id-type="medline">24991137</pub-id>
          <pub-id pub-id-type="pii">sap-25-02-068</pub-id>
          <pub-id pub-id-type="pmcid">PMC4054544</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bateman</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Bested</surname>
              <given-names>AC</given-names>
            </name>
            <name name-style="western">
              <surname>Bonilla</surname>
              <given-names>HF</given-names>
            </name>
            <name name-style="western">
              <surname>Chheda</surname>
              <given-names>BV</given-names>
            </name>
            <name name-style="western">
              <surname>Chu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Curtin</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Dempsey</surname>
              <given-names>TT</given-names>
            </name>
            <name name-style="western">
              <surname>Dimmock</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Dowell</surname>
              <given-names>TG</given-names>
            </name>
            <name name-style="western">
              <surname>Felsenstein</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kaufman</surname>
              <given-names>DL</given-names>
            </name>
            <name name-style="western">
              <surname>Klimas</surname>
              <given-names>NG</given-names>
            </name>
            <name name-style="western">
              <surname>Komaroff</surname>
              <given-names>AL</given-names>
            </name>
            <name name-style="western">
              <surname>Lapp</surname>
              <given-names>CW</given-names>
            </name>
            <name name-style="western">
              <surname>Levine</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Montoya</surname>
              <given-names>JG</given-names>
            </name>
            <name name-style="western">
              <surname>Natelson</surname>
              <given-names>BH</given-names>
            </name>
            <name name-style="western">
              <surname>Peterson</surname>
              <given-names>DL</given-names>
            </name>
            <name name-style="western">
              <surname>Podell</surname>
              <given-names>RN</given-names>
            </name>
            <name name-style="western">
              <surname>Rey</surname>
              <given-names>IR</given-names>
            </name>
            <name name-style="western">
              <surname>Ruhoy</surname>
              <given-names>IS</given-names>
            </name>
            <name name-style="western">
              <surname>Vera-Nunez</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Yellman</surname>
              <given-names>BP</given-names>
            </name>
          </person-group>
          <article-title>Myalgic encephalomyelitis/chronic fatigue syndrome: essentials of diagnosis and management</article-title>
          <source>Mayo Clin Proc</source>
          <year>2021</year>
          <month>11</month>
          <volume>96</volume>
          <issue>11</issue>
          <fpage>2861</fpage>
          <lpage>78</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S0025-6196(21)00513-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.mayocp.2021.07.004</pub-id>
          <pub-id pub-id-type="medline">34454716</pub-id>
          <pub-id pub-id-type="pii">S0025-6196(21)00513-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Faviez</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Garcelon</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Neuraz</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Knebelmann</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Salomon</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Lyonnet</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Saunier</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Burgun</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Diagnosis support systems for rare diseases: a scoping review</article-title>
          <source>Orphanet J Rare Dis</source>
          <year>2020</year>
          <month>04</month>
          <day>16</day>
          <volume>15</volume>
          <issue>1</issue>
          <fpage>94</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ojrd.biomedcentral.com/articles/10.1186/s13023-020-01374-z"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13023-020-01374-z</pub-id>
          <pub-id pub-id-type="medline">32299466</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13023-020-01374-z</pub-id>
          <pub-id pub-id-type="pmcid">PMC7164220</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mbakwe</surname>
              <given-names>AB</given-names>
            </name>
            <name name-style="western">
              <surname>Lourentzou</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Mechanic</surname>
              <given-names>OJ</given-names>
            </name>
            <name name-style="western">
              <surname>Dagan</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT passing USMLE shines a spotlight on the flaws of medical education</article-title>
          <source>PLOS Digit Health</source>
          <year>2023</year>
          <month>02</month>
          <day>9</day>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>e0000205</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36812618"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000205</pub-id>
          <pub-id pub-id-type="medline">36812618</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-23-00027</pub-id>
          <pub-id pub-id-type="pmcid">PMC9931307</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wright</surname>
              <given-names>AP</given-names>
            </name>
            <name name-style="western">
              <surname>Patterson</surname>
              <given-names>BL</given-names>
            </name>
            <name name-style="western">
              <surname>Wanderer</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Turer</surname>
              <given-names>RW</given-names>
            </name>
            <name name-style="western">
              <surname>Nelson</surname>
              <given-names>SD</given-names>
            </name>
            <name name-style="western">
              <surname>McCoy</surname>
              <given-names>AB</given-names>
            </name>
            <name name-style="western">
              <surname>Sittig</surname>
              <given-names>DF</given-names>
            </name>
            <name name-style="western">
              <surname>Wright</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Using AI-generated suggestions from ChatGPT to optimize clinical decision support</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2023</year>
          <month>06</month>
          <day>20</day>
          <volume>30</volume>
          <issue>7</issue>
          <fpage>1237</fpage>
          <lpage>45</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37087108"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocad072</pub-id>
          <pub-id pub-id-type="medline">37087108</pub-id>
          <pub-id pub-id-type="pii">7136722</pub-id>
          <pub-id pub-id-type="pmcid">PMC10280357</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cascella</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Montomoli</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bellini</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Bignami</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Evaluating the feasibility of ChatGPT in healthcare: an analysis of multiple clinical and research scenarios</article-title>
          <source>J Med Syst</source>
          <year>2023</year>
          <month>03</month>
          <day>04</day>
          <volume>47</volume>
          <issue>1</issue>
          <fpage>33</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36869927"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s10916-023-01925-4</pub-id>
          <pub-id pub-id-type="medline">36869927</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10916-023-01925-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC9985086</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Eisner</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Learning how to ask: querying LMs with mixtures of soft prompts</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online April 14, 2021. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2104.06599"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2021.naacl-main.410</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Si</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Gan</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Boyd-Graber</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Prompting GPT-3 to be reliable</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online October 17, 2022. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2210.09150"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Muresanu</surname>
              <given-names>AI</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Paster</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Pitis</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chan</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ba</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Large language models are human-level prompt engineers</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online November 3, 2022. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2211.01910"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Han</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Adams</surname>
              <given-names>LC</given-names>
            </name>
            <name name-style="western">
              <surname>Papaioannou</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Grundmann</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Oberhauser</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Löser</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Truhn</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bressem</surname>
              <given-names>KK</given-names>
            </name>
          </person-group>
          <article-title>MedAlpaca -- an open-source collection of medical conversational AI models and training data</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online April 14, 2023. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2304.08247"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Pollard</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>LW</given-names>
            </name>
            <name name-style="western">
              <surname>Lehman</surname>
              <given-names>LH</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ghassemi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Moody</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Szolovits</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>RG</given-names>
            </name>
          </person-group>
          <article-title>MIMIC-III, a freely accessible critical care database</article-title>
          <source>Sci Data</source>
          <year>2016</year>
          <month>05</month>
          <day>24</day>
          <volume>3</volume>
          <issue>1</issue>
          <fpage>160035</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/sdata.2016.35"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/sdata.2016.35</pub-id>
          <pub-id pub-id-type="medline">27219127</pub-id>
          <pub-id pub-id-type="pii">sdata201635</pub-id>
          <pub-id pub-id-type="pmcid">PMC4878278</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>van Aken</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Papaioannou</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Mayrdorfer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Budde</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gers</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Loeser</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Clinical outcome prediction from admission notes using self-supervised knowledge integration</article-title>
          <source>Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume</source>
          <year>2021</year>
          <conf-name>16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume</conf-name>
          <conf-date>April 21-23, 2021</conf-date>
          <conf-loc>Online</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/2021.eacl-main.75</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abdullahi</surname>
              <given-names>TA</given-names>
            </name>
            <name name-style="western">
              <surname>Mercurio</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Eickhoff</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Retrieval-based diagnostic decision support</article-title>
          <source>JMIR Preprints</source>
          <comment>Preprint posted online June 25, 2023. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://preprints.jmir.org/preprint/50209"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/preprints.50209</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="web">
          <article-title>Orphanet: about rare diseases</article-title>
          <source>Orphanet</source>
          <access-date>2023-07-03</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.orpha.net/consor/cgi-bin/Education_AboutRareDiseases.php?lng=EN">https://www.orpha.net/consor/cgi-bin/Education_AboutRareDiseases.php?lng=EN</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Schuurmans</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bosma</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ichter</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Xia</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Chi</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Le</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Chain-of-thought prompting elicits reasoning in large language models</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online January 28, 2022. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2201.11903"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Schuurmans</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Le</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Chi</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Narang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chowdhery</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Self-consistency improves chain of thought reasoning in language models</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online March 21, 2022. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2203.11171"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Lou</surname>
              <given-names>JG</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Making language models better reasoners with step-aware verifier</article-title>
          <source>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</source>
          <year>2023</year>
          <conf-name>61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</conf-name>
          <conf-date>July 9-14, 2023</conf-date>
          <conf-loc>Toronto, ON</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/2023.acl-long.291</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="web">
          <article-title>NIH investigator manual for human subjects research</article-title>
          <source>Office of Intramural Research. Office of Human Subjects Research Protections</source>
          <access-date>2024-01-31</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ohsrp.nih.gov/confluence/display/ohsrp/Chapter+1+-+Types+of+Research_+Human+Subjects+Research+Vs.+Not+Human+Subjects+Research">https://ohsrp.nih.gov/confluence/display/ohsrp/Chapter+1+-+Types+of+Research_+Human+Subjects+Research+Vs.+Not+Human+Subjects+Research</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Decherchi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Pedrini</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Mordenti</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Cavalli</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sangiorgi</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Opportunities and challenges for machine learning in rare diseases</article-title>
          <source>Front Med (Lausanne)</source>
          <year>2021</year>
          <month>10</month>
          <day>5</day>
          <volume>8</volume>
          <fpage>747612</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/34676229"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fmed.2021.747612</pub-id>
          <pub-id pub-id-type="medline">34676229</pub-id>
          <pub-id pub-id-type="pmcid">PMC8523988</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sutton</surname>
              <given-names>RT</given-names>
            </name>
            <name name-style="western">
              <surname>Pincock</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Baumgart</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Sadowski</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Fedorak</surname>
              <given-names>RN</given-names>
            </name>
            <name name-style="western">
              <surname>Kroeker</surname>
              <given-names>KI</given-names>
            </name>
          </person-group>
          <article-title>An overview of clinical decision support systems: benefits, risks, and strategies for success</article-title>
          <source>NPJ Digit Med</source>
          <year>2020</year>
          <month>02</month>
          <day>06</day>
          <volume>3</volume>
          <issue>1</issue>
          <fpage>17</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-020-0221-y"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-020-0221-y</pub-id>
          <pub-id pub-id-type="medline">32047862</pub-id>
          <pub-id pub-id-type="pii">221</pub-id>
          <pub-id pub-id-type="pmcid">PMC7005290</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eckleberry-Hunt</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lick</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Hunt</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Is medical education ready for generation Z?</article-title>
          <source>J Grad Med Educ</source>
          <year>2018</year>
          <month>08</month>
          <volume>10</volume>
          <issue>4</issue>
          <fpage>378</fpage>
          <lpage>81</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/30154963"/>
          </comment>
          <pub-id pub-id-type="doi">10.4300/JGME-D-18-00466.1</pub-id>
          <pub-id pub-id-type="medline">30154963</pub-id>
          <pub-id pub-id-type="pii">JGME-D-18-00466</pub-id>
          <pub-id pub-id-type="pmcid">PMC6108364</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="web">
          <article-title>codiag-public / dc3</article-title>
          <source>GitHub</source>
          <access-date>2024-01-31</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/codiag-public/dc3/blob/master/cases.url">https://github.com/codiag-public/dc3/blob/master/cases.url</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Pollard</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>MIMIC-III clinical database (version 1.4)</article-title>
          <source>PhysioNet</source>
          <year>2016</year>
          <access-date>2024-01-31</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://physionet.org/content/mimiciii/1.4/">https://physionet.org/content/mimiciii/1.4/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
