<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JME</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id>
      <journal-title>JMIR Medical Education</journal-title>
      <issn pub-type="epub">2369-3762</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v9i1e48978</article-id>
      <article-id pub-id-type="pmid">37548997</article-id>
      <article-id pub-id-type="doi">10.2196/48978</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Performance of ChatGPT on the Situational Judgement Test—A Professional Dilemmas–Based Examination for Doctors in the United Kingdom</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Venkatesh</surname>
            <given-names>Kaushik</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Kamel Boulos</surname>
            <given-names>Maged N.</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Cheng</surname>
            <given-names>Yih-Dih</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Gupta</surname>
            <given-names>Rohan</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Harada</surname>
            <given-names>Yukinori</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Borchert</surname>
            <given-names>Robin J</given-names>
          </name>
          <degrees>BSc, MBChB, MPhil</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Radiology</institution>
            <institution>University of Cambridge</institution>
            <addr-line>Hills Road</addr-line>
            <addr-line>Cambridge, CB2 0QQ</addr-line>
            <country>United Kingdom</country>
            <phone>44 1223 805000</phone>
            <email>rb729@medschl.cam.ac.uk</email>
          </address>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4673-9746</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Hickman</surname>
            <given-names>Charlotte R</given-names>
          </name>
          <degrees>BMedSci, MBChB</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3228-1479</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Pepys</surname>
            <given-names>Jack</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1441-0145</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Sadler</surname>
            <given-names>Timothy J</given-names>
          </name>
          <degrees>MBBCHIR, MA, MSc</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3710-3137</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Radiology</institution>
        <institution>University of Cambridge</institution>
        <addr-line>Cambridge</addr-line>
        <country>United Kingdom</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Radiology</institution>
        <institution>Addenbrooke's Hospital</institution>
        <institution>Cambridge University Hospitals NHS Foundation Trust</institution>
        <addr-line>Cambridge</addr-line>
        <country>United Kingdom</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of General Medicine</institution>
        <institution>Lister Hospital</institution>
        <institution>East and North Hertfordshire NHS Trust</institution>
        <addr-line>Stevenage</addr-line>
        <country>United Kingdom</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Biomedical Sciences</institution>
        <institution>Humanitas University</institution>
        <addr-line>Milan</addr-line>
        <country>Italy</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Robin J Borchert <email>rb729@medschl.cam.ac.uk</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2023</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>7</day>
        <month>8</month>
        <year>2023</year>
      </pub-date>
      <volume>9</volume>
      <elocation-id>e48978</elocation-id>
      <history>
        <date date-type="received">
          <day>16</day>
          <month>5</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>14</day>
          <month>6</month>
          <year>2023</year>
        </date>
        <date date-type="rev-recd">
          <day>30</day>
          <month>6</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>25</day>
          <month>7</month>
          <year>2023</year>
        </date>
      </history>
      <copyright-statement>©Robin J Borchert, Charlotte R Hickman, Jack Pepys, Timothy J Sadler. Originally published in JMIR Medical Education (https://mededu.jmir.org), 07.08.2023.</copyright-statement>
      <copyright-year>2023</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on https://mededu.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://mededu.jmir.org/2023/1/e48978" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>ChatGPT is a large language model that has performed well on professional examinations in the fields of medicine, law, and business. However, it is unclear how ChatGPT would perform on an examination assessing professionalism and situational judgement for doctors.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>We evaluated the performance of ChatGPT on the Situational Judgement Test (SJT): a national examination taken by all final-year medical students in the United Kingdom. This examination is designed to assess attributes such as communication, teamwork, patient safety, prioritization skills, professionalism, and ethics.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>All questions from the UK Foundation Programme Office’s (UKFPO’s) 2023 SJT practice examination were inputted into ChatGPT. For each question, ChatGPT’s answers and rationales were recorded and assessed on the basis of the official UK Foundation Programme Office scoring template. Questions were categorized into domains of Good Medical Practice on the basis of the domains referenced in the rationales provided in the scoring sheet. Questions without clear domain links were screened by reviewers and assigned one or multiple domains. ChatGPT's overall performance, as well as its performance across the domains of Good Medical Practice, was evaluated.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Overall, ChatGPT performed well, scoring 76% on the SJT but scoring full marks on only a few questions (9%), which may reflect possible flaws in ChatGPT’s situational judgement or inconsistencies in the reasoning across questions (or both) in the examination itself. ChatGPT demonstrated consistent performance across the 4 outlined domains in Good Medical Practice for doctors.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Further research is needed to understand the potential applications of large language models, such as ChatGPT, in medical education for standardizing questions and providing consistent rationales for examinations assessing professionalism and ethics.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>ChatGPT</kwd>
        <kwd>language models</kwd>
        <kwd>Situational Judgement Test</kwd>
        <kwd>medical education</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>language model</kwd>
        <kwd>exam</kwd>
        <kwd>examination</kwd>
        <kwd>SJT</kwd>
        <kwd>judgement</kwd>
        <kwd>reasoning</kwd>
        <kwd>communication</kwd>
        <kwd>chatbot</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>ChatGPT is a large language model developed by OpenAI, which uses deep learning to provide responses to natural language input, by identifying the relationships between words and by generating coherent responses [<xref ref-type="bibr" rid="ref1">1</xref>]. It achieves this in a conversational context following text input and produces an immediate response in an accessible format to users.</p>
      <p>These recent advances in language models demonstrate the potentially significant impact of artificial intelligence (AI) technologies on digital health. ChatGPT has already demonstrated its ability to pass professional examinations for postgraduates in the fields of law [<xref ref-type="bibr" rid="ref2">2</xref>] and business [<xref ref-type="bibr" rid="ref3">3</xref>]. ChatGPT showed similar promise in the field of medicine [<xref ref-type="bibr" rid="ref4">4</xref>], and its performance has been assessed on UK-based examinations for medical school admissions [<xref ref-type="bibr" rid="ref5">5</xref>], as well as those for general practitioners (GPs) [<xref ref-type="bibr" rid="ref6">6</xref>] and neurologists in training [<xref ref-type="bibr" rid="ref7">7</xref>].</p>
      <p>With regard to the United States Medical Licensing Examination (USMLE), ChatGPT scored at, or near, the pass mark for each step of the examination [<xref ref-type="bibr" rid="ref4">4</xref>]. Although ChatGPT’s performance has been impressive, the USMLE focuses predominantly on basic science, pharmacology, and pathophysiology (step 1) as well as clinical reasoning and medical management (step 2CK), with less emphasis on other professional skills for becoming a successful doctor [<xref ref-type="bibr" rid="ref8">8</xref>]. Mbakwe et al [<xref ref-type="bibr" rid="ref8">8</xref>] argue that ChatGPT’s impressive performance on the USMLE emphasizes the need to develop more relevant approaches to evaluating these crucial skills, which are necessary for doctors but are not assessed in the USMLE. These additional skills are also not assessed in UK-based examinations for which ChatGPT’s performance has already been evaluated, such as the BioMedical Admissions Test [<xref ref-type="bibr" rid="ref5">5</xref>], the UK Neurology Specialty Certificate Examination [<xref ref-type="bibr" rid="ref7">7</xref>], and the Applied Knowledge test for GPs [<xref ref-type="bibr" rid="ref6">6</xref>].</p>
      <p>The Situational Judgement Test (SJT) aims to assess many of the skills not covered in the USMLE [<xref ref-type="bibr" rid="ref4">4</xref>] and in other examinations, which have been assessed using ChatGPT, including communication, teamwork, patient safety, prioritization skills, professionalism, and ethics. At the end of their university studies, all final-year medical students in the United Kingdom applying for Foundation Programme posts (similar to internships in the United States) take the SJT. A candidate’s performance on the SJT accounts for 50% of the overall score for their application to the Foundation Programme, while the other half is calculated from their educational performance in medical school. Later on in their training, many UK doctors are also required to take the Multi-Specialty Recruitment Assessment postgraduate examination that includes a professional dilemmas section similar to those in the SJT. The SJT places emphasis on 4 domains: Knowledge, Skills and Performance; Safety and Quality; Communication, Partnership and Teamwork; and Maintaining Trust - outlined in the General Medical Council’s (GMC) Good Medical Practice [<xref ref-type="bibr" rid="ref9">9</xref>]. This document lists the essential duties of all doctors working in the United Kingdom. Although performance on the SJT plays a significant role in determining the career path of UK doctors, several reports and student commentaries have suggested that there are significant discrepancies in the correct answers chosen among different experts [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref13">13</xref>].</p>
      <p>Our aim was to evaluate the performance of ChatGPT on the SJT and determine how well it performs across the 4 key domains of Good Medical Practice. To our knowledge, this is the first study to investigate the performance of ChatGPT on a situational judgement and professionalism examination of this type.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>ChatGPT</title>
        <p>The ChatGPT model was trained on vast amounts of data from the internet, up to and including 2021, after which it has not been connected to the internet [<xref ref-type="bibr" rid="ref14">14</xref>]. Hence, ChatGPT has not been trained on data sets that only became available on the internet from 2022 onward, but it has demonstrated good performance on a range of natural language tasks such as question-answering and text summarization tasks [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref15">15</xref>].</p>
      </sec>
      <sec>
        <title>SJT Examination</title>
        <p>The SJT examination is divided into 3 sections, with each question stem first introducing a scenario, followed by a question on how the candidate would approach the situation. These sections include (1) rating the appropriateness or importance of a response, action, or consideration; for example, very appropriate, appropriate, somewhat inappropriate, or inappropriate; (2) multiple-choice questions asking for the 3 most appropriate options from among 8 options; and (3) ranking the appropriateness, or importance, of 5 different actions or considerations in response to a scenario. The SJT is scored on a scale from 0 to 50 points and is not a pass-or-fail examination.</p>
        <p>Given the discrepancies in correct answers and justifications among unofficial study resources, we used the most recent official 2023 SJT practice paper, which is publicly available from the official United Kingdom Foundation Programme Office (UKFPO) website [<xref ref-type="bibr" rid="ref16">16</xref>], together with a separate document with answers and rationales. This paper would, therefore, not have been available in the training set for ChatGPT as it was released after 2021.</p>
      </sec>
      <sec>
        <title>Encoding</title>
        <p>Each question from the 2023 SJT practice paper was formatted identically into the ChatGPT text with the following additions: (1) the official candidate examination instructions were provided before each scenario (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), and (2) ChatGPT was asked to provide its rationale at the end of each question (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). A new ChatGPT chat session was started for each question and, therefore, the instructions were written in the singular form to reflect that the model was being asked to answer each question separately to reduce the risk of memory retention bias.</p>
      </sec>
      <sec>
        <title>Assessing Performance</title>
        <p>We used the official UKFPO scoring templates to determine the number of marks scored by ChatGPT in each of the 3 sections of the examination. The scoring for each question is not binary, and partial marks are awarded for answers that are nearly correct. For example, in the multiple-choice section, each question has 3 correct answers from a choice of 8 options; each correct answer is awarded 4 points with a maximum of 12 points per question. Therefore, a candidate can score 0, 4, 8, or 12 marks for each multiple-choice question. The rating and ranking sections award partial marks for an answer that is close to the correct one. ChatGPT’s performance was calculated as a percentage for each section using the official UKFPO scoring templates. We also determined the proportion of questions for which the answers were correct (defined as scoring 100% of the available marks for the given question), mostly correct (50%-99%), and mostly incorrect (&#60;50%) for each section.</p>
        <p>The final SJT score provided to candidates is on a scale from 0 to 50, which is based on test-equating and scaling the raw marks achieved on the paper. This conversion formula varies between sittings and is not made publicly available by the UKFPO. We, therefore, reported ChatGPT’s performance as a percentage instead of reporting it on the 0-50–point scale, which is normally used to compare performance between human candidates. Both the SJT and Educational Performance Measure scores determine a final-year medical student’s ranking when applying to the Foundation Programme. The Educational Performance Measure is a measure of performance in medical school up to the point of application to the Foundation Programme with students grouped into deciles.</p>
      </sec>
      <sec>
        <title>Good Medical Practice Guidelines</title>
        <p>In order to assess ChatGPT’s performance across the different domains of Good Medical Practice, each question was categorized into at least 1 domain. To classify the questions, we used the 2023 practice paper answer sheet provided by the UKFPO, which also contains the rationale for most answers. Many of the rationales contained direct references to at least 1 domain from the Good Medical Practice guidelines that were used for categorization. Questions with rationales, which had missing links to the domains, were categorized by 2 independent reviewers on the basis of both the question itself and the rationale provided by the UKFPO. Both reviewers recently completed the Foundation Programme on which this SJT examination is based. The reviewers were blinded to each other’s categorization of each question, and disagreements were resolved by a third reviewer who was a consultant radiologist within the National Health Service and was blinded to the categorizations made by the 2 initial reviewers. Once all questions in the examination were assigned domains combining the rationales offered by the UKFPO and the screening approach used for the remaining questions, ChatGPT’s performance in each domain of Good Medical Practice was assessed using the official scoring templates and reported as a percentage.</p>
        <p>A summary of the workflow for this study including sourcing, encoding, adjudicating results, and assessing performance can be found in <xref ref-type="table" rid="table1">Table 1</xref>.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>A schematic workflow of sourcing, encoding, adjudicating results, and assessing performance for this study.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="190"/>
            <col width="810"/>
            <thead>
              <tr valign="top">
                <td>Workflow step</td>
                <td>Description</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Sourcing</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Official 2023 UKFPO<sup>a</sup> SJT<sup>b</sup> practice paper with questions, answers and rationales for each answer</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Encoding in ChatGPT</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>The following was inputted into ChatGPT for each question:</p>
                      <list>
                        <list-item>
                          <p>
                    Official candidate examination instructions
                  </p>
                        </list-item>
                        <list-item>
                          <p>
                    Question from the practice paper
                  </p>
                        </list-item>
                        <list-item>
                          <p>
                    “Provide your rational for each answer”
                  </p>
                        </list-item>
                      </list>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Adjudicating results</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Official UKFPO scoring templates used as a reference for correct answers</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Assessing performance</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Percentage of total possible marks</p>
                    </list-item>
                    <list-item>
                      <p>Proportion of questions for which the answers were correct (100%), mostly correct (50%-99%), and mostly incorrect (&#60;50%)</p>
                    </list-item>
                    <list-item>
                      <p>Percentage of total possible marks within each domain of Good Medical Practice</p>
                    </list-item>
                  </list>
                </td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>UKFPO: UK Foundation Programme Office.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>SJT: Situational Judgement Test.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>This study did not involve human or animal participants and ethics approval was not required.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Overall Performance</title>
        <p>Overall, ChatGPT scored 76% (929 of a possible 1217 marks) on this SJT examination (<xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>).</p>
        <p>For the rating section of the examination, ChatGPT scored 78% (197/253 marks) with 0% (0/18 questions) entirely correct, 100% (18/18 questions) mostly correct, and 0% (0/18 questions) mostly incorrect responses (<xref rid="figure1" ref-type="fig">Figure 1</xref>).</p>
        <p>For the multiple-choice section of the examination, ChatGPT scored 65% (172/264 marks) with 23% (5/22 questions) entirely correct, 50% (11/22 questions) mostly correct, and 27% (6/22 questions) mostly incorrect responses (<xref rid="figure1" ref-type="fig">Figure 1</xref>).</p>
        <p>For the ranking section of the examination, ChatGPT scored 80% (560/700 marks) with 6% (2/35 questions) entirely correct, 94% (33/35 questions) mostly correct, and 0% (0/35 questions) mostly incorrect responses (<xref rid="figure1" ref-type="fig">Figure 1</xref>).</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>ChatGPT’s performance in each section of the examination depicting the proportion of entirely correct (100%), mostly correct (50%-99%), or mostly incorrect answers (&#60;50%). MCQ: multiple-choice question. Q: question.</p>
          </caption>
          <graphic xlink:href="mededu_v9i1e48978_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Good Medical Practice Domains</title>
        <p>There were 170 questions and answer statements that were classified into at least one of the GMC domains. Of these, 84 (49% of the total) were explicitly linked to a GMC domain within the rationale provided in the UKFPO’s official answer sheet. The independent reviewers then screened the remaining 86 and agreed on which GMC domains they applied to for 76 (88%) of them. The remaining 10 were then assessed by the tiebreaker (consultant radiologist), and their labels were used for the analysis.</p>
        <p>ChatGPT scored 78% (328/419) in the Knowledge, Skills and Performance domain, 76% (484/635) in the Safety and Quality domain, 76% (340/448) in the Maintaining Trust domain, and 75% (784/1046) in the Communication, Partnership and Teamwork domain.</p>
      </sec>
      <sec>
        <title>Answers With the Biggest Discrepancies</title>
        <p>In the rating section, ChatGPT’s worst performance was noted on a question related to the appropriateness of specific actions after discovering that a medical student has likely acquired detailed information about the scenarios that will feature in an upcoming examination. The official answers and rationale advise that it would be (1) somewhat appropriate to inform the medical student that their Educational Supervisor will be informed about the situation and (2) very appropriate to encourage the student to inform the medical school that they have acquired information about the examination. ChatGPT labeled these options as inappropriate and somewhat inappropriate, respectively, with the rationale that (1) “Threatening to inform her Educational Supervisor about her behaviour is not a productive or supportive approach…It is important to remember that as a facilitator, the doctor’s role is to support and guide the student in their learning, not to police their behaviour” and (2) “While it is important [for the student] to be honest about how she obtained information, encouraging her to declare this to the medical school may be premature at this point. It may be more appropriate to first have a conversation with [the student] to understand why she sought additional information and to provide guidance on appropriate conduct.”</p>
        <p>In the ranking section of the examination, ChatGPT scored its lowest marks (50%) on a question asking about the appropriateness of certain actions when one has arrived home after one’s shift and realizes that they forgot to handover an urgent blood sample that needs to be taken today. ChatGPT deemed returning to the ward immediately to perform the blood test as the most appropriate action, whereas the official marking labeled this as one of the less appropriate options. ChatGPT also ranked telephoning the ward and leaving a message with the nursing team as a less appropriate option because “the nursing team may not have the necessary information or authority to take appropriate action for the patient,” while the official marking classified this as one of the more appropriate actions.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>We evaluated the performance of ChatGPT on the SJT: a national examination for final-year medical students in the United Kingdom, which assesses attributes including communication, teamwork, patient safety, prioritization skills, professionalism, and ethics. Overall, ChatGPT scored 76% on the examination. It answered 0%, 23%, and 6% of the questions entirely correctly in the rating, multiple-choice, and ranking sections of the examination, respectively, but was mostly correct for 100%, 50%, and 94% of the questions in these sections. ChatGPT scored consistently across the 4 key domains of Good Medical Practice.</p>
        <p>ChatGPT’s overall performance was impressive considering that it was correct or mostly correct for the majority of questions in the examination. However, the proportion of the questions that were answered with 100% accuracy was lower than expected with its best performance being in the multiple-choice section, in which it chose the 3 correct options in approximately one-fourth of the questions. This could be due to flaws in ChatGPT’s reasoning in some of these situations. However, ChatGPT’s low proportion of entirely correct (100%) answers may also reflect inconsistencies within the examination itself. Several reports and student commentaries have suggested that there are significant discrepancies in the correct answers chosen by different experts [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]. If this is the case, the inconsistencies in the rationale underlying different questions and the official answers offered by the UKFPO may contribute to worse performance by ChatGPT on the examination. It is interesting to note that for some of the answers where ChatGPT significantly deviated from the official UKFPO answers, ChatGPT’s rationale for its answers came across as reasonable and insightful and would likely resonate with many candidates compared to the official answers provided by the UKFPO. It also raises the question of how large language models, such as ChatGPT, could be used to help standardize these types of situational judgement and professionalism examinations, by providing consistent answers and rationale throughout. In this context, ChatGPT could also serve as a preparation tool for prospective SJT candidates, although it is important to consider whether the ethical implications of this technology could widen disparities. 
For example, concerns have been raised regarding differential attainment between candidates from different ethnic groups, with SJT questions potentially reinforcing cultural biases [<xref ref-type="bibr" rid="ref13">13</xref>]. ChatGPT and other AI language models may inherit biases from the data that they are trained on [<xref ref-type="bibr" rid="ref17">17</xref>] and, hence, may reinforce these cultural biases in the context of the SJT. Access to these technologies, both in terms of awareness and financial capacity, may also further widen these disparities in performance instead of promoting equality and ensuring that the test is solely assessing aptitude.</p>
        <p>Interestingly, ChatGPT scored 65% in the multiple-choice section versus 78% and 80% in the rating and ranking sections, respectively. This may reflect that this large language model is better suited to tasks that involve ranking and prioritization rather than selecting from a list of most appropriate, or relevant, options for a given scenario. ChatGPT has been trained on a wide gamut of data available from the internet, which may not always be factually correct but, when amalgamated, may make the model more competent at dealing with open-ended questions that involve listing options in order of importance or relevance, as opposed to questions with individual correct answers.</p>
        <p>ChatGPT performed consistently across the 4 domains of Good Medical Practice, having scored between 75% and 78% across them. ChatGPT performed slightly better in the Knowledge, Skills and Performance and Safety and Quality domains than in the Communication, Partnership and Teamwork domain. We speculate that this could be explained by questions pertaining to knowledge and safety being more objective in nature, whereby patient safety and delivering high-quality care are always prioritized. These types of scenarios may provide ChatGPT with a more straightforward approach to classifying the appropriateness of the options, compared to questions pertaining to communication and teamwork where decision-making is more subjective and nuanced. However, the differences in ChatGPT’s performance across these domains were too small to provide more definitive insight.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>There were several limitations in this study: First, in practice, the raw score for this examination is converted to a 0-50–point scale, which is based on test-equating and a scaling conversion method that is not publicly available. We also do not have access to the results of medical students taking this practice examination and are therefore unable to directly compare ChatGPT’s performance to that of final-year medical students. Second, the answer sheet and rationales provided by the UKFPO for this examination only explicitly linked 49% of the questions and answer statements to the GMC domains outlined in Good Medical Practice. We therefore devised a method to link the remaining questions to the domains, which involved 2 independent reviewers and a tiebreaker, the results of which may have differed from those of the UKFPO. Third, many questions pertained to more than 1 domain of Good Medical Practice; hence, there was an overlap in questions across different domains when assessing ChatGPT’s performance in each domain. Fourth, our search was run on the February 2023 version of ChatGPT, and given the constant development of this large language model, future iterations may yield different outcomes.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>Overall, ChatGPT performed well in the examination but scored 100% for only a few questions, which may reflect inconsistencies in the examination or errors in ChatGPT’s reasoning (or both). This builds on the existing literature by demonstrating that AI-driven large language models such as ChatGPT not only perform well on a wide range of clinically based examinations, but also offer, for the most part, rational responses to professional and ethical dilemmas faced by doctors. Future research should focus on identifying patterns and inconsistencies in the ethical approaches of AI language models and mitigating potential biases in them. Directly comparing the performance of these types of models with that of human candidates in relation to situational judgement dilemmas will provide more direct insight into their performance relative to that of humans. If the ethical foundations of models such as ChatGPT are deemed appropriate and reliable, it would provide the opportunity for integration directly into medical education with, for example, interactive platforms, simulated scenarios related to situational judgement, and personalized feedback, as well as standardization of examinations. Finally, in order to achieve this, it will be crucial to use a collaborative approach among experts in AI, medicine, and medical education to realize the full potential of these new technologies. Addressing these points will help develop this field and promote the integration of large language models, such as ChatGPT, into medical education, thus helping to standardize assessments that evaluate professionalism and ethics while maintaining high-quality and equitable medical education standards.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Situational Judgement Test templates.</p>
        <media xlink:href="mededu_v9i1e48978_app1.docx" xlink:title="DOCX File , 13 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Supporting information—ChatGPT output.</p>
        <media xlink:href="mededu_v9i1e48978_app2.docx" xlink:title="DOCX File , 78 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Raw scores and GMC domains for each question in the SJT exam. GMC: General Medical Council; SJT: Situational Judgement Test.</p>
        <media xlink:href="mededu_v9i1e48978_app3.xlsx" xlink:title="XLSX File  (Microsoft Excel File), 11 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">GMC</term>
          <def>
            <p>General Medical Council</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">GP</term>
          <def>
            <p>general practitioner</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">SJT</term>
          <def>
            <p>Situational Judgement Test</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">UKFPO</term>
          <def>
            <p>UK Foundation Programme Office</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">USMLE</term>
          <def>
            <p>United States Medical Licensing Examination</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>TB</given-names>
            </name>
            <name name-style="western">
              <surname>Mann</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ryder</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Subbiah</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kaplan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dhariwal</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Neelakantan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shyam</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Sastry</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Askell</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Herbert-Voss</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Krueger</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Henighan</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Child</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Ramesh</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ziegler</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Winter</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hesse</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sigler</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Litwin</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gray</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chess</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Berner</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>McCandlish</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Radford</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Amodei</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Language models are few-shot learners</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online May 28, 2020.</comment>
          <pub-id pub-id-type="doi">10.5860/choice.189890</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Hickman</surname>
              <given-names>KE</given-names>
            </name>
            <name name-style="western">
              <surname>Monahan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Schwarcz</surname>
              <given-names>DB</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT Goes to Law School</article-title>
          <source>SSRN Journal</source>
          <year>2023</year>
          <pub-id pub-id-type="doi">10.2139/ssrn.4335905</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Terwiesch</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Would Chat GPT Get a Wharton MBA? New White Paper By Christian Terwiesch</article-title>
          <source>Mack Institute for Innovation Management at the Wharton School, University of Pennsylvania</source>
          <year>2023</year>
          <access-date>2023-07-26</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mackinstitute.wharton.upenn.edu/2023/would-chat-gpt3-get-a-wharton-mba-new-white-paper-by-christian-terwiesch/">https://mackinstitute.wharton.upenn.edu/2023/would-chat-gpt3-get-a-wharton-mba-new-white-paper-by-christian-terwiesch/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kung</surname>
              <given-names>TH</given-names>
            </name>
            <name name-style="western">
              <surname>Cheatham</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Medenilla</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sillos</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>De Leon</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Elepaño</surname>
              <given-names>Camille</given-names>
            </name>
            <name name-style="western">
              <surname>Madriaga</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aggabao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Diaz-Candido</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Maningo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tseng</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on USMLE: Potential for AI-assisted medical education using large language models</article-title>
          <source>PLOS Digit Health</source>
          <year>2023</year>
          <month>03</month>
          <day>9</day>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>e0000198</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36812645"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id>
          <pub-id pub-id-type="medline">36812645</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-22-00371</pub-id>
          <pub-id pub-id-type="pmcid">PMC9931230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Giannos</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Delardas</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on UK Standardized Admission Tests: Insights From the BMAT, TMUA, LNAT, and TSA Examinations</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>04</month>
          <day>26</day>
          <volume>9</volume>
          <fpage>e47737</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023//e47737/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/47737</pub-id>
          <pub-id pub-id-type="medline">37099373</pub-id>
          <pub-id pub-id-type="pii">v9i1e47737</pub-id>
          <pub-id pub-id-type="pmcid">PMC10173042</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thirunavukarasu</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Hassan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Mahmood</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sanghera</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Barzangi</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>El Mukashfi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Trialling a Large Language Model (ChatGPT) in General Practice With the Applied Knowledge Test: Observational Study Demonstrating Opportunities and Limitations in Primary Care</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>04</month>
          <day>21</day>
          <volume>9</volume>
          <fpage>e46599</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023//e46599/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/46599</pub-id>
          <pub-id pub-id-type="medline">37083633</pub-id>
          <pub-id pub-id-type="pii">v9i1e46599</pub-id>
          <pub-id pub-id-type="pmcid">PMC10163403</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Giannos</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Evaluating the limits of AI in medical specialisation: ChatGPT's performance on the UK Neurology Specialty Certificate Examination</article-title>
          <source>BMJ Neurol Open</source>
          <year>2023</year>
          <month>06</month>
          <day>15</day>
          <volume>5</volume>
          <issue>1</issue>
          <fpage>e000451</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37337531"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmjno-2023-000451</pub-id>
          <pub-id pub-id-type="medline">37337531</pub-id>
          <pub-id pub-id-type="pii">bmjno-2023-000451</pub-id>
          <pub-id pub-id-type="pmcid">PMC10277081</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mbakwe</surname>
              <given-names>AB</given-names>
            </name>
            <name name-style="western">
              <surname>Lourentzou</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Mechanic</surname>
              <given-names>OJ</given-names>
            </name>
            <name name-style="western">
              <surname>Dagan</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT passing USMLE shines a spotlight on the flaws of medical education</article-title>
          <source>PLOS Digit Health</source>
          <year>2023</year>
          <month>03</month>
          <day>9</day>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>e0000205</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36812618"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000205</pub-id>
          <pub-id pub-id-type="medline">36812618</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-23-00027</pub-id>
          <pub-id pub-id-type="pmcid">PMC9931307</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="web">
          <article-title>Good medical practice</article-title>
          <source>General Medical Council</source>
          <access-date>2023-03-15</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.gmc-uk.org/ethical-guidance/ethical-guidance-for-doctors/good-medical-practice">https://www.gmc-uk.org/ethical-guidance/ethical-guidance-for-doctors/good-medical-practice</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schubert</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ortwein</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Dumitsch</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Schwantes</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Wilhelm</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Kiessling</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>A situational judgement test of professional behaviour: development and validation</article-title>
          <source>Med Teach</source>
          <year>2008</year>
          <month>06</month>
          <day>03</day>
          <volume>30</volume>
          <issue>5</issue>
          <fpage>528</fpage>
          <lpage>33</lpage>
          <pub-id pub-id-type="doi">10.1080/01421590801952994</pub-id>
          <pub-id pub-id-type="medline">18576192</pub-id>
          <pub-id pub-id-type="pii">792746518</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sharma</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Medical students' perceptions of the situational judgement test: a mixed methods study</article-title>
          <source>Br J Hosp Med (Lond)</source>
          <year>2015</year>
          <month>04</month>
          <day>02</day>
          <volume>76</volume>
          <issue>4</issue>
          <fpage>234</fpage>
          <lpage>8</lpage>
          <pub-id pub-id-type="doi">10.12968/hmed.2015.76.4.234</pub-id>
          <pub-id pub-id-type="medline">25853355</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Beesley</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Sharma</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Walsh</surname>
              <given-names>JL</given-names>
            </name>
            <name name-style="western">
              <surname>Wilson</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Harris</surname>
              <given-names>BHL</given-names>
            </name>
          </person-group>
          <article-title>Situational judgment tests: Who knows the right answers?</article-title>
          <source>Medical Teacher</source>
          <year>2017</year>
          <month>08</month>
          <day>24</day>
          <volume>39</volume>
          <issue>12</issue>
          <fpage>1293</fpage>
          <lpage>1294</lpage>
          <pub-id pub-id-type="doi">10.1080/0142159x.2017.1367766</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nabavi</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>How appropriate is the situational judgment test in assessing future foundation doctors?</article-title>
          <source>BMJ</source>
          <year>2023</year>
          <month>01</month>
          <day>13</day>
          <volume>380</volume>
          <fpage>101</fpage>
          <pub-id pub-id-type="doi">10.1136/bmj.p101</pub-id>
          <pub-id pub-id-type="medline">36639167</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="web">
          <article-title>ChatGPT General FAQ</article-title>
          <source>OpenAI</source>
          <access-date>2023-03-15</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://help.openai.com/en/articles/6783457-chatgpt-general-faq">https://help.openai.com/en/articles/6783457-chatgpt-general-faq</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Exploring the Limits of ChatGPT for Query or Aspect-based Text Summarization</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online February 16, 2023.</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2302.08081</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>UKFPO</collab>
          </person-group>
          <article-title>Practice SJT papers</article-title>
          <source>UK Foundation Programme</source>
          <access-date>2023-04-13</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://foundationprogramme.nhs.uk/resources/situational-judgement-test-sjt/practice-sjt-papers/">https://foundationprogramme.nhs.uk/resources/situational-judgement-test-sjt/practice-sjt-papers/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ray</surname>
              <given-names>PP</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT: A comprehensive review on background, applications, key challenges, bias, ethics, limitations and future scope</article-title>
          <source>Internet of Things and Cyber-Physical Systems</source>
          <year>2023</year>
          <volume>3</volume>
          <fpage>121</fpage>
          <lpage>154</lpage>
          <pub-id pub-id-type="doi">10.1016/j.iotcps.2023.04.003</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
