<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id><journal-id journal-id-type="publisher-id">mededu</journal-id><journal-id journal-id-type="index">20</journal-id><journal-title>JMIR Medical Education</journal-title><abbrev-journal-title>JMIR Med Educ</abbrev-journal-title><issn pub-type="epub">2369-3762</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e56342</article-id><article-id pub-id-type="doi">10.2196/56342</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Assessing the Ability of a Large Language Model to Score Free-Text Medical Student Clinical Notes: Quantitative Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Burke</surname><given-names>Harry B</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hoang</surname><given-names>Albert</given-names></name><degrees>PhD, DSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lopreiato</surname><given-names>Joseph O</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>King</surname><given-names>Heidi</given-names></name><degrees>MS</degrees><xref 
ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hemmer</surname><given-names>Paul</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Montgomery</surname><given-names>Michael</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Gagarin</surname><given-names>Viktoria</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Uniformed Services University of the Health Sciences</institution>, <addr-line>Bethesda</addr-line><addr-line>MD</addr-line>, <country>United States</country></aff><aff id="aff2"><institution>Defense Health Agency</institution>, <addr-line>Falls Church</addr-line><addr-line>VA</addr-line>, <country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Lesselroth</surname><given-names>Blake</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>DiGiammarino</surname><given-names>Alicia</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Chartash</surname><given-names>David</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Harry B Burke, MD, PhD, Uniformed Services University of the Health Sciences, Bethesda, MD, 20814, United States, 1 301-938-2212; <email>harry.burke@gmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2024</year></pub-date><pub-date pub-type="epub"><day>25</day><month>7</month><year>2024</year></pub-date><volume>10</volume><elocation-id>e56342</elocation-id><history><date 
date-type="received"><day>15</day><month>01</month><year>2024</year></date><date date-type="rev-recd"><day>22</day><month>02</month><year>2024</year></date><date date-type="accepted"><day>06</day><month>05</month><year>2024</year></date></history><copyright-statement>&#x00A9; Harry B Burke, Albert Hoang, Joseph O Lopreiato, Heidi King, Paul Hemmer, Michael Montgomery, Viktoria Gagarin. Originally published in JMIR Medical Education (<ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org">https://mededu.jmir.org</ext-link>), 25.7.2024. </copyright-statement><copyright-year>2024</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org/">https://mededu.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mededu.jmir.org/2024/1/e56342"/><abstract><sec><title>Background</title><p>Teaching medical students the skills required to acquire, interpret, apply, and communicate clinical information is an integral part of medical education. 
A crucial aspect of this process involves providing students with feedback regarding the quality of their free-text clinical notes.</p></sec><sec><title>Objective</title><p>The goal of this study was to assess the ability of ChatGPT 3.5, a large language model, to score medical students&#x2019; free-text history and physical notes.</p></sec><sec sec-type="methods"><title>Methods</title><p>This is a single-institution, retrospective study. Standardized patients learned a prespecified clinical case and, acting as the patient, interacted with medical students. Each student wrote a free-text history and physical note of their interaction. The students&#x2019; notes were scored independently by the standardized patients and ChatGPT using a prespecified scoring rubric that consisted of 85 case elements. The measure of accuracy was percent correct.</p></sec><sec sec-type="results"><title>Results</title><p>The study population consisted of 168 first-year medical students. There was a total of 14,280 scores. The ChatGPT incorrect scoring rate was 1.0%, and the standardized patient incorrect scoring rate was 7.2%. The ChatGPT error rate was 86% lower than the standardized patient error rate. The ChatGPT mean incorrect scoring rate of 12 (SD 11) was significantly lower than the standardized patient mean incorrect scoring rate of 85 (SD 74; <italic>P</italic>=.002).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>ChatGPT demonstrated a significantly lower error rate compared to standardized patients. This is the first study to assess the ability of a generative pretrained transformer (GPT) program to score medical students&#x2019; standardized patient-based free-text clinical notes. It is expected that, in the near future, large language models will provide real-time feedback to practicing physicians regarding their free-text notes. 
GPT artificial intelligence programs represent an important advance in medical education and medical practice.</p></sec></abstract><kwd-group><kwd>medical education</kwd><kwd>generative artificial intelligence</kwd><kwd>natural language processing</kwd><kwd>ChatGPT</kwd><kwd>generative pretrained transformer</kwd><kwd>standardized patients</kwd><kwd>clinical notes</kwd><kwd>free-text notes</kwd><kwd>history and physical examination</kwd><kwd>large language model</kwd><kwd>LLM</kwd><kwd>medical student</kwd><kwd>medical students</kwd><kwd>clinical information</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd><kwd>patients</kwd><kwd>patient</kwd><kwd>medicine</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Teaching medical students the skills required to acquire, interpret, apply, and communicate medical information is an integral part of medical education. A crucial aspect of this process involves providing students with feedback regarding the quality of their free-text clinical notes. Various methods have been used to systematically assess clinical notes, notably, QNOTE [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>], but they depend on human raters. This reliance presents numerous challenges, including rater recruitment and training as well as raters&#x2019; availability and inclination to perform reviews. Furthermore, humans are susceptible to biases, fatigue, and misinterpretation.</p><p>An attractive innovative alternative to human raters is to use a natural language processing (NLP) program to score student notes. An NLP program is a computer-based algorithm that automatically detects specific meanings in free text. 
The potential advantages of using an NLP program to grade student notes include the following: it is systematic; it is objective; it avoids human bias, fatigue, and misinterpretation; it is essentially free to run; it can assess any number of notes in seconds; and it can grade notes in real time to provide immediate student feedback.</p><p>A new type of NLP program was introduced in November 2022, namely, ChatGPT 3.5 (OpenAI) [<xref ref-type="bibr" rid="ref3">3</xref>], a large language model (LLM) based on the generative pretrained transformer (GPT) artificial intelligence algorithm. It has achieved a 91.7% score on the United States Medical Licensing Examination (USMLE) style questions [<xref ref-type="bibr" rid="ref4">4</xref>]. Furthermore, it scored 87.3% on a clinical knowledge test, 91.7% on medical genetics, 89.2% on anatomy, and 92.4% on professional medicine [<xref ref-type="bibr" rid="ref4">4</xref>]. Its medical-related capabilities include improving clinician empathy [<xref ref-type="bibr" rid="ref5">5</xref>], responding to patient questions [<xref ref-type="bibr" rid="ref6">6</xref>], performing differential diagnoses [<xref ref-type="bibr" rid="ref7">7</xref>], classifying radiology reports [<xref ref-type="bibr" rid="ref8">8</xref>], writing discharge summaries [<xref ref-type="bibr" rid="ref9">9</xref>], providing accurate prevention advice to patients [<xref ref-type="bibr" rid="ref10">10</xref>], and predicting suicide risk [<xref ref-type="bibr" rid="ref11">11</xref>]. ChatGPT has been compared to human raters in terms of grading short-answer preclerkship medical questions. The ChatGPT-human Spearman correlations for a single assessor ranged from 0.6 to 0.7 [<xref ref-type="bibr" rid="ref12">12</xref>].</p><p>We assessed ChatGPT&#x2019;s ability to accurately score medical students&#x2019; free-text notes on history of present illness, physical examination, and assessment and plan. 
We compared these scores to standardized patients&#x2019; scoring of the clinical notes. We hypothesized that ChatGPT would be more accurate than standardized patients. To our knowledge, this is the first study to assess the ability of a GPT program to score medical students&#x2019; standardized, patient-based, clinical free-text notes.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Procedure</title><p>This was a single institution, retrospective study. Standardized patients were people who volunteered to interact with medical students to assist in their clinical training. They were trained on a prespecified medical case, and acting as the patient, they interacted with first-year medical students, simulating a patient with that condition. This included responding to clinical questions and undergoing an examination by the medical student. The students documented their interaction with standardized patients in free-text clinical notes. They wrote a chief complaint; history of the present illness; review of systems; physical examination; and differential diagnosis, featuring 3 rank-ordered diagnoses. In addition, they provided their pertinent positives and negatives and suggested follow-up tests. At our medical school, standardized patients provided verbal feedback to students regarding their interaction and scored their students&#x2019; notes. They had 7&#x2010;10 days to score the student notes and send the results to the course instructor. They did not provide any grading feedback to the students. 
The advantage of using standardized patients over actual patients for training medical students is that the medical students&#x2019; experiences, and therefore, their clinical notes are based on a consistent clinical presentation.</p><p>The study case and scoring rubric, &#x201C;Suzy Whitworth,&#x201D; were developed by the Association for Standardized Patient Educators and adapted by the Mid-Atlantic Consortium of Clinical Skills Centers in June 2018, with additional formatting edits in January 2019. The standardized patients were trained on this case and its scoring rubric. The case contained 85 scorable elements that were expected to be present in the students&#x2019; notes. Three scoring rubric examples were as follows: &#x201C;Notes chief complaint of shortness of breath (shortness of breath, dyspnea, difficulty breathing, and can&#x2019;t catch my breath)&#x201D;; &#x201C;Notes sudden onset (acute, all of the sudden, and all at once)&#x201D;; and &#x201C;Notes timing (a few hours ago, this morning, upon awakening, or today).&#x201D; The rubric combined the 85 scorable elements into 12 classes. ChatGPT and the standardized patients scored as either correct or incorrect each of the 85 elements in the deidentified students&#x2019; notes. An error was either an incorrect answer or the absence of an answer. A reviewer checked the standardized patient scoring and the ChatGPT scoring and a second reviewer checked the first reviewer&#x2019;s scores.</p><p>ChatGPT is an LLM based on the GPT artificial intelligence algorithm. It was pretrained on 45 TB of data and it consists of attention, which connects and weights natural language meanings, and an artificial neural network, which organizes and stores the meanings [<xref ref-type="bibr" rid="ref13">13</xref>]. It accepts natural language input and provides natural language output. 
For each medical student and for each rubric, the researcher created a new prompt that asked ChatGPT if the rubric&#x2019;s meaning was contained in the student&#x2019;s free-text note.</p><p>For ChatGPT and standardized patients, the measure of accuracy was the percent correct for each of the 12 categories and across the 12 categories. Student <italic>t</italic> tests (2-tailed) compared the mean error rate across the 12 classes for ChatGPT with the mean error rate across the 12 classes for the standardized patients using the R language (R Project for Statistical Computing) [<xref ref-type="bibr" rid="ref14">14</xref>].</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>Ethical approval was waived as per section 46.104(d) of Code of Federal Regulations, as this was a quality improvement project [<xref ref-type="bibr" rid="ref15">15</xref>].</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>The study population consisted of 168 first-year medical students, the case scoring rubric consisted of 85 elements, resulting in a total of 14,280 scores. There were 4 standardized patients, each working with one-fourth of the students. 
The incorrect scoring (error) rates for the standardized patients and ChatGPT are shown in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Incorrect scoring rates for ChatGPT and the standardized patients across free-text note categories and across all categories.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Category</td><td align="left" valign="bottom">Scoring opportunities for the 168 students, n</td><td align="left" valign="bottom">Standardized patient errors, n (%)</td><td align="left" valign="bottom">ChatGPT errors, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Chief complaint</td><td align="left" valign="top">840</td><td align="left" valign="top">135 (16.1)</td><td align="left" valign="top">17 (2.0)</td></tr><tr><td align="left" valign="top">History of present illness</td><td align="left" valign="top">1512</td><td align="left" valign="top">226 (14.9)</td><td align="left" valign="top">35 (2.3)</td></tr><tr><td align="left" valign="top">Review of systems</td><td align="left" valign="top">1008</td><td align="left" valign="top">67 (6.6)</td><td align="left" valign="top">7 (0.7)</td></tr><tr><td align="left" valign="top">Past medical history</td><td align="left" valign="top">1512</td><td align="left" valign="top">43 (2.8)</td><td align="left" valign="top">21 (1.4)</td></tr><tr><td align="left" valign="top">Physical exam</td><td align="left" valign="top">2352</td><td align="left" valign="top">181 (7.7)</td><td align="left" valign="top">25 (1.1)</td></tr><tr><td align="left" valign="top">Diagnosis (pulmonary embolism)</td><td align="left" valign="top">168</td><td align="left" valign="top">3 (1.8)</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">Pulmonary embolism evidence</td><td align="left" valign="top">2352</td><td align="left" valign="top">182 (7.7)</td><td align="left" 
valign="top">8 (0.3)</td></tr><tr><td align="left" valign="top">Diagnosis (pneumonia)</td><td align="left" valign="top">168</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">Pneumonia evidence</td><td align="left" valign="top">1848</td><td align="left" valign="top">66 (3.6)</td><td align="left" valign="top">4 (0.2)</td></tr><tr><td align="left" valign="top">Diagnosis (pneumothorax)</td><td align="left" valign="top">168</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">7 (4.2)</td></tr><tr><td align="left" valign="top">Pneumothorax evidence</td><td align="left" valign="top">1176</td><td align="left" valign="top">54 (4.6)</td><td align="left" valign="top">5 (0.4)</td></tr><tr><td align="left" valign="top">Diagnostic studies</td><td align="left" valign="top">1008</td><td align="left" valign="top">66 (6.5)</td><td align="left" valign="top">16 (1.6)</td></tr><tr><td align="left" valign="top">Total<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top">14,280</td><td align="left" valign="top">1023 (7.2)</td><td align="left" valign="top">145 (1.0)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>ChatGPT versus standardized patient; <italic>P</italic>=.002.</p></fn></table-wrap-foot></table-wrap><p>The category error rates for standardized patients and ChatGPT, respectively, were as follows: chief complaint: 135, 17; history of present illness: 226, 35; review of systems: 67, 7; past medical history: 43, 21; physical examination: 181, 25; first diagnosis: 3, 0; evidence for first diagnosis: 182, 8; second diagnosis: 0, 0; evidence for second diagnosis: 66, 4; third diagnosis: 0, 7; evidence for third diagnosis: 54, 5; and diagnostic studies: 66, 16. The ChatGPT incorrect scoring rate was 1.0%, and the standardized patient incorrect scoring rate was 7.2%. 
The ChatGPT error rate was 86% lower than the standardized patient error rate. The ChatGPT mean incorrect scoring rate of 12 (SD 11) was significantly lower than the standardized patient mean incorrect scoring rate of 85 (SD 74; <italic>P</italic>=.002).</p></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>ChatGPT had a significantly lower error rate compared to standardized patients. This suggests that an LLM can be used to score medical students&#x2019; notes.</p><p>NLP programs have been used in several medical education settings. Medical education NLPs have been based on keywords, expert systems, statistical algorithms, and combinations of these approaches. DaSilva and Dennick [<xref ref-type="bibr" rid="ref16">16</xref>] transcribed medical student problem-based verbal learning sessions and used an NLP program to count the frequency of technical words. Zhang et al [<xref ref-type="bibr" rid="ref17">17</xref>] implemented both a na&#x00EF;ve Bayes approach and a supervised support vector machine method to assess resident performance evaluations. Their sentiment accuracies were 0.845 for na&#x00EF;ve Bayes and 0.937 for the support vector machine. Spickard et al [<xref ref-type="bibr" rid="ref18">18</xref>] used an electronic scoring system to detect 25 core clinical problems in medical students&#x2019; clinical notes. They achieved a 75% positive predictive value (PPV) on 16 of the 25 problems. Denny et al [<xref ref-type="bibr" rid="ref19">19</xref>] examined whether students mentioned advance directives or altered mental status in their clinical notes. For advance directives, their sensitivity was 69% and their PPV was 100%, and for mental status, their sensitivity was 100% and their PPV was 93%. Sarker et al [<xref ref-type="bibr" rid="ref20">20</xref>] used a semisupervised NLP method to assess students&#x2019; free-text notes. Their accuracy over 21 cases and 105 notes was a sensitivity of 0.91 and a PPV of 0.87. 
Two recent papers from the University of Michigan&#x2019;s Department of Surgery [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>] assessed resident feedback and competency. Solano et al [<xref ref-type="bibr" rid="ref21">21</xref>] dichotomized the narrative surgical feedback given to residents into high and low quality and trained a logistic regression model to distinguish between them. Their model achieved a sensitivity of 0.37, a specificity of 0.97, and a receiver operating characteristic (ROC) of 0.86. Otles et al [<xref ref-type="bibr" rid="ref22">22</xref>] assessed narrative surgical resident feedback using a variety of statistical methods. The support vector machine algorithm achieved the best result with a maximum mean accuracy of 0.64. Abbott [<xref ref-type="bibr" rid="ref23">23</xref>] studied whether an NLP program could assess the clinical competency committee ratings of residents in terms of language that correlated with the 16 Accreditation Council for Graduate Medical Education Milestones. The ROCs for the 16 milestones ranged from 0.71 to 0.95 and the mean ROC was 0.83. Neves et al [<xref ref-type="bibr" rid="ref24">24</xref>] examined the ability of RapidMiner Studio, a machine learning program, to assess the quality of attending feedback on resident performance. Their accuracies ranged from 74.4% to 82.2%.</p><p>If NLP programs are to be used to automate the grading of students&#x2019; notes, they must achieve an acceptable accuracy. Sarker et al [<xref ref-type="bibr" rid="ref20">20</xref>] suggested that any method of scoring medical notes should achieve an accuracy close to 100%. Regrettably, none of the reported medical education NLPs achieved an acceptable accuracy. In our study, standardized patients also failed to achieve an acceptable accuracy. 
ChatGPT attained an accuracy close to 100% and is, therefore, suitable for scoring students&#x2019; free-text notes.</p><p>A potential limitation of this study is that it has been suggested that GPT-based methods have the potential to generate unreliable answers under certain circumstances. We did not find that to be true in our study. Another potential limitation is that, although ChatGPT is free to the public, it has resource requirements. It used 45 TB of data, it has 175 billion parameters, and it runs on supercomputers residing in the cloud. This is a great deal of computing power for student notes. Fortunately, there are open-source GPTs, for example, Meta&#x2019;s Llama, that can be run on a workstation. We would have liked to examine the standardized patient validity literature, but to our knowledge, no such study exists. Finally, assessing note errors does not directly address clinical reasoning.</p><p>An important advantage of LLMs is their ability to provide real-time scoring and feedback on student clinical free-text notes. This immediate assessment offers students a valuable learning opportunity because they can reflect on their performance while the clinical interaction is still fresh in their mind. Another advantage is that the scoring is accurate and objective so students will no longer have to worry about human error and bias. A disadvantage of ChatGPT was that it was time consuming. Fortunately, there are compound GPTs that can perform the entire assessment of all the elements and all the students at once. In terms of clinical reasoning, in the future, we will be asking medical students, as part of their clinical note write-up, to provide their clinical reasoning and we can have a GPT assess the quality of their reasoning.</p><p>It should be noted that the use of LLMs to score clinical notes need not be limited to medical students. 
It is expected that in the near future, GPT-based artificial intelligence NLPs will be applied to provide real-time feedback on free-text clinical notes to practicing physicians.</p><p>In conclusion, ChatGPT demonstrated a significantly lower error rate compared to standardized patients. This is the first study to assess the ability of a GPT program to score medical students&#x2019; standardized, patient-based, free-text clinical notes. GPT artificial intelligence programs represent an important advance in medical education and medical practice.</p></sec></body><back><ack><p>Support for this project was provided by the Patient Safety and Quality Academic Collaborative, a joint Defense Health Agency-Uniformed Services University program. The funder did not participate in the design, execution, or analysis of this project.</p><p>The opinions and assertions expressed in this paper are those of the authors and do not reflect the official policy or position of the Department of Defense, the Defense Health Agency, or the Uniformed Services University of the Health Sciences.</p></ack><notes><sec><title>Data Availability</title><p>The datasets used in this study are not publicly available because they include student scores, but they are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>HBB, AH, JOL, and PH made substantial contributions to the conception and design of the work; HBB, AH, JOL, HK, MM, and VG made substantial contributions to the acquisition, analysis, and interpretation of the data; HBB wrote the manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">GPT</term><def><p>generative pretrained transformer</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb3">NLP</term><def><p>natural language 
processing</p></def></def-item><def-item><term id="abb4">PPV</term><def><p>positive predictive value</p></def></def-item><def-item><term id="abb5">ROC</term><def><p>receiver operating characteristic</p></def></def-item><def-item><term id="abb6">USMLE</term><def><p>United States Medical Licensing Examination</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Burke</surname><given-names>HB</given-names> </name><name name-style="western"><surname>Hoang</surname><given-names>A</given-names> </name><name name-style="western"><surname>Becher</surname><given-names>D</given-names> </name><etal/></person-group><article-title>QNOTE: an instrument for measuring the quality of EHR clinical notes</article-title><source>J Am Med Inform Assoc</source><year>2014</year><volume>21</volume><issue>5</issue><fpage>910</fpage><lpage>916</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2013-002321</pub-id><pub-id pub-id-type="medline">24384231</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Burke</surname><given-names>HB</given-names> </name><name name-style="western"><surname>Sessums</surname><given-names>LL</given-names> </name><name name-style="western"><surname>Hoang</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Electronic health records improve clinical note quality</article-title><source>J Am Med Inform Assoc</source><year>2015</year><month>01</month><volume>22</volume><issue>1</issue><fpage>199</fpage><lpage>205</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2014-002726</pub-id><pub-id pub-id-type="medline">25342178</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation 
citation-type="web"><article-title>OpenAI</article-title><source>ChatGPT</source><access-date>2023-08-11</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/index/chatgpt/">https://openai.com/index/chatgpt/</ext-link></comment></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Gottweis</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Towards expert-level medical question answering with large language models</article-title><source>arXiv</source><comment>Preprint posted online on  May 16, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2305.09617</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sharma</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>IW</given-names> </name><name name-style="western"><surname>Miner</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Atkins</surname><given-names>DC</given-names> </name><name name-style="western"><surname>Althoff</surname><given-names>T</given-names> </name></person-group><article-title>Human&#x2013;AI collaboration enables more empathic conversations in text-based peer-to-peer mental health support</article-title><source>Nat Mach Intell</source><year>2023</year><month>01</month><volume>5</volume><issue>1</issue><fpage>46</fpage><lpage>57</lpage><pub-id pub-id-type="doi">10.1038/s42256-022-00593-2</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Ayers</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Poliak</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dredze</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Comparing physician and artificial intelligence chatbot responses to patient questions posted to a public social media forum</article-title><source>JAMA Intern Med</source><year>2023</year><month>06</month><day>1</day><volume>183</volume><issue>6</issue><fpage>589</fpage><lpage>596</lpage><pub-id pub-id-type="doi">10.1001/jamainternmed.2023.1838</pub-id><pub-id pub-id-type="medline">37115527</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hirosawa</surname><given-names>T</given-names> </name><name name-style="western"><surname>Harada</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yokose</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sakamoto</surname><given-names>T</given-names> </name><name name-style="western"><surname>Kawamura</surname><given-names>R</given-names> </name><name name-style="western"><surname>Shimizu</surname><given-names>T</given-names> </name></person-group><article-title>Diagnostic accuracy of differential-diagnosis lists generated by generative pretrained transformer 3 chatbot for clinical vignettes with common chief complaints: a pilot study</article-title><source>Int J Environ Res Public Health</source><year>2023</year><month>02</month><day>15</day><volume>20</volume><issue>4</issue><fpage>3378</fpage><pub-id pub-id-type="doi">10.3390/ijerph20043378</pub-id><pub-id pub-id-type="medline">36834073</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Olthof</surname><given-names>AW</given-names> </name><name name-style="western"><surname>Shouche</surname><given-names>P</given-names> </name><name name-style="western"><surname>Fennema</surname><given-names>EM</given-names> </name><etal/></person-group><article-title>Machine learning based natural language processing of radiology reports in orthopaedic trauma</article-title><source>Comput Methods Programs Biomed</source><year>2021</year><month>09</month><volume>208</volume><fpage>106304</fpage><pub-id pub-id-type="doi">10.1016/j.cmpb.2021.106304</pub-id><pub-id pub-id-type="medline">34333208</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Patel</surname><given-names>SB</given-names> </name><name name-style="western"><surname>Lam</surname><given-names>K</given-names> </name></person-group><article-title>ChatGPT: the future of discharge summaries?</article-title><source>Lancet Digit Health</source><year>2023</year><month>03</month><volume>5</volume><issue>3</issue><fpage>e107</fpage><lpage>e108</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(23)00021-3</pub-id><pub-id pub-id-type="medline">36754724</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sarraju</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bruemmer</surname><given-names>D</given-names> </name><name name-style="western"><surname>Van Iterson</surname><given-names>E</given-names> </name><name name-style="western"><surname>Cho</surname><given-names>L</given-names> </name><name name-style="western"><surname>Rodriguez</surname><given-names>F</given-names> </name><name name-style="western"><surname>Laffin</surname><given-names>L</given-names> </name></person-group><article-title>Appropriateness 
of cardiovascular disease prevention recommendations obtained from a popular online chat-based artificial intelligence model</article-title><source>JAMA</source><year>2023</year><month>03</month><day>14</day><volume>329</volume><issue>10</issue><fpage>842</fpage><lpage>844</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.1044</pub-id><pub-id pub-id-type="medline">36735264</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Burkhardt</surname><given-names>HA</given-names> </name><name name-style="western"><surname>Ding</surname><given-names>X</given-names> </name><name name-style="western"><surname>Kerbrat</surname><given-names>A</given-names> </name><name name-style="western"><surname>Comtois</surname><given-names>KA</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>T</given-names> </name></person-group><article-title>From benchmark to bedside: transfer learning from social media to patient-provider text messages for suicide risk prediction</article-title><source>J Am Med Inform Assoc</source><year>2023</year><month>05</month><day>19</day><volume>30</volume><issue>6</issue><fpage>1068</fpage><lpage>1078</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocad062</pub-id><pub-id pub-id-type="medline">37043748</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Morjaria</surname><given-names>L</given-names> </name><name name-style="western"><surname>Burns</surname><given-names>L</given-names> </name><name name-style="western"><surname>Bracken</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Examining the threat of ChatGPT to the validity of short answer assessments in an undergraduate medical program</article-title><source>J Med Educ Curric 
Dev</source><year>2023</year><volume>10</volume><fpage>23821205231204178</fpage><pub-id pub-id-type="doi">10.1177/23821205231204178</pub-id><pub-id pub-id-type="medline">37780034</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Vaswani</surname><given-names>A</given-names> </name><name name-style="western"><surname>Shazeer</surname><given-names>N</given-names> </name><name name-style="western"><surname>Parmar</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Attention is all you need</article-title><source>arXiv</source><comment>Preprint posted online on Aug 2, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.1706.03762</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="web"><source>The R Project for Statistical Computing</source><access-date>2024-07-19</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.r-project.org/">https://www.r-project.org/</ext-link></comment></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="web"><article-title>Code of Federal Regulations</article-title><source>National Archives</source><access-date>2024-07-19</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.ecfr.gov/current/title-45/subtitle-A/subchapter-A/part-46/subpart-A/section-46.104">https://www.ecfr.gov/current/title-45/subtitle-A/subchapter-A/part-46/subpart-A/section-46.104</ext-link></comment></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Da Silva</surname><given-names>AL</given-names> </name><name name-style="western"><surname>Dennick</surname><given-names>R</given-names> </name></person-group><article-title>Corpus analysis of problem-based learning transcripts: an 
exploratory study</article-title><source>Med Educ</source><year>2010</year><month>03</month><volume>44</volume><issue>3</issue><fpage>280</fpage><lpage>288</lpage><pub-id pub-id-type="doi">10.1111/j.1365-2923.2009.03575.x</pub-id><pub-id pub-id-type="medline">20444059</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>R</given-names> </name><name name-style="western"><surname>Pakhomov</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gladding</surname><given-names>S</given-names> </name><name name-style="western"><surname>Aylward</surname><given-names>M</given-names> </name><name name-style="western"><surname>Borman-Shoap</surname><given-names>E</given-names> </name><name name-style="western"><surname>Melton</surname><given-names>GB</given-names> </name></person-group><article-title>Automated assessment of medical training evaluation text</article-title><source>AMIA Annu Symp Proc</source><year>2012</year><volume>2012</volume><fpage>1459</fpage><lpage>1468</lpage><pub-id pub-id-type="medline">23304426</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Spickard</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ridinger</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wrenn</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Automatic scoring of medical students&#x2019; clinical notes to monitor learning in the workplace</article-title><source>Med Teach</source><year>2014</year><month>01</month><volume>36</volume><issue>1</issue><fpage>68</fpage><lpage>72</lpage><pub-id pub-id-type="doi">10.3109/0142159X.2013.849801</pub-id><pub-id 
pub-id-type="medline">24195470</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Denny</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Spickard</surname><given-names>A</given-names> </name><name name-style="western"><surname>Speltz</surname><given-names>PJ</given-names> </name><name name-style="western"><surname>Porier</surname><given-names>R</given-names> </name><name name-style="western"><surname>Rosenstiel</surname><given-names>DE</given-names> </name><name name-style="western"><surname>Powers</surname><given-names>JS</given-names> </name></person-group><article-title>Using natural language processing to provide personalized learning opportunities from trainee clinical notes</article-title><source>J Biomed Inform</source><year>2015</year><month>08</month><volume>56</volume><fpage>292</fpage><lpage>299</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2015.06.004</pub-id><pub-id pub-id-type="medline">26070431</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sarker</surname><given-names>A</given-names> </name><name name-style="western"><surname>Klein</surname><given-names>AZ</given-names> </name><name name-style="western"><surname>Mee</surname><given-names>J</given-names> </name><name name-style="western"><surname>Harik</surname><given-names>P</given-names> </name><name name-style="western"><surname>Gonzalez-Hernandez</surname><given-names>G</given-names> </name></person-group><article-title>An interpretable natural language processing system for written medical examination assessment</article-title><source>J Biomed Inform</source><year>2019</year><month>10</month><volume>98</volume><fpage>103268</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2019.103268</pub-id><pub-id 
pub-id-type="medline">31421211</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Solano</surname><given-names>QP</given-names> </name><name name-style="western"><surname>Hayward</surname><given-names>L</given-names> </name><name name-style="western"><surname>Chopra</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Natural language processing and assessment of resident feedback quality</article-title><source>J Surg Educ</source><year>2021</year><volume>78</volume><issue>6</issue><fpage>e72</fpage><lpage>e77</lpage><pub-id pub-id-type="doi">10.1016/j.jsurg.2021.05.012</pub-id><pub-id pub-id-type="medline">34167908</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>&#x00D6;tle&#x015F;</surname><given-names>E</given-names> </name><name name-style="western"><surname>Kendrick</surname><given-names>DE</given-names> </name><name name-style="western"><surname>Solano</surname><given-names>QP</given-names> </name><etal/></person-group><article-title>Using natural language processing to automatically assess feedback quality: findings from 3 surgical residencies</article-title><source>Acad Med</source><year>2021</year><month>10</month><day>1</day><volume>96</volume><issue>10</issue><fpage>1457</fpage><lpage>1460</lpage><pub-id pub-id-type="doi">10.1097/ACM.0000000000004153</pub-id><pub-id pub-id-type="medline">33951682</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abbott</surname><given-names>KL</given-names> </name><name name-style="western"><surname>George</surname><given-names>BC</given-names> </name><name 
name-style="western"><surname>Sandhu</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Natural language processing to estimate clinical competency committee ratings</article-title><source>J Surg Educ</source><year>2021</year><volume>78</volume><issue>6</issue><fpage>2046</fpage><lpage>2051</lpage><pub-id pub-id-type="doi">10.1016/j.jsurg.2021.06.013</pub-id><pub-id pub-id-type="medline">34266789</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Neves</surname><given-names>SE</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Ku</surname><given-names>CM</given-names> </name><etal/></person-group><article-title>Using machine learning to evaluate attending feedback on resident performance</article-title><source>Anesth Analg</source><year>2021</year><month>02</month><day>1</day><volume>132</volume><issue>2</issue><fpage>545</fpage><lpage>555</lpage><pub-id pub-id-type="doi">10.1213/ANE.0000000000005265</pub-id><pub-id pub-id-type="medline">33323789</pub-id></nlm-citation></ref></ref-list></back></article>