<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id><journal-id journal-id-type="publisher-id">mededu</journal-id><journal-id journal-id-type="index">20</journal-id><journal-title>JMIR Medical Education</journal-title><abbrev-journal-title>JMIR Med Educ</abbrev-journal-title><issn pub-type="epub">2369-3762</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v12i1e81673</article-id><article-id pub-id-type="doi">10.2196/81673</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>AI- vs Human-Based Assessment of Medical Interview Transcripts in a Generative AI&#x2013;Simulated Patient System: Cross-Sectional Validation Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Takahashi</surname><given-names>Hiromizu</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Shikino</surname><given-names>Kiyoshi</given-names></name><degrees>MD, MHPE, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kondo</surname><given-names>Takeshi</given-names></name><degrees>MD, MHPE, PhD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Yamada</surname><given-names>Yuji</given-names></name><degrees>MD, MPH, PhD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tomoda</surname><given-names>Yoshitaka</given-names></name><degrees>MD, PhD, FACP</degrees><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kishi</surname><given-names>Minoru</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Aiyama</surname><given-names>Yuki</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff8">8</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Nagai</surname><given-names>Sho</given-names></name><degrees>RN, PHN, MSN</degrees><xref ref-type="aff" rid="aff9">9</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Enomoto</surname><given-names>Akiko</given-names></name><degrees>RN, PHN, MSN</degrees><xref ref-type="aff" rid="aff9">9</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tokushima</surname><given-names>Yoshinori</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff10">10</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Shinohara</surname><given-names>Takahiro</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff11">11</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Sano</surname><given-names>Fumiaki</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Matsuura</surname><given-names>Takeshi</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff12">12</xref></contrib><contrib 
contrib-type="author"><name name-style="western"><surname>Watanabe</surname><given-names>Rikiya</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff13">13</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Naito</surname><given-names>Toshio</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of General Medicine, Faculty of Medicine, Juntendo University</institution><addr-line>3-1-3 Hongo, Bunkyo Tokyo, 1130033 Japan</addr-line><addr-line>Tokyo</addr-line><country>Japan</country></aff><aff id="aff2"><institution>Department of Community-Oriented Medical Education, Graduate School of Medicine, Chiba University</institution><addr-line>Chiba</addr-line><country>Japan</country></aff><aff id="aff3"><institution>Center for Postgraduate Clinical Training and Career Development, Nagoya University Hospital</institution><addr-line>Nagoya</addr-line><country>Japan</country></aff><aff id="aff4"><institution>The School of Health Professions Education, Maastricht University</institution><addr-line>Maastricht</addr-line><country>The Netherlands</country></aff><aff id="aff5"><institution>Brookdale Department of Geriatrics and Palliative Medicine, Icahn School of Medicine at Mount Sinai</institution><addr-line>New York</addr-line><addr-line>NY</addr-line><country>United States</country></aff><aff id="aff6"><institution>Department of General Internal Medicine, Itabashi Chuo Medical Center</institution><addr-line>Tokyo</addr-line><country>Japan</country></aff><aff id="aff7"><institution>Department of Internal Medicine, Nishiwaki Municipal Hospital</institution><addr-line>Hyogo</addr-line><country>Japan</country></aff><aff id="aff8"><institution>Anesthesiology and Critical Care Medicine, Tenri Hospital</institution><addr-line>Nara</addr-line><country>Japan</country></aff><aff id="aff9"><institution>Department of Nursing, School of Nursing, 
University of Human Environments</institution><addr-line>Aichi</addr-line><country>Japan</country></aff><aff id="aff10"><institution>Department of General Medicine, Saga University Hospital</institution><addr-line>Saga</addr-line><country>Japan</country></aff><aff id="aff11"><institution>Department of General Medicine, Graduate School of Medical and Dental Sciences, Institute of Science Tokyo</institution><addr-line>Tokyo</addr-line><country>Japan</country></aff><aff id="aff12"><institution>Department of General Medicine, Bibai City Hospital</institution><addr-line>Hokkaido</addr-line><country>Japan</country></aff><aff id="aff13"><institution>Department of General Internal Medicine, Kita-Harima Medical Center</institution><addr-line>Hyogo</addr-line><country>Japan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Hasan Sapci</surname><given-names>A</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Das</surname><given-names>Arpita</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Cunha</surname><given-names>Daniel</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Simhadri</surname><given-names>Sai Yellaiah</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Hiromizu Takahashi, MD, PhD, Department of General Medicine, Faculty of Medicine, Juntendo University, 3-1-3 Hongo, Bunkyo Tokyo, 1130033 Japan, Tokyo, Japan, 81 3-3813-3111; <email>hrtakaha@juntendo.ac.jp</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>17</day><month>2</month><year>2026</year></pub-date><volume>12</volume><elocation-id>e81673</elocation-id><history><date date-type="received"><day>21</day><month>08</month><year>2025</year></date><date 
date-type="rev-recd"><day>16</day><month>11</month><year>2025</year></date><date date-type="accepted"><day>14</day><month>12</month><year>2025</year></date></history><copyright-statement>&#x00A9; Hiromizu Takahashi, Kiyoshi Shikino, Takeshi Kondo, Yuji Yamada, Yoshitaka Tomoda, Minoru Kishi, Yuki Aiyama, Sho Nagai, Akiko Enomoto, Yoshinori Tokushima, Takahiro Shinohara, Fumiaki Sano, Takeshi Matsuura, Rikiya Watanabe, Toshio Naito. Originally published in JMIR Medical Education (<ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org">https://mededu.jmir.org</ext-link>), 17.2.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org/">https://mededu.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mededu.jmir.org/2026/1/e81673"/><abstract><sec><title>Background</title><p>Generative artificial intelligence (AI) is increasingly used in medical education, including AI-based virtual patients to improve interview skills. 
However, how much AI-based assessment (ABA) differs from human-based assessment (HBA) remains unclear.</p></sec><sec><title>Objective</title><p>This study aimed to compare the quality of clinical interview assessments generated via an ABA (GPT-o1 Pro [ABA-o1] and GPT-5 Pro [ABA-5]) with those generated via an HBA conducted by clinical instructors in an AI-based virtual patient setting. We also examined whether AI reduced evaluation time and assessed agreement across participants with different levels of clinical experience.</p></sec><sec sec-type="methods"><title>Methods</title><p>A standardized case of leg weakness was implemented in an AI-based virtual patient. Seven participants (2 medical students, 3 residents, and 2 attending physicians) each conducted an interview with the AI patient, and transcripts were scored using the 25-item Master Interview Rating Scale (0&#x2010;125). Three evaluation strategies were compared. First, GPT-o1 Pro and GPT-5 Pro scored each transcript 5 times with different random seeds to test case specificity. Processing time was logged automatically. Second, 5 blinded clinical instructors independently rated each transcript once using the same rubric. Third, reliability metrics were applied. For AI, intraclass correlation coefficients (ICCs) quantified repeatability. For humans, the ICC(2,1) was calculated. Agreement was quantified using the Pearson <italic>r</italic>, Lin concordance correlation coefficient, Bland-Altman limits of agreement, Cronbach &#x03B1;, and ICC. Time efficiency was expressed as mean minutes per transcript and relative percentage reduction.</p></sec><sec sec-type="results"><title>Results</title><p>Mean interview scores were similar across methods (ABA-o1: mean 52.1, SD 6.9; ABA-5: mean 53.2, SD 9.2; HBA: mean 53.7, SD 6.8). 
Agreement between ABA and HBA was strong (<italic>r</italic>=0.90; concordance correlation coefficient=0.88) with minimal bias (ABA-o1: mean 0.4, SD 2.7; ABA-5: mean 1.5, SD 5.2; limits of agreement: &#x2013;4.9 to 5.7 for ABA-o1 and &#x2013;8.6 to 11.7 for ABA-5). The Cronbach &#x03B1; was 0.81 (ABA-o1), 0.86 (ABA-5), and 0.80 (HBA); the ICC(3,1) was 0.77 (ABA-o1) and 0.82 (ABA-5); and the ICC(2,1) was 0.38 (HBA). The coefficient of variation for ABA was approximately half that of HBA (6.6% vs 13.9%). Processing time for 5 runs was 4 minutes, 19 seconds for ABA-o1 and 3 minutes, 20 seconds for ABA-5 vs 10 minutes, 16 seconds for physicians, corresponding to 58% and 67.6% reductions, respectively.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>ABA-o1 and ABA-5 produced scores closely matching HBA while demonstrating superior consistency and reliability. In the setting of virtual interview transcripts, these findings suggest that ABA may serve as a valid, rapid, and scalable alternative to HBA, reducing per-assessment time by over half. Applied strategically, AI-based scoring could enable timely feedback, improve efficiency, and reduce faculty workload. Further research is needed to confirm generalizability across broader settings.</p></sec></abstract><kwd-group><kwd>medical education</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd><kwd>virtual patient</kwd><kwd>clinical interview</kwd><kwd>ChatGPT</kwd><kwd>simulation-based learning</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Effective clinical interviewing is essential for making correct diagnoses and building strong relationships with patients [<xref ref-type="bibr" rid="ref1">1</xref>]. Traditionally, students learn these skills through supervised practice with real or standardized patients and feedback from faculty [<xref ref-type="bibr" rid="ref1">1</xref>]. 
However, this apprenticeship-style approach is time-intensive and limits opportunities for deliberate practice [<xref ref-type="bibr" rid="ref2">2</xref>].</p><p>The assessment component itself also consumes substantial faculty and resident physician (RP) time. In competency-based medical education (CBME), faculty complete numerous workplace-based assessment forms; one Canadian study found a mean of 3 minutes, 6 seconds per entrustable professional activity form, adding approximately 18 minutes of extra documentation time for each staff member every 4-week block [<xref ref-type="bibr" rid="ref3">3</xref>]. Multiprogram qualitative work further confirms that the cumulative &#x201C;assessment burden&#x201D; is now viewed as a major threat to sustainability, prompting programs to redesign processes to reduce administrative load [<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>Recently, generative artificial intelligence (AI) using large language models (LLMs) has enabled the creation of AI-based virtual patients that both converse with learners and automatically evaluate performance [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. Empirical studies have shown promising results for AI assessment in free-text clinical documentation [<xref ref-type="bibr" rid="ref6">6</xref>], script concordance testing [<xref ref-type="bibr" rid="ref7">7</xref>], and objective structured clinical examination (OSCE) history-taking stations [<xref ref-type="bibr" rid="ref8">8</xref>]. Many of these systems use validated rubrics such as the Master Interview Rating Scale (MIRS) to structure feedback [<xref ref-type="bibr" rid="ref9">9</xref>]. 
Nevertheless, the reliability and validity of AI-generated ratings remain understudied; therefore, establishing concordance with expert evaluations is a prerequisite for educational or licensure use.</p></sec><sec id="s1-2"><title>Objectives</title><p>This study compared AI-based assessment (ABA) scores of clinical interview performance using GPT-o1 Pro (OpenAI; ABA-o1) and GPT-5 Pro (OpenAI; ABA-5) with human-based assessment (HBA) scores. We hypothesized that ABA scores and HBA scores would exhibit strong concordance and that ABA scoring would serve as a substitute for HBA scoring. We also hypothesized that AI would complete evaluations more rapidly, reducing the assessment burden on clinicians. A secondary aim was to evaluate agreement across participants with differing clinical experience and evaluate whether the use of AI could lead to a measurable reduction in evaluation time, thereby contributing to overall efficiency in assessment processes.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design and Setting</title><p>A cross-sectional validation study was conducted. This study involved 2 medical students (MSs), 3 RPs, and 2 attending physicians (APs) who participated in standardized clinical scenarios.</p></sec><sec id="s2-2"><title>Virtual Patient Scenario</title><p>A man aged 27 years presenting with progressive bilateral leg weakness, particularly proximal, was scripted based on a published case of thyrotoxic periodic paralysis. The scenario, created by a general internal medicine specialist with extensive educational experience drawing directly on prior literature, included relevant clinical history (eg, recent myalgias, tremors, diarrhea, and insomnia), red flag cues (eg, acute onset, muscle weakness, and hypokalemia), and psychosocial factors (eg, recent immigration and use of herbal supplements). The case represented a classic presentation of thyrotoxic periodic paralysis caused by hyperthyroidism. 
The patient was implemented as an AI-simulated character using ChatGPT&#x2019;s custom generative pretrained transformers.</p></sec><sec id="s2-3"><title>Participants</title><p>The participants were recruited through convenience sampling complemented by snowball sampling. MSs comprised a third-year and a fifth-year student; RPs comprised 3 postgraduate year 1 residents; and APs comprised 2 board-certified physicians in internal medicine or general internal medicine in Japan, each with &#x2265;5 years of clinical teaching experience.</p><p>Each participant conducted a history-taking encounter by speaking with an AI patient. All conversations were recorded and transcribed verbatim. As all interviews took place within the ChatGPT-based simulated patient interface, transcripts were automatically generated from dialogue logs without manual correction.</p></sec><sec id="s2-4"><title>Scoring Instrument</title><p>The MIRS from the University of Tennessee was originally designed to assess 27 items. In this study, 25 of these items were evaluated based on the available conversational recordings. Each item was rated on a scale from 0 to 5 (total possible score: 0&#x2010;125) covering domains such as information gathering, organization, empathy, and patient-centered communication. The excluded items were nonverbal behavior and pace and flow of the interview, which require audiovisual input to evaluate.</p></sec><sec id="s2-5"><title>Assessment Methods</title><p>The main outcome was the comparison of MIRS scores.</p><p>For ABA-o1, each transcript was submitted separately to GPT-o1 Pro with a base prompt directing it to rate the encounter using the MIRS and justify each score. This process was repeated 5 times per transcript, and item-level and total scores were averaged across runs. For ABA-5, using the same base prompt, the 7 transcripts were scored in 2 batch submissions rather than individually. 
Run 1 included MS 1, MS 2, RP 1, and RP 2, and run 2 included RP 3, AP 1, and AP 2. For each batch, the prompt explicitly stated that it contained 4 interview transcripts (run 1) or 3 interview transcripts (run 2). For each participant within a batch, item-level and total MIRS scores were extracted from the model&#x2019;s output. The base prompt and model settings were held constant across runs, and the full prompt is provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> to support reproducibility. For both ABA-o1 and ABA-5, reproducibility was further examined by scoring each transcript 5 times using different random seeds.</p><p>For HBA, 5 blinded clinical instructors independently rated each transcript using the same MIRS rubric. All assessors were board certified in general internal medicine or general medicine in Japan, actively involved in medical education, and coauthors of this study (Y Tokushima, TS, RW, TM, and FS). Item-level and total scores were averaged across the 5 raters. 
To further ensure calibration beyond the preparatory webinar, raters briefly discussed scoring rationales for sample transcripts to reach consensus on the interpretation of rubric items.</p><p>The secondary outcome was the comparison of assessment time, which was assessed as follows:</p><list list-type="bullet"><list-item><p>Physician scoring time for HBA&#x2014;a stopwatch measured the time from transcript review to completion of scoring.</p></list-item><list-item><p>AI scoring time for ABA-o1&#x2014;the elapsed time was automatically recorded for each of the 7 individual submissions from prompt submission to receipt of the complete output.</p></list-item><list-item><p>AI scoring time for ABA-5&#x2014;the elapsed time was automatically recorded for each of the 2 batch submissions from prompt submission to receipt of the complete output.</p></list-item></list><p>For all 3 methods, mean assessment time and SD were calculated, and absolute and relative time savings of ABA vs HBA were reported.</p></sec><sec id="s2-6"><title>Statistical Analysis</title><p>All analyses used R (version 4.3.1; R Foundation for Statistical Computing). Descriptive statistics (mean and SD and coefficient of variation [CV]) summarized the scores. Agreement was assessed using the Pearson correlation coefficient (<italic>r</italic>) for linear associations; the Lin concordance correlation coefficient (CCC) for both correlation and bias, summarizing overall agreement in a single index; and Bland-Altman analysis for bias and limits of agreement (LoA).</p><p>Reliability metrics included the Cronbach &#x03B1; for internal consistency, and intraclass correlation coefficients (ICCs) were calculated to quantify (1) repeatability across the 5 independent GPT-o1 Pro and GPT-5 Pro runs (stability of scores when the same model was applied repeatedly to the same transcript) and (2) interrater reliability across the 5 physician raters (agreement among different human raters). 
A 2-sided &#x03B1; of &#x003C;.05 denoted significance.</p></sec><sec id="s2-7"><title>Ethical Considerations</title><p>Ethics approval was obtained from the Juntendo University institutional review board (approval E24-0314-U02). All participants provided written informed consent before taking part. To protect participants&#x2019; privacy and confidentiality, all interview transcripts and performance scores were deidentified prior to evaluation and analysis by assigning study IDs and removing any potentially identifying information. Only deidentified transcripts were shared with the physician raters, and results are reported in aggregate. Study data were stored on password-protected, access-restricted institutional systems, and only the research team had access. Participants received no financial compensation for participation.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Participant Scores</title><p><xref ref-type="table" rid="table1">Table 1</xref> summarizes the interview scores obtained via ABA-o1, ABA-5, and HBA. Across all 7 participants, group-level means were 53.7 (SD 6.8) for HBA, 53.2 (SD 9.2) for ABA-5, and 52.1 (SD 6.9) for ABA-o1. Within-participant variability (mean CV percentage) was similar for the 2 automated methods (ABA-o1=6.6%; ABA-5=6.6%) and higher for HBA (13.9%). 
Individual-level differences were generally small, although notable divergences arose for RP 2 when comparing HBA vs ABA-o1 (53.4 vs 46.8; &#x0394;=6.6) and for AP 2 when comparing ABA-5 vs HBA (67.8 vs 58.8; &#x0394;=9.0) and ABA-5 vs ABA-o1 (67.8 vs 55.6; &#x0394;=12.2).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Mean scores by method and participant (n=7).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Participant</td><td align="left" valign="bottom" colspan="2">HBA<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="bottom" colspan="2">ABA-o1<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="bottom" colspan="2">ABA-5<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top"/><td align="left" valign="top">Score (0-125), mean (SD)</td><td align="left" valign="top">CV<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup> (%)</td><td align="left" valign="top">Score (0-125), mean (SD)</td><td align="left" valign="top">CV (%)</td><td align="left" valign="top">Score (0-125), mean (SD)</td><td align="left" valign="top">CV (%)</td></tr><tr><td align="left" valign="top">Medical student 1</td><td align="left" valign="top">48.0 (8.9)</td><td align="left" valign="top">18.5</td><td align="left" valign="top">46.4 (2.4)</td><td align="char" char="." valign="top">5.2</td><td align="left" valign="top">46.0 (1.9)</td><td align="left" valign="top">4.1</td></tr><tr><td align="left" valign="top">Medical student 2</td><td align="left" valign="top">65.0 (9.7)</td><td align="left" valign="top">15.0</td><td align="left" valign="top">63.6 (5.1)</td><td align="char" char="." 
valign="top">8.1</td><td align="left" valign="top">64.6 (4.2)</td><td align="left" valign="top">6.5</td></tr><tr><td align="left" valign="top">Resident physician 1</td><td align="left" valign="top">47.0 (2.9)</td><td align="left" valign="top">6.2</td><td align="left" valign="top">46.8 (2.9)</td><td align="char" char="." valign="top">6.1</td><td align="left" valign="top">50.0 (2.6)</td><td align="left" valign="top">5.3</td></tr><tr><td align="left" valign="top">Resident physician 2</td><td align="left" valign="top">53.4 (7.2)</td><td align="left" valign="top">13.4</td><td align="left" valign="top">46.8 (3.3)</td><td align="char" char="." valign="top">7.2</td><td align="left" valign="top">51.0 (7.1)</td><td align="left" valign="top">14.0</td></tr><tr><td align="left" valign="top">Resident physician 3</td><td align="left" valign="top">47.2 (3.6)</td><td align="left" valign="top">7.6</td><td align="left" valign="top">47.6 (2.7)</td><td align="char" char="." valign="top">5.7</td><td align="left" valign="top">44.0 (1.0)</td><td align="left" valign="top">2.3</td></tr><tr><td align="left" valign="top">Attending physician 1</td><td align="left" valign="top">56.4 (9.4)</td><td align="left" valign="top">16.7</td><td align="left" valign="top">58.0 (5.4)</td><td align="char" char="." valign="top">9.3</td><td align="left" valign="top">49.2 (2.6)</td><td align="left" valign="top">5.3</td></tr><tr><td align="left" valign="top">Attending physician 2</td><td align="left" valign="top">58.8 (11.7)</td><td align="left" valign="top">19.8</td><td align="left" valign="top">55.6 (2.7)</td><td align="char" char="." valign="top">4.9</td><td align="left" valign="top">67.8 (6.2)</td><td align="left" valign="top">9.1</td></tr><tr><td align="left" valign="top">All</td><td align="left" valign="top">53.7 (6.8)</td><td align="left" valign="top">13.9</td><td align="left" valign="top">52.1 (6.9)</td><td align="char" char="." 
valign="top">6.6</td><td align="left" valign="top">53.2 (9.2)</td><td align="left" valign="top">6.6</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>HBA: human-based assessment.</p></fn><fn id="table1fn2"><p><sup>b</sup>ABA-o1: artificial intelligence&#x2013;based assessment (ABA) using GPT-o1 Pro.</p></fn><fn id="table1fn3"><p><sup>c</sup>ABA-5: ABA using GPT-5 Pro.</p></fn><fn id="table1fn4"><p><sup>d</sup>CV: coefficient of variation.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Agreement and Reliability Across ABA-o1, ABA-5, and HBA</title><p>Agreement and reliability were evaluated across the 3 rating methods (ABA-o1, ABA-5, and HBA). Pairwise concordance with HBA was high for both AI variants: ABA-o1 vs HBA showed a Pearson correlation coefficient (<italic>r</italic>) of 0.90 (95% CI 0.78&#x2010;0.96) and CCC of 0.88; ABA-5 vs HBA showed an <italic>r</italic> of 0.87 (95% CI 0.72&#x2010;0.94) and CCC of 0.86. Concordance between the 2 AI pipelines was the highest (ABA-o1 vs ABA-5: <italic>r</italic>=0.98, 95% CI 0.95&#x2010;0.99; CCC=0.98), indicating near interchangeability of the AI variants (<xref ref-type="table" rid="table2">Table 2</xref>). Internal consistency followed the same pattern: Cronbach &#x03B1; was 0.81, 0.86, and 0.80 for ABA-o1, ABA-5, and HBA, respectively.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Correlation, concordance, and internal consistency between artificial intelligence&#x2013;based assessment (ABA) and human-based assessment (HBA) scores. 
Higher values indicate stronger association or consistency.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Comparison</td><td align="left" valign="bottom">Number of items</td><td align="left" valign="bottom">Pearson <italic>r</italic> (95% CI)</td><td align="left" valign="bottom">Lin CCC<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">ABA-o1<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup> vs HBA</td><td align="left" valign="top">25</td><td align="left" valign="top">0.90 (0.78&#x2010;0.96)</td><td align="left" valign="top">0.88</td></tr><tr><td align="left" valign="top">ABA-5<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup> vs HBA</td><td align="left" valign="top">25</td><td align="left" valign="top">0.87 (0.72&#x2010;0.94)</td><td align="left" valign="top">0.86</td></tr><tr><td align="left" valign="top">ABA-o1 vs ABA-5</td><td align="left" valign="top">25</td><td align="left" valign="top">0.98 (0.95&#x2010;0.99)</td><td align="left" valign="top">0.98</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>CCC: concordance correlation coefficient.</p></fn><fn id="table2fn2"><p><sup>b</sup>ABA-o1: ABA using GPT-o1 Pro.</p></fn><fn id="table2fn3"><p><sup>c</sup>ABA-5: ABA using GPT-5 Pro.</p></fn></table-wrap-foot></table-wrap><p>All correlations were significant (2-sided <italic>P</italic>&#x003C;.001). 
Bland-Altman analyses comparing each ABA with HBA showed small positive mean biases (ABA-o1 vs HBA: +0.43 [SD of differences 2.70]; ABA-5 vs HBA: +1.54 [SD of differences 5.17]), with 95% LoA of &#x2212;4.87 to 5.72 and &#x2212;8.60 to 11.68, respectively; no proportional bias was observed in either comparison (<xref ref-type="fig" rid="figure1">Figures 1A and 1B</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Bland-Altman plots comparing artificial intelligence&#x2013;based assessment (ABA) with human-based assessment (HBA): (A) ABA using GPT-o1 Pro (ABA-o1) vs HBA (mean bias 0.43 [SD of differences 2.70]; limits of agreement [LoA]=&#x2212;4.87 to 5.72) and (B) ABA using GPT-5 Pro (ABA-5) vs HBA (mean bias 1.54 [SD of differences 5.17]; LoA=&#x2212;8.60 to 11.68). The points indicate participants (&#x00D7;). The solid line shows the mean bias; the dashed lines indicate the LoA.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v12i1e81673_fig01.png"/></fig><p>Repeatability was assessed using the ICC. ABA-o1 showed substantial repeatability across 5 independent runs (ICC(3,1)=0.77; ICC(3,5)=0.94), and ABA-5 likewise showed substantial repeatability (ICC(3,1)=0.82; ICC(3,5)=0.96). In contrast, interrater reliability among the 5 HBA physician raters was only fair on single measures (ICC(2,1)=0.38) and improved when averaging them (ICC(2,5)=0.75). Overall, both AI-based approaches yielded more stable ratings across repeated evaluations than HBA, with ABA-5 slightly more stable than ABA-o1.</p></sec><sec id="s3-3"><title>Scores by Training Level</title><p><xref ref-type="table" rid="table3">Table 3</xref> summarizes mean interview scores and SDs by training level. Across methods, APs had the highest means (HBA: 57.6, SD 1.7; ABA-o1: 56.8, SD 1.7; ABA-5: 58.5, SD 13.2). 
MSs were next (HBA: 56.5, SD 12.0; ABA-o1: 55.0, SD 12.2; ABA-5: 55.3, SD 13.2), in some cases approximating AP performance. RPs had the lowest means (HBA: 49.2, SD 3.6; ABA-o1: 47.1, SD 0.5; ABA-5: 48.3, SD 3.8). Therefore, the anticipated ordinal pattern (APs&#x003E;RPs&#x003E;MSs) was not consistently observed as MS means exceeded RP means across all methods.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Mean interview scores by training level as rated via human-based assessment (HBA) and artificial intelligence&#x2013;based assessment (ABA).</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Group</td><td align="left" valign="bottom">Participants per group, n</td><td align="left" valign="bottom">HBA score (0-125), mean (SD)</td><td align="left" valign="bottom">ABA-o1<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> score (0-125), mean (SD)</td><td align="left" valign="bottom">ABA-5<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup> score (0-125), mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">Attending physicians</td><td align="left" valign="top">2</td><td align="left" valign="top">57.6 (1.7)</td><td align="left" valign="top">56.8 (1.7)</td><td align="left" valign="top">58.5 (13.2)</td></tr><tr><td align="left" valign="top">Medical students</td><td align="left" valign="top">2</td><td align="left" valign="top">56.5 (12.0)</td><td align="left" valign="top">55.0 (12.2)</td><td align="left" valign="top">55.3 (13.2)</td></tr><tr><td align="left" valign="top">Resident physicians</td><td align="left" valign="top">3</td><td align="left" valign="top">49.2 (3.6)</td><td align="left" valign="top">47.1 (0.5)</td><td align="left" valign="top">48.3 (3.8)</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>ABA-o1: ABA using GPT-o1 Pro.</p></fn><fn id="table3fn2"><p><sup>b</sup>ABA-5: ABA using GPT-5 
Pro.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-4"><title>Processing Time (35 Cases)</title><p>Total processing time was 5 hours, 59 minutes, 35 seconds for the physician benchmark; 1 hour, 56 minutes, 38 seconds for ABA-5; and 2 hours, 31 minutes, 5 seconds for ABA-o1. Average time per case was 3 minutes, 19.9 seconds for ABA-5 (batch-to-batch SD 1 minute, 6 seconds); 4 minutes, 19 seconds for ABA-o1 (SD 3 minutes, 9 seconds); and 10 minutes, 16.4 seconds for the physicians (SD 11 minutes, 9 seconds). Relative to the physicians, total time was reduced by 67.6% with ABA-5 and 58% with ABA-o1 (<xref ref-type="table" rid="table4">Table 4</xref>).</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Analysis time by method (5 independent runs and raters per method). &#x201C;Batch-to-batch SD&#x201D; indicates across-run variability. &#x201C;Time reduction vs physicians&#x201D; indicates the percentage reduction relative to human-based assessment (HBA).</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Method</td><td align="left" valign="bottom">Total time</td><td align="left" valign="bottom">Mean time per case (batch-to-batch SD)</td><td align="left" valign="bottom">Time reduction vs physicians (%)</td></tr></thead><tbody><tr><td align="left" valign="top">ABA-5<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">1 h, 56 min, 38 s</td><td align="left" valign="top">3 min, 20 s (1 min, 6 s)</td><td align="left" valign="top">67.6</td></tr><tr><td align="left" valign="top">ABA-o1<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="top">2 h, 31 min, 5 s</td><td align="left" valign="top">4 min, 19 s (3 min, 9 s)</td><td align="left" valign="top">58.0</td></tr><tr><td align="left" valign="top">HBA</td><td align="left" valign="top">5 h, 59 min, 35 s</td><td align="left" valign="top">10 min, 16 s (11 min, 9 
s)</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>ABA-5: artificial intelligence&#x2013;based assessment (ABA) using GPT-5 Pro.</p></fn><fn id="table4fn2"><p><sup>b</sup>ABA-o1: ABA using GPT-o1 Pro.</p></fn><fn id="table4fn3"><p><sup>c</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>In this validation study comparing 3 rater groups (HBA, ABA-o1, and ABA-5), ABA-o1 and ABA-5 produced interview ratings that were statistically indistinguishable from those produced via HBA yet showed markedly superior psychometric stability relative to HBA (Cronbach &#x03B1;: ABA-o1=0.81, ABA-5=0.86, and HBA=0.80; ICC: ABA-o1=0.77, ABA-5=0.82, and HBA=0.38). Cronbach &#x03B1; values of &#x2265;0.8 indicate good internal consistency [<xref ref-type="bibr" rid="ref10">10</xref>], and ICC(2,1) values of &#x2265;0.75 denote good interrater reliability [<xref ref-type="bibr" rid="ref11">11</xref>]. Agreement metrics were likewise robust as evaluative tools: the CCC assesses both correlation and bias in a single index [<xref ref-type="bibr" rid="ref12">12</xref>], whereas Bland-Altman analysis remains the standard for visualizing bias and LoA [<xref ref-type="bibr" rid="ref13">13</xref>]. ABA-5 was benchmarked against HBA using the same agreement framework.</p><p>Although the observed differences in reliability were significant, they may also have practical implications in educational settings. The consistently higher internal consistency and interrater reliability suggest that ABA scoring (including ABA-o1 and ABA-5) could enhance assessment efficiency and reproducibility. 
Depending on the context, ABA may serve not only as a scalable adjunct but also as a viable alternative to human raters in transcript-based clinical interview evaluations, although this requires significantly larger-scale validation.</p></sec><sec id="s4-2"><title>Comparison With Prior Work</title><p>These findings corroborate previous work in which LLMs matched or exceeded faculty performance when scoring free-text notes [<xref ref-type="bibr" rid="ref6">6</xref>], designing script concordance tests [<xref ref-type="bibr" rid="ref7">7</xref>], and evaluating OSCE encounters [<xref ref-type="bibr" rid="ref8">8</xref>]. A recent study showed that GPT-4o can produce inpatient documentation of comparable quality to that produced by resident physicians while reducing charting time by &#x003E;50% [<xref ref-type="bibr" rid="ref14">14</xref>]. In particular, studies of OSCE history-taking and free-text documentation have begun to demonstrate that LLM-based raters can apply communication-focused rubrics in virtual or simulated encounters with performance comparable to that of trained faculty, underscoring the relevance of AI-supported assessment in simulation-based learning contexts [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. Consistent with ChatGPT&#x2019;s passing performance on the US Medical Licensing Examination [<xref ref-type="bibr" rid="ref15">15</xref>], this study suggests that foundation models possess clinically relevant semantic competence even in spoken communication tasks. 
Moreover, the 58% reduction in analysis time mirrors the 2025 Time for Class survey, where 36% of faculty who used generative AI daily reported a measurable workload decrease [<xref ref-type="bibr" rid="ref16">16</xref>].</p><p>Beyond efficiency, such time savings could play a decisive role in addressing the growing problem of clinician educator burnout and faculty shortages, which are societal challenges that threaten the sustainability of CBME [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. These concerns mirror prior reports that CBME assessment documentation requires several minutes per form and that the cumulative &#x201C;assessment burden&#x201D; is perceived as a threat to program sustainability [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. By automating labor-intensive scoring, AI can free physicians to devote more time to high-value coaching and mentorship, thereby enhancing both educator well-being and learner support [<xref ref-type="bibr" rid="ref17">17</xref>]. Furthermore, the superior scoring consistency observed with LLMs may help curb rater drift and cognitive biases such as leniency, halo, or contrast effects, which multicenter OSCE analyses have identified as long-recognized sources of unreliability and examiner-related variance in workplace-based assessments [<xref ref-type="bibr" rid="ref19">19</xref>]. 
Improved fairness and reliability in assessment would advance equity in trainee progression and, ultimately, foster a more competent, patient-centered workforce.</p></sec><sec id="s4-3"><title>Interpretation and Educational Implications</title><p>From an educational perspective, 3 observations are noteworthy when framed across the 3 rater groups (HBA, ABA-o1, and ABA-5).</p><sec id="s4-3-1"><title>Consistency vs Nuance</title><p>The score distributions for ABA-o1 and ABA-5 suggest that these models apply the rubric more consistently than HBA raters, likely because their underlying embeddings execute the criteria more deterministically once sampling stochasticity is averaged across runs. Consistency is a hallmark of fair assessment; however, the absence of human nuance in ABA-o1 and ABA-5 could miss contextual subtleties (eg, cultural cues and atypical communication styles) that HBA raters may detect. Such subtleties may include culturally patterned ways of showing respect or disagreement, indirect or high-context communication, and unconventional but effective rapport-building strategies that are difficult to fully capture in a text-based rubric. Accordingly, this balance between reproducibility and subtlety is central when integrating ABA into educational programs; in our view, ABA is best used to enhance reproducibility and efficiency, with human raters remaining essential for high-stakes decisions and for cases in which subtle contextual factors are educationally or ethically salient.</p></sec><sec id="s4-3-2"><title>Efficiency Gains</title><p>Relative to HBA (10 minutes, 16 seconds per case), ABA-5 and ABA-o1 reduced analytic time to 3 minutes, 20 seconds (&#x2013;67.6%) and 4 minutes, 19 seconds (&#x2013;58%) per case, respectively, amounting to approximately 240 and 210 faculty minutes saved across 35 encounters, respectively. 
In throughput terms, this corresponds to an increase from approximately 6 cases per hour with HBA to 18 cases per hour with ABA-5 and 14 cases per hour with ABA-o1, supporting more timely formative feedback and enabling the reinvestment of AI-derived efficiency gains into coaching rather than grading. When viewed alongside the lower CV and higher reliability indexes for AI-based scoring, these efficiency gains suggest that ABA could support more consistent and sustainable assessment practices within CBME frameworks [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. In addition, ABA-5 could process data for 3 to 4 individuals in a single run, reducing the need for repeated prompt inputs and minimizing data handling overhead.</p></sec><sec id="s4-3-3"><title>Level-Based Performance</title><p>MSs outperformed RPs on the same rubric in this cohort. This pattern may reflect (1) sampling error in a modest cohort, (2) case specificity favoring recently studied content, and/or (3) a rubric that emphasizes foundational communication more than advanced clinical reasoning. In particular, the MIRS prioritizes patient-centered communication behaviors that are heavily emphasized in undergraduate curricula and may be less sensitive to more advanced diagnostic reasoning skills typically developed during residency. Given the small number of participants and the single standardized case focused on a classic thyrotoxic periodic paralysis presentation, this unexpected pattern should be interpreted as a context-bound, hypothesis-generating finding rather than evidence that MSs generally outperform RPs in broader clinical performance. 
Replication with larger, more varied case sets and tiered rubrics evaluated across HBA, ABA-o1, and ABA-5 is warranted.</p></sec><sec id="s4-3-4"><title>Practical Implications</title><p>Practically, programs could deploy an &#x201C;AI-first, faculty-verified&#x201D; workflow in which ABA-o1 and ABA-5 provide rapid formative scores and narrative feedback immediately after an encounter and HBA then audits a random subset for quality assurance, similar to double reading in radiology. Such hybridity leverages the speed and reliability of LLMs while retaining human oversight for high-stakes decisions.</p></sec></sec><sec id="s4-4"><title>Strengths and Limitations</title><p>A key strength is the dual evaluation of accuracy (agreement) and efficiency (time), providing a more complete picture of implementation value than through accuracy alone. Nonetheless, several limitations warrant caution:</p><p>First, only 7 participants and a single thyrotoxic periodic paralysis scenario were tested, limiting generalizability across learner levels, languages, and clinical contexts. The small and homogeneous sample also restricts the psychometric interpretation of the findings; for example, differences in learner experience, case complexity, and language environment may influence both human and AI scoring behaviors. Therefore, these results should be viewed as preliminary and hypothesis generating rather than confirmatory. Second, convenience sampling and self-selection may have introduced bias toward technology-friendly participants. Third, model and prompt dependence was a limitation; the results correspond to GPT-o1 Pro and GPT-5 Pro with a specific rubric prompt; other LLMs or prompt engineering strategies could alter performance. Fourth, speech-to-text errors were not exhaustively audited and may have influenced ratings. 
In addition, the evaluation was limited to transcribed textual data; nonverbal cues, vocal tone, and conversational pauses present in the actual interviews could not be assessed. Fifth, there was potentially a systemic bias. High concordance does not preclude shared cognitive blind spots between AI and human raters; fairness audits across sex, accent, and cultural communication styles remain necessary. In practical implementations, this would entail periodic subgroup analyses of score distributions, qualitative review of discrepant cases, and predefined procedures for pausing or adjusting AI-based scoring if systematic disparities are detected.</p></sec><sec id="s4-5"><title>Future Research</title><p>Future studies should (1) evaluate multiple diverse clinical scenarios, including psychosocially complex cases; (2) compare real-time vs postencounter AI feedback; (3) examine learner outcomes such as skill acquisition and satisfaction; (4) conduct cost-effectiveness analyses at scale; and (5) develop and evaluate bias mitigation and explainability techniques&#x2014;such as routine fairness dashboards, scheduled revalidation against human ratings, and faculty-led oversight processes&#x2014;to satisfy accreditation requirements.</p><p>As this study was limited to transcript-based assessments of simulated encounters, future work is also needed to evaluate how well ABA scores correlate with actual clinical performance and whether AI can reduce assessor burden while maintaining fairness and reliability.</p></sec><sec id="s4-6"><title>Conclusions</title><p>Within the constraints of this pilot, GPT-o1 Pro and GPT-5 Pro matched expert physicians in scoring simulated patient interviews; produced more reliable ratings; and delivered a substantial 58% and 67.6% reduction in analytical time, respectively. These preliminary results indicate that LLMs could serve as a complementary or alternative tool to human raters for transcript-based interview assessments. 
This approach warrants further investigation as a means to contribute to assessment efficiency in medical education. Careful curricular design and continuous human oversight will be essential to ensure that such tools enhance rather than compromise the validity and equity of learner evaluations.</p></sec></sec></body><back><ack><p>The authors would like to thank the clinical instructors for their time and the participants for engaging in this educational pilot. The authors confirm that ChatGPT (OpenAI) was used during the study and manuscript preparation. Specifically, ChatGPT was used to assist with structuring and organizing the manuscript text (eg, improving clarity and logical flow of sentences and paragraphs) and the evaluation of medical interview transcripts according to predefined assessment criteria. All scientific content, study design, evaluation frameworks, data interpretation, and final editorial decisions were determined and verified by the authors, who take full responsibility for the content of the manuscript.</p></ack><notes><sec><title>Funding</title><p>This study was supported by Japan Society for the Promotion of Science Grant-in-Aid for Scientific Research JP23K05953; the Japan Medical Education Foundation research grant for financial year 2025; and the "Creating training hubs for advanced medical personnel" program from the Ministry of Education, Culture, Sports, Science, and Technology of Japan.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">ABA</term><def><p>artificial intelligence&#x2013;based assessment</p></def></def-item><def-item><term id="abb2">ABA-5</term><def><p>artificial intelligence&#x2013;based assessment using GPT-5 Pro</p></def></def-item><def-item><term id="abb3">ABA-o1</term><def><p>artificial intelligence&#x2013;based assessment using GPT-o1 Pro</p></def></def-item><def-item><term 
id="abb4">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb5">AP</term><def><p>attending physician</p></def></def-item><def-item><term id="abb6">CBME</term><def><p>competency-based medical education</p></def></def-item><def-item><term id="abb7">CCC</term><def><p>concordance correlation coefficient</p></def></def-item><def-item><term id="abb8">CV</term><def><p>coefficient of variation</p></def></def-item><def-item><term id="abb9">HBA</term><def><p>human-based assessment</p></def></def-item><def-item><term id="abb10">ICC</term><def><p>intraclass correlation coefficient</p></def></def-item><def-item><term id="abb11">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb12">LoA</term><def><p>limits of agreement</p></def></def-item><def-item><term id="abb13">MIRS</term><def><p>Master Interview Rating Scale</p></def></def-item><def-item><term id="abb14">MS</term><def><p>medical student</p></def></def-item><def-item><term id="abb15">OSCE</term><def><p>objective structured clinical examination</p></def></def-item><def-item><term id="abb16">RP</term><def><p>resident physician</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Talwalkar</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Fortin</surname><given-names>AH</given-names> </name><name name-style="western"><surname>Morrison</surname><given-names>LJ</given-names> </name><etal/></person-group><article-title>An advanced communication skills workshop using standardized patients for senior medical students</article-title><source>MedEdPORTAL</source><year>2021</year><month>05</month><day>27</day><volume>17</volume><fpage>11163</fpage><pub-id pub-id-type="doi">10.15766/mep_2374-8265.11163</pub-id><pub-id 
pub-id-type="medline">34124349</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cook</surname><given-names>DA</given-names> </name></person-group><article-title>Creating virtual patients using large language models: scalable, global, and low cost</article-title><source>Med Teach</source><year>2025</year><month>01</month><volume>47</volume><issue>1</issue><fpage>40</fpage><lpage>42</lpage><pub-id pub-id-type="doi">10.1080/0142159X.2024.2376879</pub-id><pub-id pub-id-type="medline">38992981</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cheung</surname><given-names>K</given-names> </name><name name-style="western"><surname>Rogoza</surname><given-names>C</given-names> </name><name name-style="western"><surname>Chung</surname><given-names>AD</given-names> </name><name name-style="western"><surname>Kwan</surname><given-names>BYM</given-names> </name></person-group><article-title>Analyzing the administrative burden of competency based medical education</article-title><source>Can Assoc Radiol J</source><year>2022</year><month>05</month><volume>73</volume><issue>2</issue><fpage>299</fpage><lpage>304</lpage><pub-id pub-id-type="doi">10.1177/08465371211038963</pub-id><pub-id pub-id-type="medline">34449283</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Szulewski</surname><given-names>A</given-names> </name><name name-style="western"><surname>Braund</surname><given-names>H</given-names> </name><name name-style="western"><surname>Dagnone</surname><given-names>DJ</given-names> </name><etal/></person-group><article-title>The assessment burden in competency-based medical education: how programs are 
adapting</article-title><source>Acad Med</source><year>2023</year><month>11</month><day>1</day><volume>98</volume><issue>11</issue><fpage>1261</fpage><lpage>1267</lpage><pub-id pub-id-type="doi">10.1097/ACM.0000000000005305</pub-id><pub-id pub-id-type="medline">37343164</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Takahashi</surname><given-names>H</given-names> </name><name name-style="western"><surname>Shikino</surname><given-names>K</given-names> </name><name name-style="western"><surname>Kondo</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Educational utility of clinical vignettes generated in Japanese by ChatGPT-4: mixed methods study</article-title><source>JMIR Med Educ</source><year>2024</year><month>08</month><day>13</day><volume>10</volume><fpage>e59133</fpage><pub-id pub-id-type="doi">10.2196/59133</pub-id><pub-id pub-id-type="medline">39137031</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Burke</surname><given-names>HB</given-names> </name><name name-style="western"><surname>Hoang</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lopreiato</surname><given-names>JO</given-names> </name><etal/></person-group><article-title>Assessing the ability of a large language model to score free-text medical student clinical notes: quantitative study</article-title><source>JMIR Med Educ</source><year>2024</year><month>07</month><day>25</day><volume>10</volume><fpage>e56342</fpage><pub-id pub-id-type="doi">10.2196/56342</pub-id><pub-id pub-id-type="medline">39118469</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Hudon</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kiepura</surname><given-names>B</given-names> </name><name name-style="western"><surname>Pelletier</surname><given-names>M</given-names> </name><name name-style="western"><surname>Phan</surname><given-names>V</given-names> </name></person-group><article-title>Using ChatGPT in psychiatry to design script concordance tests in undergraduate medical education: mixed methods study</article-title><source>JMIR Med Educ</source><year>2024</year><month>04</month><day>4</day><volume>10</volume><fpage>e54067</fpage><pub-id pub-id-type="doi">10.2196/54067</pub-id><pub-id pub-id-type="medline">38596832</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>TY</given-names> </name><name name-style="western"><surname>Hsieh</surname><given-names>PH</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>YC</given-names> </name></person-group><article-title>Performance comparison of junior residents and ChatGPT in the objective structured clinical examination (OSCE) for medical history taking and documentation of medical records: development and usability study</article-title><source>JMIR Med Educ</source><year>2024</year><month>11</month><day>21</day><volume>10</volume><fpage>e59902</fpage><pub-id pub-id-type="doi">10.2196/59902</pub-id><pub-id pub-id-type="medline">39622713</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="web"><source>Master interview rating scale (MIRS)</source><year>2003</year><access-date>2025-07-23</access-date><publisher-name>Eastern Virginia Medical School</publisher-name><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.dmu.edu/wp-content/uploads/Master-Interview-Rating-Scale.pdf">https://www.dmu.edu/wp-content/uploads/Master-Interview-Rating-Scale.pdf</ext-link></comment></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tavakol</surname><given-names>M</given-names> </name><name name-style="western"><surname>Dennick</surname><given-names>R</given-names> </name></person-group><article-title>Making sense of Cronbach&#x2019;s alpha</article-title><source>Int J Med Educ</source><year>2011</year><month>06</month><day>27</day><volume>2</volume><fpage>53</fpage><lpage>55</lpage><pub-id pub-id-type="doi">10.5116/ijme.4dfb.8dfd</pub-id><pub-id pub-id-type="medline">28029643</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Koo</surname><given-names>TK</given-names> </name><name name-style="western"><surname>Li</surname><given-names>MY</given-names> </name></person-group><article-title>A guideline of selecting and reporting intraclass correlation coefficients for reliability research</article-title><source>J Chiropr Med</source><year>2016</year><month>06</month><volume>15</volume><issue>2</issue><fpage>155</fpage><lpage>163</lpage><pub-id pub-id-type="doi">10.1016/j.jcm.2016.02.012</pub-id><pub-id pub-id-type="medline">27330520</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>LI</given-names> </name></person-group><article-title>A concordance correlation coefficient to evaluate reproducibility</article-title><source>Biometrics</source><year>1989</year><month>03</month><volume>45</volume><issue>1</issue><fpage>255</fpage><lpage>268</lpage><pub-id 
pub-id-type="doi">10.2307/2532051</pub-id><pub-id pub-id-type="medline">2720055</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bland</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Altman</surname><given-names>DG</given-names> </name></person-group><article-title>Statistical methods for assessing agreement between two methods of clinical measurement</article-title><source>Lancet</source><year>1986</year><month>02</month><day>8</day><volume>1</volume><issue>8476</issue><fpage>307</fpage><lpage>310</lpage><pub-id pub-id-type="medline">2868172</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Comparison of medical history documentation efficiency and quality based on GPT-4o: a study on the comparison between residents and artificial intelligence</article-title><source>Front Med (Lausanne)</source><year>2025</year><volume>12</volume><fpage>1545730</fpage><pub-id pub-id-type="doi">10.3389/fmed.2025.1545730</pub-id><pub-id pub-id-type="medline">40438356</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gilson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Safranek</surname><given-names>CW</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>T</given-names> </name><etal/></person-group><article-title>How does ChatGPT perform on the United States 
Medical Licensing Examination (USMLE)? The implications of large language models for medical education and knowledge assessment</article-title><source>JMIR Med Educ</source><year>2023</year><month>02</month><day>8</day><volume>9</volume><fpage>e45312</fpage><pub-id pub-id-type="doi">10.2196/45312</pub-id><pub-id pub-id-type="medline">36753318</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><article-title>Time for class 2025 report: daily AI use linked to reduced faculty workload</article-title><source>D2L Corporation</source><year>2025</year><access-date>2025-07-23</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.d2l.com/newsroom/tyton_partners_report_examines_ai_in_higher_education/">https://www.d2l.com/newsroom/tyton_partners_report_examines_ai_in_higher_education/</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Banerjee</surname><given-names>G</given-names> </name><name name-style="western"><surname>Mitchell</surname><given-names>JD</given-names> </name><name name-style="western"><surname>Brzezinski</surname><given-names>M</given-names> </name><name name-style="western"><surname>DePorre</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ballard</surname><given-names>HA</given-names> </name></person-group><article-title>Burnout in academic physicians</article-title><source>Perm J</source><year>2023</year><month>06</month><day>15</day><volume>27</volume><issue>2</issue><fpage>142</fpage><lpage>149</lpage><pub-id pub-id-type="doi">10.7812/TPP/23.032</pub-id><pub-id pub-id-type="medline">37309180</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><article-title>AAMC applauds introduction of bill to reduce physician shortage</article-title><source>Association of American Medical 
Colleges</source><year>2025</year><access-date>2025-07-29</access-date><publisher-name>AAMC</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.aamc.org/news/press-releases/aamc-applauds-introduction-bill-reduce-physician-shortage-0">https://www.aamc.org/news/press-releases/aamc-applauds-introduction-bill-reduce-physician-shortage-0</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yeates</surname><given-names>P</given-names> </name><name name-style="western"><surname>McCray</surname><given-names>G</given-names> </name></person-group><article-title>Investigating the accuracy of adjusting for examiner differences in multicentre objective structured clinical exams (OSCEs): a simulation study of video-based examiner score comparison and adjustment (VESCA)</article-title><source>BMC Med Educ</source><year>2024</year><month>12</month><day>18</day><volume>24</volume><issue>1</issue><fpage>1466</fpage><pub-id pub-id-type="doi">10.1186/s12909-024-06462-3</pub-id><pub-id pub-id-type="medline">39695612</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Prompt used for GPT-o1 Pro and GPT-5 Pro scoring of medical interview transcripts.</p><media xlink:href="mededu_v12i1e81673_app1.docx" xlink:title="DOCX File, 17 KB"/></supplementary-material></app-group></back></article>