<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id><journal-id journal-id-type="publisher-id">mededu</journal-id><journal-id journal-id-type="index">20</journal-id><journal-title>JMIR Medical Education</journal-title><abbrev-journal-title>JMIR Med Educ</abbrev-journal-title><issn pub-type="epub">2369-3762</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v12i1e83376</article-id><article-id pub-id-type="doi">10.2196/83376</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Evaluating the Pediatric Behavior Guidance of Students Based on Actual Clinical Transcripts Scored by Faculty and Large Language Models: Pilot Comparative Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Dhillon</surname><given-names>Ishreen Kaur</given-names></name><degrees>BDS, MDS</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lee</surname><given-names>Gabriel Keng Yan</given-names></name><degrees>BDS, MPH</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Hu</surname><given-names>Shijia</given-names></name><degrees>BDS, PhD</degrees><xref ref-type="aff" rid="aff1"/></contrib></contrib-group><aff id="aff1"><institution>Faculty of Dentistry, National University of 
Singapore</institution><addr-line>9 Lower Kent Ridge Road</addr-line><addr-line>Singapore</addr-line><country>Singapore</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Chartash</surname><given-names>David</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Meldrum</surname><given-names>Alison Margaret</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Pang</surname><given-names>MengWei</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Shijia Hu, BDS, PhD, Faculty of Dentistry, National University of Singapore, 9 Lower Kent Ridge Road, Singapore, 119085, Singapore, 65 67727757, 65 67785742; <email>denhus@nus.edu.sg</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>12</day><month>6</month><year>2026</year></pub-date><volume>12</volume><elocation-id>e83376</elocation-id><history><date date-type="received"><day>01</day><month>09</month><year>2025</year></date><date date-type="rev-recd"><day>06</day><month>05</month><year>2026</year></date><date date-type="accepted"><day>08</day><month>05</month><year>2026</year></date></history><copyright-statement>&#x00A9; Ishreen Kaur Dhillon, Gabriel Keng Yan Lee, Shijia Hu. Originally published in JMIR Medical Education (<ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org">https://mededu.jmir.org</ext-link>), 12.6.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org/">https://mededu.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mededu.jmir.org/2026/1/e83376"/><abstract><sec><title>Background</title><p>Personalized feedback improves the clinical pediatric behavior guidance performance of students but is prohibitively time-consuming to provide. Large language models (LLMs) can automate the process of evaluating clinical sessions but are limited by text-only input and consistency issues.</p></sec><sec><title>Objective</title><p>This study compared the use of text-only transcripts against the use of video recordings for evaluating the clinical behavior guidance performance of dental students. Additionally, the consistency and accuracy of LLMs in evaluating the transcripts were compared against those of a human assessor.</p></sec><sec sec-type="methods"><title>Methods</title><p>This study was conducted by using 40 video-recorded clinical encounters involving final-year dental students who were managing patients aged between 4 and 12 years at the Faculty of Dentistry, National University of Singapore. The videos were scored by using a previously validated pediatric behavior guidance scale. 
Clinical encounters were transcribed verbatim and scored by a study member using a modified version of the scale (nonverbal components removed). The time taken to rate the transcripts was recorded. Video scores were compared with transcript scores. Both the free-to-use version and the paid version of the ChatGPT LLM were also used to score the transcripts; consistency was evaluated and compared against the human assessor.</p></sec><sec sec-type="results"><title>Results</title><p>The average time taken to rate the transcripts (mean 12, range 3-25 min) was significantly (<italic>P</italic>&#x003C;.001) lower than the average video length (mean 73, range 37-120 min). Comparing transcript scores with video scores resulted in a consistency intraclass correlation coefficient of 0.830 (95% CI 0.679&#x2010;0.910; <italic>P</italic>&#x003C;.001), demonstrating good reliability. Comparing transcript scores with the free-to-use LLM&#x2019;s and paid LLM&#x2019;s scores yielded an absolute agreement intraclass correlation coefficient of 0.729 (95% CI 0.475&#x2010;0.859; <italic>P</italic>&#x003C;.001) and 0.670 (95% CI 0.377&#x2010;0.825; <italic>P</italic>&#x003C;.001), respectively, demonstrating moderate agreement. The LLMs were inconsistent, producing variable scores with the same prompt. The free-to-use and paid versions produced the same score for all 3 runs in only 7 (18%) and 4 (10%) of the 40 clinical encounters, respectively.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Using transcripts to evaluate students&#x2019; clinical behavior guidance was time-saving for faculty, demonstrated good agreement with video-based evaluation, and could improve clinical teaching. 
Although LLMs can automate the task, further development is needed to improve their consistency and accuracy.</p></sec></abstract><kwd-group><kwd>clinical mentoring</kwd><kwd>dental education</kwd><kwd>large language models</kwd><kwd>pediatric dentistry</kwd><kwd>dental students</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Managing pediatric dental patients is stressful for dental students and inexperienced practitioners, particularly when communicating with fearful and uncooperative children. Dental students experience 3 times the stress levels of seasoned specialists [<xref ref-type="bibr" rid="ref1">1</xref>]. Although lectures and seminars deliver theoretical knowledge, they are limited in terms of actual application [<xref ref-type="bibr" rid="ref2">2</xref>]. Experiential learning during clinical sessions is essential for applying knowledge received from didactic teaching. Central to this process is guidance from faculty (ie, clinical supervisor), whose feedback helps to transform knowledge into clinical proficiency [<xref ref-type="bibr" rid="ref3">3</xref>]. Real-time faculty feedback offers the greatest potential due to the immediate and individualized nature of the feedback provided [<xref ref-type="bibr" rid="ref4">4</xref>]. However, it is impossible to provide continuous instructional and coaching feedback for every student, as faculty often supervise multiple students during clinical sessions.</p><p>An alternative to real-time faculty feedback is the utilization of video-recorded clinical sessions, which have been shown to significantly improve dental students&#x2019; pediatric behavior guidance scores in clinical situations [<xref ref-type="bibr" rid="ref5">5</xref>]. Although video feedback enables faculty to evaluate the entire session for each student and provide feedback targeted at processes, it is prohibitively time-consuming. 
Each session can last up to 90 minutes, requiring full viewing before meaningful feedback can be developed and delivered.</p><p>The advancement of artificial intelligence (AI)&#x2013;powered large language models (LLMs) allows for the evaluation of clinical interactions and generation of feedback on the improvement of pediatric behavior guidance, which can help to improve the process of teaching such guidance. Recently, an LLM was used to simulate patient-physician interactions for medical students; the clinical decision-making performance of the students who received feedback from the LLM significantly improved in subsequent patient-physician interactions when compared to that of students who did not receive any feedback [<xref ref-type="bibr" rid="ref6">6</xref>]. In another study, an LLM (ie, a chatbot) provided feedback on medical students&#x2019; history-taking performance that was comparable to feedback from human assessors [<xref ref-type="bibr" rid="ref7">7</xref>]. However, these studies were based on simulated clinical scenarios, with none examining the use of LLMs in actual clinical situations. The use of LLMs to provide feedback on behavior guidance in pediatric dentistry can address the issue of the extensive time requirement for providing personalized feedback based on video-recorded sessions&#x2014;an area that has yet to be explored.</p><p>LLMs have inherent shortcomings that may limit their ability to evaluate clinical interactions. Currently, input is limited to only text, which does not consider other important facets of communication, including tone, physical gestures, and nonverbal cues. As such, an LLM&#x2019;s ability to evaluate clinical interactions may not be wholly representative [<xref ref-type="bibr" rid="ref8">8</xref>]. 
Furthermore, a recent study found that using a commercially available LLM to evaluate patient risk based on computer-simulated clinical data resulted in moderately correlated (<italic>r</italic>=0.605) risk scores and only 56% agreement on a diagnosis category, calling into question the consistency of LLMs in scoring clinical interactions [<xref ref-type="bibr" rid="ref9">9</xref>].</p><p>The primary aim of this study was to determine if using text-only input was sufficient for evaluating pediatric behavior guidance performance in a clinical setting when compared to using video-recorded clinical interactions. The secondary aim was to examine the consistency and accuracy of commercially available LLMs in evaluating pediatric behavior guidance when compared against a human assessor.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>This study was approved by the National University of Singapore Institutional Review Board (NUS-IRB-2024&#x2010;934). This study was reported according to the Chatbot Assessment Reporting Tool (CHART) guidelines [<xref ref-type="bibr" rid="ref10">10</xref>] (<xref ref-type="supplementary-material" rid="app3">Checklist 1</xref>). Informed consent was obtained from participants. The participants were not provided with any monetary compensation. All data were deidentified before data analysis was done.</p></sec><sec id="s2-2"><title>Study Design</title><p>This study was conducted by using a set of 50 video-recorded clinical encounters involving final-year dental students who were managing pediatric patients aged between 4 and 12 years at the Faculty of Dentistry, National University of Singapore. 
The video-recorded clinical encounters were previously evaluated by a pediatric dentistry faculty member using a validated pediatric behavior guidance scale from an earlier study [<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>The video-recorded encounters were screened, and those not conducted primarily in English were excluded from this study (8 encounters). An accredited commercial company manually transcribed the videos and annotated the speakers to produce transcripts of the encounters for evaluation. Videos with audio that was too poor in quality for accurate transcription were excluded (2 encounters). A total of 40 encounters were successfully transcribed for this study. As this was a pilot study on using clinical transcripts to score behavior guidance, no sample size calculation was conducted.</p><p>A single study member (IKD) rated the transcripts by using a modified version of the previously validated scale [<xref ref-type="bibr" rid="ref5">5</xref>], and the time taken by the human assessor to rate each encounter was recorded. The same study member randomly rerated 20% (8/40) of the transcripts, and the intrarater reliability bias&#x2013;adjusted &#x03BA; score was calculated to be 0.94. The pediatric behavior guidance scale was modified to remove nonverbal components, which are not captured with a text-only approach, giving the transcript (text-only) scale a maximum score of 15 (<xref ref-type="fig" rid="figure1">Figure 1</xref>), whereas the original scale had a maximum score of 20 (with higher scores denoting better clinical performance of behavior guidance techniques) in the original study (<xref ref-type="fig" rid="figure2">Figure 2</xref>) [<xref ref-type="bibr" rid="ref5">5</xref>]. For comparison, the video scores from the original study were similarly modified to remove nonverbal components, resulting in a maximum possible score of 15. 
The transcript scores were compared to the video scores and the modified video scores without nonverbal components. This helped to determine if text-only information could be used to score clinical pediatric behavior guidance performance as reliably as clinical videos.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Modified rating scale for the rating of pediatric behavior guidance performance via the transcript (text-only) approach [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>].</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v12i1e83376_fig01.png"/></fig><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Original rating scale with nonverbal components [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>].</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v12i1e83376_fig02.png"/></fig><p>The free-to-use (GPT-4o) and paid (o4-mini-high) versions of ChatGPT (OpenAI) were guided with prompts (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) to rate behavior guidance performance by using the study rating scale and background information on pediatric behavior guidance (ie, from <italic>The Reference Manual of Pediatric Dentistry</italic> by the American Academy of Pediatric Dentistry) [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. A clinical encounter transcript was uploaded to a new chat with the same prompt, and the score was recorded. This was repeated 3 times for each clinical encounter. The scores were marked as consistent if they matched in all 3 runs; if the scores were not consistent, the score with the higher frequency (matched in 2 out of 3 runs) or the middle score was used for the final analysis. 
Variability was reported as the average difference and the range of the difference between the highest and lowest scores for each encounter. The free-to-use LLM&#x2019;s scores and the paid LLM&#x2019;s scores were compared to the transcript scores to determine if LLMs could perform as well as humans when scoring behavior guidance performance.</p><p>Statistical analyses were conducted by using SPSS 29.0 statistical software (IBM Corporation). Descriptive statistics were used to report the consistency of LLM scoring, variability of LLM scores, length of videos, and time taken to rate transcripts. Comparisons were conducted by using the chi-square test and Wilcoxon signed rank test. Intraclass correlation (consistency) analysis was conducted to compare video scores against modified video scores without nonverbal components and transcript scores to determine the reliability between the scales. Intraclass correlation (absolute agreement) and interclass correlation (Pearson correlation coefficient with bias adjustment) analyses were conducted between transcript scores, modified video scores without nonverbal components, and LLM (free-to-use and paid versions) scores to determine the agreement of the scales.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>A total of 40 clinical encounters were included in the final analysis. The average video length was 73 (range 37-120) minutes, and the average time taken to rate the transcripts was 12 (range 3-25) minutes. 
The time taken to rate the transcripts was significantly (<italic>P</italic>&#x003C;.001) lower than the time taken to rate the videos.</p><p>The video scores were compared to the video scores without nonverbal components, resulting in a consistency intraclass correlation coefficient (ICC) of 0.978 (95% CI 0.959&#x2010;0.988; <italic>P</italic>&#x003C;.001), which demonstrates excellent reliability.</p><p>The average video score (maximum=20) was 11.64 (SD 3.57), the average video score without nonverbal components (maximum=15) was 8.93 (SD 2.90), and the average transcript score (maximum=15) was 8.93 (SD 3.24). Comparing transcript scores to video scores resulted in a consistency ICC of 0.830 (95% CI 0.679&#x2010;0.910; <italic>P</italic>&#x003C;.001), demonstrating good reliability, and comparing transcript scores to video scores without nonverbal components resulted in an absolute agreement ICC of 0.834 (95% CI 0.684&#x2010;0.912; <italic>P</italic>&#x003C;.001), demonstrating good agreement (<xref ref-type="table" rid="table1">Table 1</xref>). 
The interclass correlation analysis showed similar results (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Agreement of video scores, video scores without nonverbal components, and large language model (LLM) scores when compared against transcript scores.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Average score (SD)</td><td align="left" valign="bottom">Compared with transcript scores, ICC<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> (95% CI; <italic>P</italic> value)</td><td align="left" valign="bottom">Level of reliability/agreement<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Transcript scores (maximum=15)</td><td align="left" valign="top">8.93 (3.24)</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Video scores (maximum=20)</td><td align="left" valign="top">11.64 (3.57)</td><td align="left" valign="top">Consistency: 0.830 (0.679&#x2010;0.910; &#x003C;.001)</td><td align="left" valign="top">Good</td></tr><tr><td align="left" valign="top">Video scores without nonverbal components (maximum=15)</td><td align="left" valign="top">8.93 (2.90)</td><td align="left" valign="top">Absolute agreement: 0.834 (0.684&#x2010;0.912; &#x003C;.001)</td><td align="left" valign="top">Good</td></tr><tr><td align="left" valign="top">LLM (free-to-use) scores (maximum=15)</td><td align="left" valign="top">10.13 (3.32)</td><td align="left" valign="top">Absolute agreement: 0.729 (0.475&#x2010;0.859; &#x003C;.001)</td><td align="left" valign="top">Moderate</td></tr><tr><td align="left" valign="top">LLM (paid) scores (maximum=15)</td><td align="left" 
valign="top">9.33 (3.14)</td><td align="left" valign="top">Absolute agreement: 0.670 (0.377&#x2010;0.825; &#x003C;.001)</td><td align="left" valign="top">Moderate</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>ICC: intraclass correlation coefficient.</p></fn><fn id="table1fn2"><p><sup>b</sup>ICC ranges: above 0.90=excellent; between 0.75 and 0.90=good; between 0.50 and 0.75=moderate; below 0.50=poor [<xref ref-type="bibr" rid="ref13">13</xref>].</p></fn><fn id="table1fn3"><p><sup>c</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap><p>The free-to-use LLM&#x2019;s average score (maximum=15) was 10.13 (SD 3.32), and the paid LLM&#x2019;s average score (maximum=15) was 9.33 (SD 3.14). Comparing transcript scores to the free-to-use LLM&#x2019;s scores resulted in an absolute agreement ICC of 0.729 (95% CI 0.475&#x2010;0.859; <italic>P</italic>&#x003C;.001), demonstrating moderate agreement, and comparing transcript scores to the paid LLM&#x2019;s scores resulted in an absolute agreement ICC of 0.670 (95% CI 0.377&#x2010;0.825; <italic>P</italic>&#x003C;.001), also demonstrating moderate agreement.</p><p>Both the free-to-use and paid versions of ChatGPT had low consistency, producing the same score in all 3 runs for only 7 out of 40 (18%) and 4 out of 40 (10%) clinical encounters, respectively. 
Additionally, the free-to-use version of ChatGPT had a significantly (<italic>P</italic>=.004) smaller average range of scores (mean 1.73, SD 1.18; range 0-4) when compared to that of the paid version of ChatGPT (mean 2.85, SD 1.70; range 0-7) (<xref ref-type="table" rid="table2">Table 2</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Consistency and variability of the free-to-use and paid versions of ChatGPT.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Free-to-use ChatGPT (GPT-4o)</td><td align="left" valign="bottom">Paid ChatGPT (o4-mini-high)</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">Consistency (among 3 runs with the same prompt for each clinical encounter [N=40])<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>, n (%)</td><td align="left" valign="top">7 (18)</td><td align="left" valign="top">4 (10)</td><td align="left" valign="top">.33</td></tr><tr><td align="left" valign="top">Variability (difference between the highest and lowest scores for each encounter)<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup>, mean (SD; range)</td><td align="left" valign="top">1.73 (1.18; 0-4)</td><td align="left" valign="top">2.85 (1.70; 0-7)</td><td align="left" valign="top">.004<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Chi-square test.</p></fn><fn id="table2fn2"><p><sup>b</sup>Wilcoxon signed rank test.</p></fn><fn id="table2fn3"><p><sup>c</sup><italic>P</italic>&#x003C;.005.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>This study found that using only transcripts to evaluate the clinical pediatric behavior guidance performance of dental students demonstrated good agreement when compared to using video evaluation. 
Moreover, transcripts required significantly less time to evaluate when compared to faculty evaluating full videos. Although the commercially available LLMs demonstrated moderate agreement with the faculty rater, they showed a lack of consistency, generating different scores under the same input parameters.</p><p>These findings have implications for resource optimization, as LLMs allow faculty to provide personalized feedback to more students, and the time saved allows for more clinical encounters per student. Adopting a theoretical framework of providing formative feedback continuously during training will have the greatest impact on student learning outcomes, which, in our case, was the improvement of clinical pediatric behavior guidance [<xref ref-type="bibr" rid="ref14">14</xref>]. The use of LLMs can automate this task, thereby reducing the time burden on faculty. However, similar to a previous study, the commercially available LLMs in our study showed a lack of consistency, generating different scores under the same input parameters [<xref ref-type="bibr" rid="ref9">9</xref>]. As the range of the LLM scores was relatively narrow and showed moderate agreement with the human assessor, future studies should evaluate if this lack of consistency would substantially alter the feedback provided or if the feedback could still be educationally useful. ChatGPT has been evaluated for other automated scoring applications, such as essay marking, with studies showing moderate to excellent correlation between ChatGPT and human assessors when using real-life datasets [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. 
However, these studies also found wide task-dependent [<xref ref-type="bibr" rid="ref15">15</xref>] and assessor-dependent [<xref ref-type="bibr" rid="ref16">16</xref>] variation, and they cautioned that good rubric and prompt design, as well as further LLM development, are needed before large-scale adoption.</p><p>A strength of this study was the use of real-life clinical encounters, as opposed to previous studies&#x2019; use of simulated clinical encounters [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. Notably, the greater language variation present in the dataset could explain the LLMs&#x2019; large variation in scores and poor consistency. Further, while secondary refinement of a prompt after the first run improves the consistency of subsequent runs [<xref ref-type="bibr" rid="ref18">18</xref>], this effect was lost when starting a new chat. This means that prompt engineering for every encounter or the use of an extremely complex and detailed prompt would be necessary, making LLMs impractical for large-scale translation into practice. Additionally, this study examined verbatim clinical encounter transcripts that contained informal language, such as slang and colloquialisms. Previous research showed that current LLMs have issues with processing informal language [<xref ref-type="bibr" rid="ref19">19</xref>] and require substantial fine-tuning to achieve better performance [<xref ref-type="bibr" rid="ref20">20</xref>]. This limitation in the LLMs used in this study possibly contributed to the large variation and poor consistency in scoring.</p><p>A limitation of this study was the use of only closed-source LLMs for the evaluation of pediatric behavior guidance. Both the free-to-use version and the paid version of ChatGPT resulted in similarly low consistency, with the free-to-use version demonstrating lower variability. 
It was difficult to speculate why the free-to-use version outperformed the paid version, as both were closed-source LLMs, but this warrants further investigation in future studies. Although closed-source LLMs provide several advantages, such as higher performance, easy implementation, and general accessibility due to significantly higher financial and technical backing, open-source LLMs are adaptable and cost-effective, thus allowing them to be customized and fine-tuned to specific domains, such as patient communication [<xref ref-type="bibr" rid="ref21">21</xref>]. The transparency and inclusivity of open-source frameworks, such as Large Language Model Meta AI (LLaMA), could better address the specific requirements of dealing with the large variation and informal language present in real-life clinical communication datasets [<xref ref-type="bibr" rid="ref22">22</xref>]. Future studies should compare the consistency and variability of closed-source and open-source LLMs.</p><p>In conclusion, using transcripts to evaluate students&#x2019; clinical behavior guidance was time-saving for the faculty and demonstrated good agreement with using videos for evaluation. Although LLMs can further automate the task, more work is needed to improve their consistency and accuracy. 
These findings can be used to improve the provision of targeted feedback to students for clinical teaching.</p></sec></body><back><ack><p>The authors would like to thank Associate Professor Catherine Hong for her work on the video scoring.</p></ack><notes><sec><title>Funding</title><p>This study was supported by the Faculty of Dentistry R Fund from the National University of Singapore, and article processing fee support was provided by the Faculty of Dentistry, National University of Singapore.</p></sec><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are available from the corresponding author (SH) upon reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>IKD is considered the first author, and SH is the senior author. IKD, GKYL, and SH conceived the idea and were responsible for funding acquisition. IKD and SH participated in data collection. SH conducted the data analysis. IKD, GKYL, and SH led the interpretation and writing. 
All authors revised the manuscript for important intellectual content.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>ABBREVIATIONS</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">CHART</term><def><p>Chatbot Assessment Reporting Tool</p></def></def-item><def-item><term id="abb3">ICC</term><def><p>intraclass correlation coefficient</p></def></def-item><def-item><term id="abb4">LLaMA</term><def><p>Large Language Model Meta AI</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Davidovich</surname><given-names>E</given-names> </name><name name-style="western"><surname>Pessov</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Baniel</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ram</surname><given-names>D</given-names> </name></person-group><article-title>Levels of stress among general practitioners, students and specialists in pediatric dentistry during dental treatment</article-title><source>J Clin Pediatr Dent</source><year>2015</year><volume>39</volume><issue>5</issue><fpage>419</fpage><lpage>422</lpage><pub-id pub-id-type="doi">10.17796/1053-4628-39.5.419</pub-id><pub-id pub-id-type="medline">26551363</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>York</surname><given-names>KM</given-names> </name><name name-style="western"><surname>Mlinac</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Deibler</surname><given-names>MW</given-names> </name><name 
name-style="western"><surname>Creed</surname><given-names>TA</given-names> </name><name name-style="western"><surname>Ganem</surname><given-names>I</given-names> </name></person-group><article-title>Pediatric behavior management techniques: a survey of predoctoral dental students</article-title><source>J Dent Educ</source><year>2007</year><month>04</month><volume>71</volume><issue>4</issue><fpage>532</fpage><lpage>539</lpage><pub-id pub-id-type="medline">17468315</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bimstein</surname><given-names>E</given-names> </name><name name-style="western"><surname>Azari</surname><given-names>AF</given-names> </name><name name-style="western"><surname>Sotto</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Riley</surname><given-names>JL 3rd</given-names> </name></person-group><article-title>Students&#x2019; perceptions about pediatric dental behavior guidance in an undergraduate four-year dental curriculum</article-title><source>J Dent Educ</source><year>2009</year><month>12</month><volume>73</volume><issue>12</issue><fpage>1366</fpage><lpage>1371</lpage><pub-id pub-id-type="medline">20007491</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wisniewski</surname><given-names>B</given-names> </name><name name-style="western"><surname>Zierer</surname><given-names>K</given-names> </name><name name-style="western"><surname>Hattie</surname><given-names>J</given-names> </name></person-group><article-title>The power of feedback revisited: a meta-analysis of educational feedback research</article-title><source>Front Psychol</source><year>2020</year><month>01</month><volume>10</volume><fpage>3087</fpage><pub-id 
pub-id-type="doi">10.3389/fpsyg.2019.03087</pub-id><pub-id pub-id-type="medline">32038429</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mok</surname><given-names>BYY</given-names> </name><name name-style="western"><surname>Tok</surname><given-names>WW</given-names> </name><name name-style="western"><surname>Wong</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Hong</surname><given-names>CHL</given-names> </name></person-group><article-title>Teaching pediatric behavior management in student dentists with constructive video feedback from faculty</article-title><source>J Dent Educ</source><year>2021</year><month>08</month><day>12</day><pub-id pub-id-type="doi">10.1002/jdd.12756</pub-id><pub-id pub-id-type="medline">34383296</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Br&#x00FC;gge</surname><given-names>E</given-names> </name><name name-style="western"><surname>Ricchizzi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Arenbeck</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Large language models improve clinical decision making of medical students through patient simulation and structured feedback: a randomized controlled trial</article-title><source>BMC Med Educ</source><year>2024</year><month>11</month><day>28</day><volume>24</volume><issue>1</issue><fpage>1391</fpage><pub-id pub-id-type="doi">10.1186/s12909-024-06399-7</pub-id><pub-id pub-id-type="medline">39609823</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Holderried</surname><given-names>F</given-names> </name><name name-style="western"><surname>Stegemann-Philipps</surname><given-names>C</given-names> </name><name name-style="western"><surname>Herrmann-Werner</surname><given-names>A</given-names> </name><etal/></person-group><article-title>A language model-powered simulated patient with automated feedback for history taking: prospective study</article-title><source>JMIR Med Educ</source><year>2024</year><month>08</month><day>16</day><volume>10</volume><fpage>e59213</fpage><pub-id pub-id-type="doi">10.2196/59213</pub-id><pub-id pub-id-type="medline">39150749</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abd-Alrazaq</surname><given-names>A</given-names> </name><name name-style="western"><surname>AlSaad</surname><given-names>R</given-names> </name><name name-style="western"><surname>Alhuwail</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Large language models in medical education: opportunities, challenges, and future directions</article-title><source>JMIR Med Educ</source><year>2023</year><month>06</month><day>1</day><volume>9</volume><fpage>e48291</fpage><pub-id pub-id-type="doi">10.2196/48291</pub-id><pub-id pub-id-type="medline">37261894</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Heston</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Lewis</surname><given-names>LM</given-names> </name></person-group><article-title>ChatGPT provides inconsistent risk-stratification of patients with atraumatic chest pain</article-title><source>PLoS One</source><year>2024</year><month>04</month><day>16</day><volume>19</volume><issue>4</issue><fpage>e0301854</fpage><pub-id 
pub-id-type="doi">10.1371/journal.pone.0301854</pub-id><pub-id pub-id-type="medline">38626142</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><collab>CHART Collaborative</collab><name name-style="western"><surname>Huo</surname><given-names>B</given-names> </name><name name-style="western"><surname>Collins</surname><given-names>GS</given-names> </name><etal/></person-group><article-title>Reporting guideline for chatbot health advice studies: the CHART statement</article-title><source>JAMA Netw Open</source><year>2025</year><month>08</month><day>1</day><volume>8</volume><issue>8</issue><fpage>e2530220</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2025.30220</pub-id><pub-id pub-id-type="medline">40747871</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="book"><person-group person-group-type="author"><collab>American Academy of Pediatric Dentistry</collab></person-group><article-title>Behavior guidance for the pediatric dental patient</article-title><source>The Reference Manual of Pediatric Dentistry</source><year>2025</year><access-date>2026-06-08</access-date><publisher-name>American Academy of Pediatric Dentistry</publisher-name><fpage>379</fpage><lpage>399</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.aapd.org/globalassets/media/policies_guidelines/bp_behavguide.pdf">https://www.aapd.org/globalassets/media/policies_guidelines/bp_behavguide.pdf</ext-link></comment></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Cameron</surname><given-names>E</given-names> </name><name name-style="western"><surname>Forbes</surname><given-names>G</given-names> </name><name 
name-style="western"><surname>Humphris</surname><given-names>G</given-names> </name></person-group><article-title>Systematic review of the effect of dental staff behaviour on child dental patient anxiety and behaviour</article-title><source>Patient Educ Couns</source><year>2011</year><month>10</month><volume>85</volume><issue>1</issue><fpage>4</fpage><lpage>13</lpage><pub-id pub-id-type="doi">10.1016/j.pec.2010.08.002</pub-id><pub-id pub-id-type="medline">20807676</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Koo</surname><given-names>TK</given-names> </name><name name-style="western"><surname>Li</surname><given-names>MY</given-names> </name></person-group><article-title>A guideline of selecting and reporting intraclass correlation coefficients for reliability research</article-title><source>J Chiropr Med</source><year>2016</year><month>06</month><volume>15</volume><issue>2</issue><fpage>155</fpage><lpage>163</lpage><pub-id pub-id-type="doi">10.1016/j.jcm.2016.02.012</pub-id><pub-id pub-id-type="medline">27330520</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hattie</surname><given-names>J</given-names> </name><name name-style="western"><surname>Timperley</surname><given-names>H</given-names> </name></person-group><article-title>The power of feedback</article-title><source>Rev Educ Res</source><year>2007</year><month>03</month><volume>77</volume><issue>1</issue><fpage>81</fpage><lpage>112</lpage><pub-id pub-id-type="doi">10.3102/003465430298487</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Quah</surname><given-names>B</given-names> </name><name 
name-style="western"><surname>Zheng</surname><given-names>L</given-names> </name><name name-style="western"><surname>Sng</surname><given-names>TJH</given-names> </name><name name-style="western"><surname>Yong</surname><given-names>CW</given-names> </name><name name-style="western"><surname>Islam</surname><given-names>I</given-names> </name></person-group><article-title>Reliability of ChatGPT in automated essay scoring for dental undergraduate examinations</article-title><source>BMC Med Educ</source><year>2024</year><month>09</month><day>3</day><volume>24</volume><issue>1</issue><fpage>962</fpage><pub-id pub-id-type="doi">10.1186/s12909-024-05881-6</pub-id><pub-id pub-id-type="medline">39227811</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Manning</surname><given-names>J</given-names> </name><name name-style="western"><surname>Baldwin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Powell</surname><given-names>N</given-names> </name></person-group><article-title>Human versus machine: the effectiveness of ChatGPT in automated essay scoring</article-title><source>Innovations in Education and Teaching International</source><year>2025</year><volume>62</volume><issue>5</issue><fpage>1500</fpage><lpage>1513</lpage><pub-id pub-id-type="doi">10.1080/14703297.2025.2469089</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Webb</surname><given-names>JJ</given-names> </name></person-group><article-title>Proof of concept: using ChatGPT to teach emergency physicians how to break bad news</article-title><source>Cureus</source><year>2023</year><month>05</month><day>9</day><volume>15</volume><issue>5</issue><fpage>e38755</fpage><pub-id pub-id-type="doi">10.7759/cureus.38755</pub-id><pub-id 
pub-id-type="medline">37303324</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Prompt engineering in consistency and reliability with the evidence-based guideline for LLMs</article-title><source>NPJ Digit Med</source><year>2024</year><month>02</month><day>20</day><volume>7</volume><issue>1</issue><fpage>41</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01029-4</pub-id><pub-id pub-id-type="medline">38378899</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wuraola</surname><given-names>I</given-names> </name><name name-style="western"><surname>Dethlefs</surname><given-names>N</given-names> </name><name name-style="western"><surname>Marciniak</surname><given-names>D</given-names> </name></person-group><article-title>Understanding slang with LLMs: modelling cross-cultural nuances through paraphrasing</article-title><conf-name>2024 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Nov 12-16, 2024</conf-date><conf-loc>Miami, Florida</conf-loc><fpage>15525</fpage><lpage>15531</lpage><pub-id pub-id-type="doi">10.18653/v1/2024.emnlp-main.869</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Gupta</surname><given-names>R</given-names> 
</name><name name-style="western"><surname>Zemel</surname><given-names>R</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>Y</given-names> </name></person-group><article-title>Toward informal language processing: knowledge of slang in large language models</article-title><source>arXiv</source><comment>Preprint posted online on Apr 2, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2404.02323</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ding</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Bu</surname><given-names>Y</given-names> </name></person-group><article-title>Position: open and closed large language models in healthcare</article-title><source>arXiv</source><comment>Preprint posted online on Jan 17, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2501.09906</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Manchanda</surname><given-names>J</given-names> </name><name name-style="western"><surname>Boettcher</surname><given-names>L</given-names> </name><name name-style="western"><surname>Westphalen</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jasser</surname><given-names>J</given-names> </name></person-group><article-title>The open source advantage in large language models (LLMs)</article-title><source>arXiv</source><comment>Preprint posted online on Dec 16, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2412.12004</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Prompts used to guide ChatGPT for the rating of pediatric behavior 
guidance performance.</p><media xlink:href="mededu_v12i1e83376_app1.png" xlink:title="PNG File, 534 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Interclass (Pearson coefficient) correlation of transcript scores against video scores and video scores without nonverbal components.</p><media xlink:href="mededu_v12i1e83376_app2.pdf" xlink:title="PDF File, 35 KB"/></supplementary-material><supplementary-material id="app3"><label>Checklist 1</label><p>CHART checklist.</p><media xlink:href="mededu_v12i1e83376_app3.pdf" xlink:title="PDF File, 251 KB"/></supplementary-material></app-group></back></article>