<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id><journal-id journal-id-type="publisher-id">mededu</journal-id><journal-id journal-id-type="index">20</journal-id><journal-title>JMIR Medical Education</journal-title><abbrev-journal-title>JMIR Med Educ</abbrev-journal-title><issn pub-type="epub">2369-3762</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v11i1e68697</article-id><article-id pub-id-type="doi">10.2196/68697</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>AI&#x2019;s Accuracy in Extracting Learning Experiences From Clinical Practice Logs: Observational Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Kondo</surname><given-names>Takeshi</given-names></name><degrees>MD, MHPE, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Nishigori</surname><given-names>Hiroshi</given-names></name><degrees>MD, MMEd, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Center for Medical Education, Nagoya University Graduate School of Medicine</institution><addr-line>65, Tsurumai-cho, Showa-ku</addr-line><addr-line>Nagoya city, Aichi</addr-line><country>Japan</country></aff><aff id="aff2"><institution>The School of Health 
Professions Education, Maastricht University</institution><addr-line>Maastricht</addr-line><country>The Netherlands</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Lesselroth</surname><given-names>Blake</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Shojaei</surname><given-names>Fereshtehossadat</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Mavrych</surname><given-names>Volodymyr</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Asada</surname><given-names>Yoshikazu</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Takeshi Kondo, MD, MHPE, PhD, Center for Medical Education, Nagoya University Graduate School of Medicine, 65, Tsurumai-cho, Showa-ku, Nagoya city, Aichi, 466-8560, Japan, +81 052 7412111; <email>ncukondo@gmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>15</day><month>10</month><year>2025</year></pub-date><volume>11</volume><elocation-id>e68697</elocation-id><history><date date-type="received"><day>17</day><month>11</month><year>2024</year></date><date date-type="rev-recd"><day>29</day><month>07</month><year>2025</year></date><date date-type="accepted"><day>22</day><month>09</month><year>2025</year></date></history><copyright-statement>&#x00A9; Takeshi Kondo, Hiroshi Nishigori. Originally published in JMIR Medical Education (<ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org">https://mededu.jmir.org</ext-link>), 15.10.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org/">https://mededu.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mededu.jmir.org/2025/1/e68697"/><abstract><sec><title>Background</title><p>Improving the quality of education in clinical settings requires an understanding of learners&#x2019; experiences and learning processes. However, this is a significant burden on learners and educators. If learners&#x2019; learning records could be automatically analyzed and their experiences could be visualized, this would enable real-time tracking of their progress. Large language models (LLMs) may be useful for this purpose, although their accuracy has not been sufficiently studied.</p></sec><sec><title>Objective</title><p>This study aimed to explore the accuracy of predicting the actual clinical experiences of medical students from their learning log data during clinical clerkship using LLMs.</p></sec><sec sec-type="methods"><title>Methods</title><p>This study was conducted at the Nagoya University School of Medicine. Learning log data from medical students participating in a clinical clerkship from April 22, 2024, to May 24, 2024, were used. 
The Model Core Curriculum for Medical Education was used as a template to extract experiences. OpenAI&#x2019;s ChatGPT was selected for this task after a comparison with other LLMs. Prompts were created using the learning log data and provided to ChatGPT to extract experiences, which were then listed. A web application using GPT-4-turbo was developed to automate this process. The accuracy of the extracted experiences was evaluated by comparing them with the corrected lists provided by the students.</p></sec><sec sec-type="results"><title>Results</title><p>A total of 20 sixth-year medical students participated in this study, resulting in 40 datasets. The overall Jaccard index was 0.59 (95% CI 0.46-0.71), and the Cohen &#x03BA; was 0.65 (95% CI 0.53-0.76). Overall sensitivity was 62.39% (95% CI 49.96%-74.81%), and specificity was 99.34% (95% CI 98.77%-99.92%). Category-specific performance varied: symptoms showed a sensitivity of 45.43% (95% CI 25.12%-65.75%) and specificity of 98.75% (95% CI 97.31%-100%), examinations showed a sensitivity of 46.76% (95% CI 25.67%-67.86%) and specificity of 98.84% (95% CI 97.81%-99.87%), and procedures achieved a sensitivity of 56.36% (95% CI 37.64%-75.08%) and specificity of 98.92% (95% CI 96.67%-100%). The results suggest that GPT-4-turbo accurately identified many of the actual experiences but missed some because of insufficient detail or a lack of student records.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This study demonstrated that LLMs such as GPT-4-turbo can predict clinical experiences from learning logs with high specificity but moderate sensitivity. Future improvements in AI models, providing feedback on medical students&#x2019; learning logs and combining them with other data sources such as electronic medical records, may enhance the accuracy. 
Using artificial intelligence to analyze learning logs for assessment could reduce the burden on learners and educators while improving the quality of educational assessments in medical education.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>ChatGPT</kwd><kwd>workplace-based assessment</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>To improve the quality of education in clinical settings, it is important to understand what learners experience and how they learn [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Various workplace-based assessment tools have been developed and used to enable educators to track learners&#x2019; progress and provide feedback [<xref ref-type="bibr" rid="ref3">3</xref>]. However, the rigorous management of learners&#x2019; progress requires frequent observation of learners, frequent evaluations, and feedback from educators. This can impose a high burden on both learners and educators, potentially hindering learning [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. Thus, the challenge is accurately monitoring learning in clinical settings without burdening learners or educators.</p><p>Learners in clinical settings often document their learning and practice experiences. If these records can be analyzed to understand learners&#x2019; contexts, monitoring their learning without imposing additional burdens may be possible. One such record kept by learners during clinical clerkship is a logbook. The logbook documents the cases encountered, procedures performed, and learners&#x2019; reflections. It serves as a tool for prompting student reflections and facilitating feedback and dialogue between educators and learners [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. 
Evaluating these records against curriculum competencies and goals without adding an extra burden on learners can help monitor their progress [<xref ref-type="bibr" rid="ref9">9</xref>]. However, educators may have to manually match and analyze these records, which may be a significant burden [<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>Artificial intelligence (AI)&#x2013;assisted text extraction and standard matching could be useful in this context. Previous studies have successfully used natural language processing, a branch of AI, to analyze supervisory feedback comments and predict student performance against competency standards [<xref ref-type="bibr" rid="ref10">10</xref>]. AI models that integrate multiple information sources to represent student performance have also been developed [<xref ref-type="bibr" rid="ref11">11</xref>]. Among AI technologies, large language models (LLMs) have gained attention in medical education because of their extensive pretraining on large datasets, allowing them to handle various situations, including multilingual support, with minimal adjustment [<xref ref-type="bibr" rid="ref12">12</xref>]. Research using ChatGPT, an LLM, has shown that it can apply codes to interview texts using a codebook, suggesting its potential for extracting competency-based evaluations from student descriptions [<xref ref-type="bibr" rid="ref13">13</xref>]. However, owing to a lack of such research, aggregation accuracy remains uncertain. Determining the extent to which LLMs can aggregate items related to curriculum goals from learner descriptions may open up opportunities to leverage LLMs to monitor learner progress and enhance education quality.</p><p>In Japanese undergraduate education, the Model Core Curriculum for Medical Education (MCC) [<xref ref-type="bibr" rid="ref14">14</xref>] was established to define two-thirds of the undergraduate curriculum and is used as a guideline for undergraduate medical education. 
The MCC outlines the experiences that medical students should have by the time they graduate, focusing primarily on clinical clerkships [<xref ref-type="bibr" rid="ref14">14</xref>]. In Japanese clinical clerkships, medical students are partially observed directly by supervisors [<xref ref-type="bibr" rid="ref15">15</xref>], but it is difficult for busy supervisors to grasp the full scope of experiences that medical students encounter [<xref ref-type="bibr" rid="ref14">14</xref>]. If experiences could be understood through analysis of learning logs kept by medical students, valuable information for improving the learning environment could be obtained.</p></sec><sec id="s1-2"><title>Objectives</title><p>Therefore, this study focused on undergraduate clinical clerkships in Japan to investigate the accuracy with which LLMs can aggregate goals from records kept for learning. Our research question was as follows: how accurately can an LLM predict experiences related to the goals defined by the MCC from the records that students keep for learning during clinical clerkships?</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Context</title><p>This study was conducted as part of the participatory clinical clerkship at the Nagoya University School of Medicine, a program designed to provide medical students with practical experience in clinical settings. During the final year of medical school (sixth year), students participate in this program for 4 weeks, recording their daily experiences and learning activities. A trial to transform these records into an electronic portfolio began in 2024. This study was part of this trial.</p></sec><sec id="s2-2"><title>Dataset</title><p>This study used learning log data from sixth-year medical students to extract their experiences related to core curriculum goals. 
Learning log data consisted of daily records of experiences and learning activities entered by medical students into an electronic portfolio during a clinical clerkship from April 22, 2024, to May 24, 2024. The data were treated as weekly datasets.</p></sec><sec id="s2-3"><title>Extraction of Experiences</title><p>The template for extracting experiences from the dataset was the MCC [<xref ref-type="bibr" rid="ref14">14</xref>]. This study used a table of symptoms, examinations, and procedures that medical students are expected to encounter in patients during their clinical clerkship at Nagoya University School of Medicine as the template for experience extraction (<xref ref-type="other" rid="box1">Textbox 1</xref>).</p><boxed-text id="box1"><title>Symptoms, examinations, and procedures that medical students are expected to encounter in patients.</title><p><bold>Symptoms</bold></p><list list-type="bullet"> <list-item><p>Fever</p></list-item> <list-item><p>General malaise</p></list-item> <list-item><p>Anorexia</p></list-item> <list-item><p>Weight loss</p></list-item> <list-item><p>Weight gain</p></list-item> <list-item><p>Altered mental status</p></list-item> <list-item><p>Syncope</p></list-item><list-item><p>Seizure</p></list-item> <list-item><p>Vertigo and dizziness</p></list-item> <list-item><p>Edema</p></list-item> <list-item><p>Rash</p></list-item> <list-item><p>Cough and sputum production</p></list-item> <list-item><p>Blood in sputum and hemoptysis</p></list-item> <list-item><p>Dyspnea</p></list-item> <list-item><p>Chest pain</p></list-item> <list-item><p>Palpitations</p></list-item> <list-item><p>Dysphagia</p></list-item> <list-item><p>Abdominal pain</p></list-item> <list-item><p>Nausea and vomiting</p></list-item> <list-item><p>Hematemesis</p></list-item> <list-item><p>Melena</p></list-item> <list-item><p>Constipation</p></list-item> <list-item><p>Diarrhea</p></list-item> <list-item><p>Jaundice</p></list-item> <list-item><p>Abdominal distention and abdominal 
mass</p></list-item> <list-item><p>Lymphadenopathy</p></list-item> <list-item><p>Abnormal urine output or urination</p></list-item> <list-item><p>Hematuria</p></list-item> <list-item><p>Menstrual abnormality</p></list-item> <list-item><p>Anxiety or depression</p></list-item> <list-item><p>Cognitive dysfunction</p></list-item> <list-item><p>Headache</p></list-item> <list-item><p>Skeletal muscle paralysis or muscle weakness</p></list-item> <list-item><p>Gait disturbance</p></list-item> <list-item><p>Sensory disturbance</p></list-item> <list-item><p>Back pain</p></list-item> <list-item><p>Arthralgia or joint swelling</p></list-item> </list><p><bold>Examinations</bold></p><list list-type="bullet"><list-item><p>Full blood count</p></list-item><list-item><p>Blood biochemistry</p></list-item><list-item><p>Coagulation or fibrinolysis</p></list-item><list-item><p>Immunoserology tests</p></list-item><list-item><p>Urinalysis</p></list-item><list-item><p>Stool (fecal) examination</p></list-item><list-item><p>Blood typing (ABO and RhD), blood compatibility test (cross-matching), and atypical antibody screening</p></list-item><list-item><p>Arterial blood gas analysis</p></list-item><list-item><p>Pregnancy test</p></list-item><list-item><p>Microbiological tests (bacterial smear, culture, identification, and antibiotic sensitivity test)</p></list-item><list-item><p>Cerebrospinal fluid</p></list-item><list-item><p>Pleural fluid analysis</p></list-item><list-item><p>Peritoneal fluid analysis</p></list-item><list-item><p>Histopathology and cytology (including intraoperative rapid diagnosis)</p></list-item><list-item><p>Genetic testing and chromosome analysis</p></list-item><list-item><p>Electrocardiography (ECG)</p></list-item><list-item><p>Lung function tests</p></list-item><list-item><p>Endocrine and metabolic function 
tests</p></list-item><list-item><p>Electroencephalography</p></list-item><list-item><p>Ultrasound</p></list-item><list-item><p>X-ray</p></list-item><list-item><p>Computed tomography</p></list-item><list-item><p>Magnetic resonance imaging</p></list-item><list-item><p>Nuclear medicine examination</p></list-item><list-item><p>Endoscopy</p></list-item></list><p><bold>Procedures</bold></p><list list-type="bullet"><list-item><p>Position change and transfer</p></list-item><list-item><p>Skin antisepsis</p></list-item><list-item><p>Application of topical medications</p></list-item><list-item><p>Airway suction</p></list-item><list-item><p>Nebulizer</p></list-item><list-item><p>Venous blood sampling</p></list-item><list-item><p>Peripheral venous catheterization</p></list-item><list-item><p>Insertion and extraction of nasogastric tube</p></list-item><list-item><p>Insertion and extraction of urinary catheter</p></list-item><list-item><p>Intradermal injection</p></list-item><list-item><p>Subcutaneous injection</p></list-item><list-item><p>Intramuscular injection</p></list-item><list-item><p>Intravenous injection</p></list-item><list-item><p>Urinalysis (including pregnancy test)</p></list-item><list-item><p>Microbiological testing (including Gram staining)</p></list-item><list-item><p>Recording of a 12-lead ECG</p></list-item><list-item><p>Rapid bedside ultrasound (including focused assessment with sonography for trauma [FAST]) for clinical decision-making</p></list-item><list-item><p>Rapid antigen or pathogen testing</p></list-item><list-item><p>Blood glucose test</p></list-item><list-item><p>Aseptic technique</p></list-item><list-item><p>Surgical handwashing</p></list-item><list-item><p>Gowning techniques in the operating room</p></list-item><list-item><p>Basic sutures and suture removal</p></list-item></list></boxed-text><p>OpenAI&#x2019;s ChatGPT, Google&#x2019;s Gemini, and Anthropic&#x2019;s Claude were considered for the LLMs used in experience extraction. 
Trial prompts and randomly selected student records were entered into each web platform, and the extracted results were compared in terms of validity. Validity was evaluated from the perspective of whether the output followed the expected format, whether the output matched the experience items expected from the text, and whether the output was reproducible. ChatGPT by OpenAI produced the most valid outputs, so it was selected for this study.</p><p>LLMs, including ChatGPT, receive text data as input and generate subsequent text based on these data. Therefore, the prompt given to the LLM is crucial. In this study, prompts were created using medical students&#x2019; learning log data, which were provided to ChatGPT to extract their experiences from the logs. Experiences were extracted based on a table of symptoms, examinations, and procedures that students were expected to experience, with ChatGPT outputting a list of symptoms, examinations, and procedures inferred from the text data. To automate this process, a web application using GPT-4-turbo was developed, which allowed medical students to input learning log data and receive the extracted experiences as a list output from GPT-4-turbo (gpt-4-0125-preview). The prompt used for GPT-4-turbo and the web application code are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-4"><title>Evaluation of Extracted Experiences</title><p>The extracted experience goals were presented to the medical students via email. Students were asked to compare the list with their actual experiences, including those not recorded in their reflections, and submit a corrected list. 
The corrected lists were compared with the lists extracted from the original learning log data to evaluate the accuracy of the extracted experiences.</p></sec><sec id="s2-5"><title>Data Analysis</title><p>The accuracy of the extracted experience goals was evaluated using the R software (version 4.1.2; R Foundation for Statistical Computing). The agreement rate between the extracted and corrected experience goals was calculated, and the accuracy of the extracted experience goals was assessed based on this agreement rate.</p></sec><sec id="s2-6"><title>Ethical Considerations</title><p>This study was approved by the ethics committee of Nagoya University Graduate School of Medicine (approval 2023-0451 31742). All participants were informed about the study&#x2019;s purpose, methods, risks, and benefits and were allowed to opt out. All data were fully anonymized and handled to prevent the identification of individuals. No compensation was provided to participants in this study.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Study Period, Participants, and Data Characteristics</title><p>During the clinical participation-based clerkship at Nagoya University Hospital from April 22, 2024, to May 24, 2024, a total of 61% (20/33) of the sixth-year students who made entries in the e-portfolio participated in the study, yielding 40 data points. All records were written in Japanese, with an average letter count of 446.2 (SD 353.52; range 72-1473). 
The predicted and actual experiences are shown in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Predicted and actual experience items.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Record index</td><td align="left" valign="bottom">Predicted item</td><td align="left" valign="bottom">Actual item</td><td align="left" valign="bottom">Number of matches</td><td align="left" valign="bottom">Number of experienced items extracted by GPT-4-turbo</td><td align="left" valign="bottom">Number of items that the students marked as experiences they had</td></tr></thead><tbody><tr><td align="left" valign="top">1</td><td align="left" valign="top">Skeletal muscle paralysis or muscle weakness, gait disturbance, and sensory disturbance</td><td align="left" valign="top">Skeletal muscle paralysis or muscle weakness, gait disturbance, and sensory disturbance</td><td align="char" char="." valign="top">3</td><td align="char" char="." valign="top">3</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top">2</td><td align="left" valign="top">Endocrine and metabolic function tests</td><td align="left" valign="top">Endocrine and metabolic function tests</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">3</td><td align="left" valign="top">Fever</td><td align="left" valign="top">Fever</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">4</td><td align="left" valign="top">Basic sutures and suture removal</td><td align="left" valign="top">Basic sutures and suture removal</td><td align="char" char="." valign="top">1</td><td align="char" char="." 
valign="top">1</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">5</td><td align="left" valign="top">Seizure, electroencephalography, and MRI<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top">Aseptic technique, electroencephalography, MRI, weight gain, and seizure</td><td align="char" char="." valign="top">3</td><td align="char" char="." valign="top">3</td><td align="left" valign="top">5</td></tr><tr><td align="left" valign="top">6</td><td align="left" valign="top">Skeletal muscle paralysis or muscle weakness, gait disturbance, and sensory disturbance</td><td align="left" valign="top">Skeletal muscle paralysis or muscle weakness, gait disturbance, and sensory disturbance</td><td align="char" char="." valign="top">3</td><td align="char" char="." valign="top">3</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top">7</td><td align="left" valign="top">Anorexia, abdominal distention and abdominal mass, and ultrasound</td><td align="left" valign="top">Palpitations and skeletal muscle paralysis or muscle weakness</td><td align="char" char="." valign="top">0</td><td align="char" char="." valign="top">3</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">8</td><td align="left" valign="top">Venous blood sampling</td><td align="left" valign="top">Venous blood sampling</td><td align="char" char="." valign="top">1</td><td align="char" char="." 
valign="top">1</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">9</td><td align="left" valign="top">Rapid bedside ultrasound (including FAST<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup>) for clinical decision-making and ultrasound</td><td align="left" valign="top">Skin antisepsis, rapid bedside ultrasound (including FAST) for clinical decision-making, aseptic technique, surgical handwashing, gowning techniques in the operating room, basic sutures and suture removal, ultrasound, fever, and diarrhea</td><td align="char" char="." valign="top">2</td><td align="char" char="." valign="top">2</td><td align="left" valign="top">9</td></tr><tr><td align="left" valign="top">10</td><td align="left" valign="top">Basic sutures and suture removal</td><td align="left" valign="top">Basic sutures and suture removal</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">11</td><td align="left" valign="top">Basic sutures and suture removal</td><td align="left" valign="top">Surgical handwashing, gowning techniques in the operating room, basic sutures and suture removal, full blood count, blood biochemistry, coagulation or fibrinolysis, histopathology and cytology (including intraoperative rapid diagnosis), x-ray, CT<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup>, MRI, general malaise, and weight loss</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">12</td></tr><tr><td align="left" valign="top">12</td><td align="left" valign="top">Venous blood sampling and pregnancy test</td><td align="left" valign="top">Venous blood sampling</td><td align="char" char="." valign="top">1</td><td align="char" char="." 
valign="top">2</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">13</td><td align="left" valign="top">Surgical handwashing, gowning techniques in the operating room, and basic sutures and suture removal</td><td align="left" valign="top">Surgical handwashing, gowning techniques in the operating room, and basic sutures and suture removal</td><td align="char" char="." valign="top">3</td><td align="char" char="." valign="top">3</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top">14</td><td align="left" valign="top">Pregnancy test and basic sutures and suture removal</td><td align="left" valign="top">Position change and transfer, insertion and extraction of a urinary catheter, surgical handwashing, gowning techniques in the operating room, basic sutures and suture removal, histopathology and cytology (including intraoperative rapid diagnosis), MRI, and abdominal distention and abdominal mass</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">2</td><td align="left" valign="top">8</td></tr><tr><td align="left" valign="top">15</td><td align="left" valign="top">Surgical handwashing</td><td align="left" valign="top">Surgical handwashing, gowning techniques in the operating room, basic sutures and suture removal, full blood count, blood biochemistry, histopathology and cytology (including intraoperative rapid diagnosis), ultrasound, and x-ray</td><td align="char" char="." valign="top">1</td><td align="char" char="." 
valign="top">1</td><td align="left" valign="top">8</td></tr><tr><td align="left" valign="top">16</td><td align="left" valign="top">Microbiological tests (bacterial smear, culture, identification, and antibiotic sensitivity test), nuclear medicine examination, general malaise, cough and sputum production, dyspnea, abdominal pain, nausea and vomiting, and abnormal urine output or urination</td><td align="left" valign="top">General malaise and edema</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">8</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">17</td><td align="left" valign="top">Surgical handwashing</td><td align="left" valign="top">Aseptic technique, surgical handwashing, gowning techniques in the operating room, basic sutures and suture removal, and cough and sputum production</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">5</td></tr><tr><td align="left" valign="top">18</td><td align="left" valign="top">Fever, urinalysis, microbiological tests (bacterial smear, culture, identification, and antibiotic sensitivity test), nausea and vomiting, and hematuria</td><td align="left" valign="top">Full blood count, blood biochemistry, immunoserology tests, urinalysis, microbiological tests (bacterial smear, culture, identification, and antibiotic sensitivity test), edema, palpitations, hematuria, and back pain</td><td align="char" char="." valign="top">3</td><td align="char" char="." valign="top">5</td><td align="left" valign="top">9</td></tr><tr><td align="left" valign="top">19</td><td align="left" valign="top">Blood glucose test and endocrine and metabolic function tests</td><td align="left" valign="top">Blood glucose test and endocrine and metabolic function tests</td><td align="char" char="." valign="top">2</td><td align="char" char="." 
valign="top">2</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">20</td><td align="left" valign="top">Cognitive dysfunction</td><td align="left" valign="top">Cognitive dysfunction</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">21</td><td align="left" valign="top">Chest pain</td><td align="left" valign="top">Chest pain</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">22</td><td align="left" valign="top">Surgical handwashing and basic sutures and suture removal</td><td align="left" valign="top">Aseptic technique, surgical handwashing, gowning techniques in the operating room, and basic sutures and suture removal</td><td align="char" char="." valign="top">2</td><td align="char" char="." valign="top">2</td><td align="left" valign="top">4</td></tr><tr><td align="left" valign="top">23</td><td align="left" valign="top">Cognitive dysfunction, abnormal urine output or urination, and urinalysis</td><td align="left" valign="top">Cognitive dysfunction and abnormal urine output or urination</td><td align="char" char="." valign="top">2</td><td align="char" char="." valign="top">3</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">24</td><td align="left" valign="top">Dyspnea</td><td align="left" valign="top">Dyspnea</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">25</td><td align="left" valign="top">Gowning techniques in the operating room</td><td align="left" valign="top">Position change and transfer, skin antisepsis, aseptic technique, surgical handwashing, gowning techniques in the operating room, and basic sutures and suture removal</td><td align="char" char="." 
valign="top">1</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">6</td></tr><tr><td align="left" valign="top">26</td><td align="left" valign="top">Full blood count and blood biochemistry</td><td align="left" valign="top">Full blood count, blood biochemistry, immunoserology tests, and edema</td><td align="char" char="." valign="top">2</td><td align="char" char="." valign="top">2</td><td align="left" valign="top">4</td></tr><tr><td align="left" valign="top">27</td><td align="left" valign="top">CT, MRI, and x-ray</td><td align="left" valign="top">Position change and transfer, full blood count, arterial blood gas analysis, ultrasound, x-ray, CT, MRI, skeletal muscle paralysis or muscle weakness, gait disturbance, and back pain</td><td align="char" char="." valign="top">3</td><td align="char" char="." valign="top">3</td><td align="left" valign="top">10</td></tr><tr><td align="left" valign="top">28</td><td align="left" valign="top">Endocrine and metabolic function tests</td><td align="left" valign="top">Endocrine and metabolic function tests</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">29</td><td align="left" valign="top">Basic sutures and suture removal</td><td align="left" valign="top">Position change and transfer, surgical handwashing, gowning techniques in the operating room, and basic sutures and suture removal</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">4</td></tr><tr><td align="left" valign="top">30</td><td align="left" valign="top">Weight loss and skeletal muscle paralysis or muscle weakness</td><td align="left" valign="top">Blood glucose test, weight loss, and skeletal muscle paralysis or muscle weakness</td><td align="char" char="." valign="top">2</td><td align="char" char="." 
valign="top">2</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top">31</td><td align="left" valign="top">Ultrasound and endoscopy</td><td align="left" valign="top">Ultrasound and endoscopy</td><td align="char" char="." valign="top">2</td><td align="char" char="." valign="top">2</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">32</td><td align="left" valign="top">Basic sutures and suture removal</td><td align="left" valign="top">Skin antisepsis, aseptic technique, surgical handwashing, gowning techniques in the operating room, basic sutures and suture removal, full blood count, blood biochemistry, coagulation or fibrinolysis, immunoserology tests, histopathology and cytology (including intraoperative rapid diagnosis), ultrasound, x-ray, and headache</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">13</td></tr><tr><td align="left" valign="top">33</td><td align="left" valign="top">Skin antisepsis and position change and transfer</td><td align="left" valign="top">Skin antisepsis and position change and transfer</td><td align="char" char="." valign="top">2</td><td align="char" char="." valign="top">2</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">34</td><td align="left" valign="top">Back pain</td><td align="left" valign="top">Weight loss, cognitive dysfunction, skeletal muscle paralysis or muscle weakness, sensory disturbance, and back pain</td><td align="char" char="." valign="top">1</td><td align="char" char="." 
valign="top">1</td><td align="left" valign="top">5</td></tr><tr><td align="left" valign="top">35</td><td align="left" valign="top">Arterial blood gas analysis, peripheral venous catheterization, insertion and extraction of a nasogastric tube, insertion and extraction of a urinary catheter, aseptic technique, surgical handwashing, gowning techniques in the operating room, and basic sutures and suture removal</td><td align="left" valign="top">Peripheral venous catheterization, aseptic technique, full blood count, blood biochemistry, coagulation or fibrinolysis, arterial blood gas analysis, pleural fluid analysis, ultrasound, x-ray, CT, and endoscopy</td><td align="char" char="." valign="top">3</td><td align="char" char="." valign="top">8</td><td align="left" valign="top">11</td></tr><tr><td align="left" valign="top">36</td><td align="left" valign="top">Weight gain, endocrine and metabolic function tests, and blood glucose test</td><td align="left" valign="top">Blood glucose test, full blood count, blood biochemistry, urinalysis, stool (fecal) examination, endocrine and metabolic function tests, ultrasound, CT, and weight gain</td><td align="char" char="." valign="top">3</td><td align="char" char="." valign="top">3</td><td align="left" valign="top">9</td></tr><tr><td align="left" valign="top">37</td><td align="left" valign="top">Endoscopy</td><td align="left" valign="top">Endoscopy</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">38</td><td align="left" valign="top">X-ray</td><td align="left" valign="top">X-ray and cough and sputum production</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">39</td><td align="left" valign="top">Abdominal pain</td><td align="left" valign="top">ID not found</td><td align="char" char="." 
valign="top">0</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">40</td><td align="left" valign="top">Skin antisepsis</td><td align="left" valign="top">Skin antisepsis</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">1</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>MRI: magnetic resonance imaging.</p></fn><fn id="table1fn2"><p><sup>b</sup>FAST: focused assessment with sonography for trauma.</p></fn><fn id="table1fn3"><p><sup>c</sup>CT: computed tomography.</p></fn></table-wrap-foot></table-wrap><p>The predicted items were experience items extracted using GPT-4-turbo from the students&#x2019; practice records. The actual items were those that the students marked as experiences they had during that period. The English-translated version of the students&#x2019; records used by GPT-4-turbo to extract experiences can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, with the &#x201C;Index&#x201D; column in <xref ref-type="table" rid="table1">Table 1</xref> corresponding to the &#x201C;Index&#x201D; column in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s3-2"><title>Agreement Between LLM Predictions and Student-Reported Experiences</title><p>The Jaccard index was 0.59 (95% CI 0.46-0.71), indicating moderate agreement, and the Cohen &#x03BA; was 0.65 (95% CI 0.53-0.76), indicating substantial agreement. Sensitivity and specificity were 62.39% (95% CI 49.96%-74.81%) and 99.34% (95% CI 98.77%-99.92%), respectively. 
The sensitivity and specificity of the LLM for each category were as follows: 45.43% (95% CI 25.12%-65.75%) and 98.75% (95% CI 97.31%-100%) for symptoms, 46.76% (95% CI 25.67%-67.86%) and 98.84% (95% CI 97.81%-99.87%) for examinations, and 56.36% (95% CI 37.64%-75.08%) and 98.92% (95% CI 96.67%-100%) for procedures, respectively. There was no significant variation among the categories. However, when calculated by category, the sensitivity tended to be lower than in the overall calculation, likely due to the influence of items that were not extracted at all. The correlations of the number of characters in the students&#x2019; records with sensitivity and specificity were 0.04 and &#x2013;0.64, respectively, indicating a negligible correlation with sensitivity and a moderate negative correlation with specificity. The correlation coefficients for the Jaccard index and the Cohen &#x03BA; were 0.06 and &#x2013;0.07, respectively, showing negligible correlations with record length.</p></sec><sec id="s3-3"><title>Patterns of Missed Experiences</title><p>There were several patterns in experiences that were not captured by GPT-4-turbo&#x2019;s analysis even though students considered themselves to have had those experiences. In this paragraph, we explain these patterns with examples corresponding to specific entries in <xref ref-type="table" rid="table1">Table 1</xref>. Due to the large volume of student records, the full texts are provided in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> rather than <xref ref-type="table" rid="table1">Table 1</xref>. One pattern was when predictable experiences were not picked up by GPT-4-turbo&#x2019;s analysis. 
For example, a student (index 19 in <xref ref-type="table" rid="table1">Table 1</xref>) described encountering a case of hereditary amyotrophic lateral sclerosis, but GPT-4-turbo&#x2019;s analysis failed to capture the student&#x2019;s experience with muscle weakness, a symptom of amyotrophic lateral sclerosis. Another pattern was when insufficient description made prediction difficult. In total, 20% (8/40) of the students (indexes 9, 11, 15, 17, 22, 25, 29, and 32 in <xref ref-type="table" rid="table1">Table 1</xref>) recorded observing surgery, but it was unclear from the description whether they assisted in the surgery or merely observed, making it difficult for GPT-4-turbo to extract related procedures such as surgical handwashing and gowning techniques. A third pattern was when experiences were not recorded by the students, making prediction impossible. For instance, a student recorded observing a surgery (index 15 in <xref ref-type="table" rid="table1">Table 1</xref>) but actually performed suturing, an experience not captured by GPT-4-turbo due to lack of record. Similarly, a student (index 30 in <xref ref-type="table" rid="table1">Table 1</xref>) noted examining a patient with diabetes but did not record performing computed tomography or ultrasound examinations.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>In this study, we analyzed the records kept by medical students during their clinical clerkship for learning purposes using GPT-4-turbo to predict the clinical procedures they experienced. The experiences extracted by GPT-4-turbo were evaluated for accuracy after being revised by the medical students. The extraction of experiences by GPT-4-turbo showed a sufficient level of agreement with the items that students actually experienced and demonstrated high specificity. 
The high specificity suggests that the extracted experiences likely mirror what the students actually encountered. However, the low sensitivity indicates that some experiences that students actually had were not captured by GPT-4-turbo&#x2019;s analysis of the records. There were three main reasons why certain experiences could not be extracted: (1) experiences that could have been predicted by GPT-4-turbo&#x2019;s analysis were not identified; (2) the descriptions were insufficient, making prediction difficult; and (3) there were experiences that students did not record at all.</p></sec><sec id="s4-2"><title>Implications of Findings</title><p>The results of this study suggest that LLMs such as GPT-4-turbo are able to extract experiences from learning records with sufficient accuracy. On the other hand, when the content of the learning records is insufficient or when students do not record their experiences, experience extraction becomes difficult, indicating that improving the accuracy of LLMs alone may not be sufficient.</p></sec><sec id="s4-3"><title>Comparison to the Literature</title><p>Comparison with previous studies suggests that LLMs are making it easier and more accurate to extract experiences from learning records. Unlike previous studies [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>], which required extensive pretraining on large text datasets, this study was able to extract experiences from learning records using only prompt engineering without additional training. A related study using LLMs investigated how well GPT-3 could extract predefined codes from documents and compared its results to those of human coders [<xref ref-type="bibr" rid="ref13">13</xref>]. In that study, providing 5 examples for each code resulted in a Cohen &#x03BA; of 0.61 for some codes, although the Cohen &#x03BA; for most codes was lower. 
In contrast, our study used GPT-4-turbo to extract experiences from learning records without providing specific examples, achieving a Cohen &#x03BA; of 0.65. Although direct comparison is difficult due to differences in study targets, GPT-4-turbo may have achieved higher extraction accuracy. These findings indicate that, with the advent and evolution of LLMs, extracting experiences from learning records is becoming easier and potentially more accurate.</p><p>In addition, this study demonstrates that performance monitoring is possible by analyzing narrative records primarily intended for student learning using LLMs rather than aggregating list-based records mainly used for evaluation, as in some previous studies. Previous studies have explored the use of logbooks to monitor learners&#x2019; learning progress. Attempts have been made to monitor skills and experiences using logbooks [<xref ref-type="bibr" rid="ref16">16</xref>], track the progress of entrustable professional activities [<xref ref-type="bibr" rid="ref8">8</xref>], and count the cases encountered [<xref ref-type="bibr" rid="ref7">7</xref>]. However, the &#x201C;logbooks&#x201D; used in these studies were lists of cases experienced or evaluations rather than detailed descriptions of experiences [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. This format is more useful for evaluation purposes than for recording learning, which ultimately adds to the burden on learners. Our study suggests that experiences can also be extracted by analyzing reflections recorded purely for learning purposes, offering a technique that monitors learning situations while reducing the burden on learners and educators.</p></sec><sec id="s4-4"><title>Future Directions</title><p>While this study demonstrated the usefulness of experience extraction by LLMs, it also highlighted new challenges. GPT-4-turbo failed to extract some experiences that students actually had. 
The first pattern involved experience items that could not be extracted despite being predictable from the learning log content. The second pattern involved cases in which descriptions were ambiguous, making inference difficult. The third pattern involved experiences that medical students believed they had but that were not recorded in learning logs, making inference impossible. Regarding the first pattern, insufficient reasoning ability of GPT-4-turbo is considered the cause. However, the reasoning ability of LLMs is improving with model evolution [<xref ref-type="bibr" rid="ref17">17</xref>], and future improvements in LLM accuracy may partially address this issue. Regarding the second and third patterns, insufficient content in medical students&#x2019; learning logs appears to be the cause, resulting in inadequate information to infer students&#x2019; experiences. To address these challenges, it may be necessary to enrich students&#x2019; learning records or extract experiences from other sources.</p><p>To enhance the quality of medical students&#x2019; learning records, providing feedback using the list of experiences extracted by LLMs may be beneficial. Previous studies have shown that logbooks are useful for performance monitoring and improving educational quality, but they have also pointed out that the quality of the records is often insufficient and that feedback is needed [<xref ref-type="bibr" rid="ref18">18</xref>]. In this study, medical students reviewed the list of experiences extracted from their learning records by GPT-4-turbo and added items that they had actually experienced but that were not extracted. Since missing or incomplete records can be a reason for experiences not being extracted, this review process may serve as feedback for students, helping them reflect on what they failed to document in their records. 
As shown in the development version of the web application in <xref ref-type="fig" rid="figure1">Figure 1</xref>, displaying experience items extracted from learning logs might motivate students to improve their learning log documentation.</p><p>Combining other data such as electronic health records written by the students might be effective for more accurate monitoring of medical students&#x2019; performance. Feeding both learning logs and electronic health record descriptions into GPT-4-turbo could enhance the accuracy of experience extraction. Such an approach could lead to more accurate assessment of medical students without increasing the burden on students or faculty. However, since many LLMs, including GPT-4-turbo, are cloud-based, privacy concerns may arise [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. Therefore, new approaches will need to be developed to address these privacy issues in the future.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Development version of the web application.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v11i1e68697_fig01.png"/></fig></sec><sec id="s4-5"><title>Limitations</title><p>This study has several limitations. First, this study used learning log data from clinical participation-based clerkships at a single university; therefore, its generalizability to learning log data from other universities or clinical clerkships is not guaranteed. In addition, the data collection period was limited to 1 month, which may not capture the full range of experiences or seasonal variations in clinical activities. 
While the accuracy of the extracted experience content was evaluated by using learning log data recorded by medical students and asking them to make corrections, the quality and quantity of the learning log data recorded by the students could affect the accuracy of the extracted experience content. Large-scale collaborative studies across multiple institutions and over longer periods are needed to ensure broader generalizability. Furthermore, this study used a list of symptoms, examinations, and procedures in the MCC as a template for extracting experience content; however, the results of using other templates were not examined. Future research is needed to assess performance using other evaluation criteria. Although we confirmed the correlation between record length and extraction sensitivity and specificity, we did not quantitatively evaluate the quality of the records. Future work should investigate the relationship between record quality and extraction performance. In this study, the accuracy of the extracted experience content was evaluated by using learning log data recorded by medical students and asking them to make corrections, but no strict criteria were set for what constitutes &#x201C;experience&#x201D; when students made corrections. Moreover, students&#x2019; judgments about whether they had actually experienced a procedure are subjective, and they may have overreported certain experiences or overlooked ones they truly had. In the clinical clerkship that served as this study&#x2019;s setting, supervising physicians did not continuously monitor students, so only the students themselves could verify their experiences. Therefore, we had to rely on students&#x2019; subjective reports. 
In future work, it will be desirable to establish more objective evaluation criteria to reduce potential bias.</p></sec><sec id="s4-6"><title>Conclusions</title><p>In this study, records kept by medical students for learning during clinical clerkships were analyzed using GPT-4-turbo to predict experienced clinical activities. The high specificity of the GPT-4-turbo predictions suggests that the extracted experiences are likely what students actually encountered. However, the low sensitivity indicates that some actual student experiences were not captured by the GPT-4-turbo analysis. Future improvements in AI model performance, feedback to medical students on their records, and the combination of learning logs with other data sources such as electronic medical records may enhance accuracy. Analyzing records using AI may enable detailed assessments while avoiding excessive burdens on learners and educators.</p></sec></sec></body><back><ack><p>ChatGPT (OpenAI) was used in part to create an initial English translation of the Japanese version of this manuscript. This work was supported by Japan Society for the Promotion of Science Grants-in-Aid for Scientific Research 23K27816 and 25K06542.</p></ack><fn-group><fn fn-type="con"><p>TK was responsible for study planning, data collection and analysis, and manuscript writing. 
HN collaborated with TK on study planning and provided supervision and advice on data analysis and manuscript writing.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb3">MCC</term><def><p>Model Core Curriculum for Medical Education</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>AlHaqwi</surname><given-names>AI</given-names> </name><name name-style="western"><surname>Taha</surname><given-names>WS</given-names> </name></person-group><article-title>Promoting excellence in teaching and learning in clinical education</article-title><source>J Taibah Univ Med Sci</source><year>2015</year><month>03</month><volume>10</volume><issue>1</issue><fpage>97</fpage><lpage>101</lpage><pub-id pub-id-type="doi">10.1016/j.jtumed.2015.02.005</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vanka</surname><given-names>A</given-names> </name><name name-style="western"><surname>Hovaguimian</surname><given-names>A</given-names> </name></person-group><article-title>Teaching strategies for the clinical environment</article-title><source>Clin Teach</source><year>2019</year><month>12</month><volume>16</volume><issue>6</issue><fpage>570</fpage><lpage>574</lpage><pub-id pub-id-type="doi">10.1111/tct.12928</pub-id><pub-id pub-id-type="medline">30178546</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Liu</surname><given-names>C</given-names> </name></person-group><article-title>An introduction to workplace-based assessments</article-title><source>Gastroenterol Hepatol Bed Bench</source><year>2012</year><volume>5</volume><issue>1</issue><fpage>24</fpage><lpage>28</lpage><pub-id pub-id-type="medline">24834194</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ott</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Pack</surname><given-names>R</given-names> </name><name name-style="western"><surname>Cristancho</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chin</surname><given-names>M</given-names> </name><name name-style="western"><surname>Van Koughnett</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Ott</surname><given-names>M</given-names> </name></person-group><article-title>&#x201C;The Most Crushing Thing&#x201D;: understanding resident assessment burden in a competency-based curriculum</article-title><source>J Grad Med Educ</source><year>2022</year><month>10</month><volume>14</volume><issue>5</issue><fpage>583</fpage><lpage>592</lpage><pub-id pub-id-type="doi">10.4300/JGME-D-22-00050.1</pub-id><pub-id pub-id-type="medline">36274774</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Szulewski</surname><given-names>A</given-names> </name><name name-style="western"><surname>Braund</surname><given-names>H</given-names> </name><name name-style="western"><surname>Dagnone</surname><given-names>DJ</given-names> </name><etal/></person-group><article-title>The assessment burden in competency-based medical education: how programs are adapting</article-title><source>Acad 
Med</source><year>2023</year><month>11</month><day>1</day><volume>98</volume><issue>11</issue><fpage>1261</fpage><lpage>1267</lpage><pub-id pub-id-type="doi">10.1097/ACM.0000000000005305</pub-id><pub-id pub-id-type="medline">37343164</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alotaibi</surname><given-names>HM</given-names> </name><name name-style="western"><surname>Alharithy</surname><given-names>R</given-names> </name><name name-style="western"><surname>Alotaibi</surname><given-names>HM</given-names> </name></person-group><article-title>Importance of the reflective logbook in improving the residents&#x2019; perception of reflective learning in the dermatology residency program in Saudi Arabia: findings from a cross-sectional study</article-title><source>BMC Med Educ</source><year>2022</year><month>12</month><day>13</day><volume>22</volume><issue>1</issue><fpage>862</fpage><pub-id pub-id-type="doi">10.1186/s12909-022-03948-w</pub-id><pub-id pub-id-type="medline">36514091</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alabbad</surname><given-names>J</given-names> </name><name name-style="western"><surname>Abdul Raheem</surname><given-names>F</given-names> </name><name name-style="western"><surname>Almusaileem</surname><given-names>A</given-names> </name><name name-style="western"><surname>Almusaileem</surname><given-names>S</given-names> </name><name name-style="western"><surname>Alsaddah</surname><given-names>S</given-names> </name><name name-style="western"><surname>Almubarak</surname><given-names>A</given-names> </name></person-group><article-title>Medical students&#x2019; logbook case loads do not predict final exam scores in surgery clerkship</article-title><source>Adv Med Educ 
Pract</source><year>2018</year><volume>9</volume><fpage>259</fpage><lpage>265</lpage><pub-id pub-id-type="doi">10.2147/AMEP.S160514</pub-id><pub-id pub-id-type="medline">29713211</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Berberat</surname><given-names>PO</given-names> </name><name name-style="western"><surname>Rotthoff</surname><given-names>T</given-names> </name><name name-style="western"><surname>Baerwald</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Entrustable professional activities in final year undergraduate medical training - advancement of the final year training logbook in Germany</article-title><source>GMS J Med Educ</source><year>2019</year><volume>36</volume><issue>6</issue><fpage>Doc70</fpage><pub-id pub-id-type="doi">10.3205/zma001278</pub-id><pub-id pub-id-type="medline">31844642</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>AbdulAzeem Abdullah Omer</surname><given-names>A</given-names> </name></person-group><article-title>Using logbooks to enhance students&#x2019; learning: lessons from a mixed-methods study in an undergraduate surgical rotation</article-title><source>Sudan J Med Sci</source><year>2021</year><volume>16</volume><issue>3</issue><fpage>409</fpage><lpage>429</lpage><pub-id pub-id-type="doi">10.18502/sjms.v16i3.9701</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gin</surname><given-names>BC</given-names> </name><name name-style="western"><surname>Ten Cate</surname><given-names>O</given-names> </name><name name-style="western"><surname>O&#x2019;Sullivan</surname><given-names>PS</given-names> </name><name 
name-style="western"><surname>Hauer</surname><given-names>KE</given-names> </name><name name-style="western"><surname>Boscardin</surname><given-names>C</given-names> </name></person-group><article-title>Exploring how feedback reflects entrustment decisions using artificial intelligence</article-title><source>Med Educ</source><year>2022</year><month>03</month><volume>56</volume><issue>3</issue><fpage>303</fpage><lpage>311</lpage><pub-id pub-id-type="doi">10.1111/medu.14696</pub-id><pub-id pub-id-type="medline">34773415</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mill&#x00E1;n</surname><given-names>E</given-names> </name><name name-style="western"><surname>Loboda</surname><given-names>T</given-names> </name><name name-style="western"><surname>P&#x00E9;rez-de-la-Cruz</surname><given-names>JL</given-names> </name></person-group><article-title>Bayesian networks for student model engineering</article-title><source>Comput Educ</source><year>2010</year><month>12</month><volume>55</volume><issue>4</issue><fpage>1663</fpage><lpage>1683</lpage><pub-id pub-id-type="doi">10.1016/j.compedu.2010.07.010</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sallam</surname><given-names>M</given-names> </name></person-group><article-title>ChatGPT utility in healthcare education, research, and practice: systematic review on the promising perspectives and valid concerns</article-title><source>Healthcare (Basel)</source><year>2023</year><month>03</month><day>19</day><volume>11</volume><issue>6</issue><fpage>887</fpage><pub-id pub-id-type="doi">10.3390/healthcare11060887</pub-id><pub-id pub-id-type="medline">36981544</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="confproc"><person-group 
person-group-type="author"><name name-style="western"><surname>Xiao</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yuan</surname><given-names>X</given-names> </name><name name-style="western"><surname>Liao</surname><given-names>QV</given-names> </name><name name-style="western"><surname>Abdelghani</surname><given-names>R</given-names> </name><name name-style="western"><surname>Oudeyer</surname><given-names>PY</given-names> </name></person-group><article-title>Supporting qualitative analysis with large language models: combining codebook with GPT-3 for deductive coding</article-title><conf-name>Proceedings of the 28th International Conference on Intelligent User Interfaces (IUI &#x2019;23)</conf-name><conf-date>Mar 27-31, 2023</conf-date><conf-loc>Sydney, Australia</conf-loc><pub-id pub-id-type="doi">10.1145/3581754.3584136</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="web"><article-title>The Model Core Curriculum for Medical Education (2022 revision)</article-title><source>Medical Education Model Core Curriculum Expert Research Committee</source><year>2022</year><access-date>2025-10-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="http://jsme.umin.ac.jp/eng/activities/index.html">http://jsme.umin.ac.jp/eng/activities/index.html</ext-link></comment></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nishigori</surname><given-names>H</given-names> </name></person-group><article-title>Medical education in Japan</article-title><source>Med Teach</source><year>2024</year><month>06</month><day>4</day><volume>46</volume><issue>sup1</issue><fpage>S4</fpage><lpage>S10</lpage><pub-id pub-id-type="doi">10.1080/0142159X.2024.2372108</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Levine</surname><given-names>RB</given-names> </name><name name-style="western"><surname>Kern</surname><given-names>DE</given-names> </name><name name-style="western"><surname>Wright</surname><given-names>SM</given-names> </name></person-group><article-title>The impact of prompted narrative writing during internship on reflective practice: a qualitative study</article-title><source>Adv Health Sci Educ Theory Pract</source><year>2008</year><month>12</month><volume>13</volume><issue>5</issue><fpage>723</fpage><lpage>733</lpage><pub-id pub-id-type="doi">10.1007/s10459-007-9079-x</pub-id><pub-id pub-id-type="medline">17899421</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kosinski</surname><given-names>M</given-names> </name></person-group><article-title>Evaluating large language models in theory of mind tasks</article-title><source>Proc Natl Acad Sci U S A</source><year>2024</year><month>11</month><day>5</day><volume>121</volume><issue>45</issue><fpage>e2405460121</fpage><pub-id pub-id-type="doi">10.1073/pnas.2405460121</pub-id><pub-id pub-id-type="medline">39471222</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Paydar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Esmaeeli</surname><given-names>E</given-names> </name><name name-style="western"><surname>Ameri</surname><given-names>F</given-names> </name><name name-style="western"><surname>Sabahi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Meraji</surname><given-names>M</given-names> </name></person-group><article-title>Investigating the advantages and disadvantages of electronic logbooks for education goals promotion in medical 
sciences students: a systematic review</article-title><source>Health Sci Rep</source><year>2023</year><month>12</month><volume>6</volume><issue>12</issue><fpage>e1776</fpage><pub-id pub-id-type="doi">10.1002/hsr2.1776</pub-id><pub-id pub-id-type="medline">38125281</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Samsi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>D</given-names> </name><name name-style="western"><surname>McDonald</surname><given-names>J</given-names> </name><etal/></person-group><article-title>From words to watts: benchmarking the energy costs of large language model inference</article-title><conf-name>Proceedings of the 2023 IEEE High Performance Extreme Computing Conference (HPEC &#x2019;23)</conf-name><conf-date>Sep 25-29, 2023</conf-date><conf-loc>Boston, MA</conf-loc><pub-id pub-id-type="doi">10.1109/HPEC58863.2023.10363447</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Madaan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Aggarwal</surname><given-names>P</given-names> </name><name name-style="western"><surname>Anand</surname><given-names>A</given-names> </name><etal/></person-group><article-title>AutoMix: automatically mixing language models</article-title><source>arXiv</source><comment>Preprint posted online on Oct 19, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2310.12963</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>The prompts for the OpenAI application programming interface (API) and GitHub repository include an experience extraction API, prompt used in API, R code to analyze the data, and the data 
themselves.</p><media xlink:href="mededu_v11i1e68697_app1.docx" xlink:title="DOCX File, 17 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Student records.</p><media xlink:href="mededu_v11i1e68697_app2.xlsx" xlink:title="XLSX File, 23 KB"/></supplementary-material></app-group></back></article>