@Article{info:doi/10.2196/70420, author="Quon, Stephanie and Zhou, Sarah", title="Enhancing AI-Driven Medical Translations: Considerations for Language Concordance", journal="JMIR Med Educ", year="2025", month="Apr", day="11", volume="11", pages="e70420", keywords="letter to the editor", keywords="ChatGPT", keywords="AI", keywords="artificial intelligence", keywords="language", keywords="translation", keywords="health care disparity", keywords="natural language model", keywords="survey", keywords="patient education", keywords="accessibility", keywords="preference", keywords="human language", keywords="communication", keywords="language-concordant care", doi="10.2196/70420", url="https://mededu.jmir.org/2025/1/e70420" } @Article{info:doi/10.2196/71721, author="Teng, Joyce and Novoa, Andres Roberto and Aleshin, Alexandrovna Maria and Lester, Jenna and Seiger, Kira and Dzuali, Fiatsogbe and Daneshjou, Roxana", title="Authors' Reply: Enhancing AI-Driven Medical Translations: Considerations for Language Concordance", journal="JMIR Med Educ", year="2025", month="Apr", day="11", volume="11", pages="e71721", keywords="ChatGPT", keywords="artificial intelligence", keywords="language", keywords="translation", keywords="health care disparity", keywords="natural language model", keywords="survey", keywords="patient education", keywords="accessibility", keywords="preference", keywords="human language", keywords="communication", keywords="language-concordant care", doi="10.2196/71721", url="https://mededu.jmir.org/2025/1/e71721" } @Article{info:doi/10.2196/65726, author="K{\i}yak, Selim Yavuz and Kononowicz, A. Andrzej", title="Using a Hybrid of AI and Template-Based Method in Automatic Item Generation to Create Multiple-Choice Questions in Medical Education: Hybrid AIG", journal="JMIR Form Res", year="2025", month="Apr", day="4", volume="9", pages="e65726", keywords="automatic item generation", keywords="ChatGPT", keywords="artificial intelligence", keywords="large language models", keywords="medical education", keywords="AI", keywords="hybrid", keywords="template-based method", keywords="hybrid AIG", keywords="mixed-method", keywords="multiple-choice question", keywords="multiple-choice", keywords="human-AI collaboration", keywords="human-AI", keywords="algorithm", keywords="expert", abstract="Background: Template-based automatic item generation (AIG) is more efficient than traditional item writing but it still heavily relies on expert effort in model development. While nontemplate-based AIG, leveraging artificial intelligence (AI), offers efficiency, it faces accuracy challenges. Medical education, a field that relies heavily on both formative and summative assessments with multiple choice questions, is in dire need of AI-based support for the efficient automatic generation of items. Objective: We aimed to propose a hybrid AIG to demonstrate whether it is possible to generate item templates using AI in the field of medical education. Methods: This is a mixed-methods methodological study with proof-of-concept elements. We propose the hybrid AIG method as a structured series of interactions between a human subject matter expert and AI, designed as a collaborative authoring effort. The method leverages AI to generate item models (templates) and cognitive models to combine the advantages of the two AIG approaches.
To demonstrate how to create item models using hybrid AIG, we used 2 medical multiple-choice questions: one on respiratory infections in adults and another on acute allergic reactions in the pediatric population. Results: The hybrid AIG method we propose consists of 7 steps. The first 5 steps are performed by an expert in a customized AI environment. These involve providing a parent item, identifying elements for manipulation, selecting options and assigning values to elements, and generating the cognitive model. After a final expert review (Step 6), the content in the template can be used for item generation through traditional (non-AI) software (Step 7). We showed that AI is capable of generating item templates for AIG under the control of a human expert in only 10 minutes. Leveraging AI in template development made it less challenging. Conclusions: The hybrid AIG method transcends the traditional template-based approach by marrying the ``art'' that comes from AI as a ``black box'' with the ``science'' of algorithmic generation under the oversight of an expert as a ``marriage registrar''. It not only capitalizes on the strengths of both approaches but also mitigates their weaknesses, offering a human-AI collaboration to increase efficiency in medical education. ", doi="10.2196/65726", url="https://formative.jmir.org/2025/1/e65726" } @Article{info:doi/10.2196/68486, author="Cook, A. David and Overgaard, Joshua and Pankratz, Shane V. and Del Fiol, Guilherme and Aakre, A. Chris", title="Virtual Patients Using Large Language Models: Scalable, Contextualized Simulation of Clinician-Patient Dialogue With Feedback", journal="J Med Internet Res", year="2025", month="Apr", day="4", volume="27", pages="e68486", keywords="simulation training", keywords="natural language processing", keywords="computer-assisted instruction", keywords="clinical decision-making", keywords="clinical reasoning", keywords="machine learning", keywords="virtual patient", keywords="natural language generation", abstract="Background: Virtual patients (VPs) are computer screen--based simulations of patient-clinician encounters. VP use is limited by cost and low scalability. Objective: We aimed to show that VPs powered by large language models (LLMs) can generate authentic dialogues, accurately represent patient preferences, and provide personalized feedback on clinical performance. We also explored using LLMs to rate the quality of dialogues and feedback. Methods: We conducted an intrinsic evaluation study rating 60 VP-clinician conversations. We used carefully engineered prompts to direct OpenAI's generative pretrained transformer (GPT) to emulate a patient and provide feedback. Using 2 outpatient medicine topics (chronic cough diagnosis and diabetes management), each with permutations representing different patient preferences, we created 60 conversations (dialogues plus feedback): 48 with a human clinician and 12 ``self-chat'' dialogues with GPT role-playing both the VP and clinician. Primary outcomes were dialogue authenticity and feedback quality, rated using novel instruments for which we conducted a validation study collecting evidence of content, internal structure (reproducibility), relations with other variables, and response process. Each conversation was rated by 3 physicians and by GPT. Secondary outcomes included user experience, bias, patient preferences represented in the dialogues, and conversation features that influenced authenticity.
Results: The average cost per conversation was US \$0.51 for GPT-4.0-Turbo and US \$0.02 for GPT-3.5-Turbo. Mean (SD) conversation ratings, maximum 6, were overall dialogue authenticity 4.7 (0.7), overall user experience 4.9 (0.7), and average feedback quality 4.7 (0.6). For dialogues created using GPT-4.0-Turbo, physician ratings of patient preferences aligned with intended preferences in 20 to 47 of 48 dialogues (42\%-98\%). Subgroup comparisons revealed higher ratings for dialogues using GPT-4.0-Turbo versus GPT-3.5-Turbo and for human-generated versus self-chat dialogues. Feedback ratings were similar for human-generated versus GPT-generated ratings, whereas authenticity ratings were lower. We did not perceive bias in any conversation. Dialogue features that detracted from authenticity included that GPT was verbose or used atypical vocabulary (93/180, 51.7\% of conversations), was overly agreeable (n=56, 31\%), repeated the question as part of the response (n=47, 26\%), was easily convinced by clinician suggestions (n=35, 19\%), or was not disaffected by poor clinician performance (n=32, 18\%). For feedback, detractors included excessively positive feedback (n=42, 23\%), failure to mention important weaknesses or strengths (n=41, 23\%), or factual inaccuracies (n=39, 22\%). Regarding validation of dialogue and feedback scores, items were meticulously developed (content evidence), and we confirmed expected relations with other variables (higher ratings for advanced LLMs and human-generated dialogues). Reproducibility was suboptimal, due largely to variation in LLM performance rather than rater idiosyncrasies. Conclusions: LLM-powered VPs can simulate patient-clinician dialogues, demonstrably represent patient preferences, and provide personalized performance feedback. This approach is scalable, globally accessible, and inexpensive. LLM-generated ratings of feedback quality are similar to human ratings. ", doi="10.2196/68486", url="https://www.jmir.org/2025/1/e68486", url="http://www.ncbi.nlm.nih.gov/pubmed/39854611" } @Article{info:doi/10.2196/72998, author="Zhang, Manlin and Zhao, Tianyu", title="Citation Accuracy Challenges Posed by Large Language Models", journal="JMIR Med Educ", year="2025", month="Apr", day="2", volume="11", pages="e72998", keywords="chatGPT", keywords="medical education", keywords="Saudi Arabia", keywords="perceptions", keywords="knowledge", keywords="medical students", keywords="faculty", keywords="chatbot", keywords="qualitative study", keywords="artificial intelligence", keywords="AI", keywords="AI-based tools", keywords="universities", keywords="thematic analysis", keywords="learning", keywords="satisfaction", keywords="LLM", keywords="large language model", doi="10.2196/72998", url="https://mededu.jmir.org/2025/1/e72998" } @Article{info:doi/10.2196/73698, author="Temsah, Mohamad-Hani and Al-Eyadhy, Ayman and Jamal, Amr and Alhasan, Khalid and Malki, H. 
Khalid", title="Authors' Reply: Citation Accuracy Challenges Posed by Large Language Models", journal="JMIR Med Educ", year="2025", month="Apr", day="2", volume="11", pages="e73698", keywords="ChatGPT", keywords="Gemini", keywords="DeepSeek", keywords="medical education", keywords="AI", keywords="artificial intelligence", keywords="Saudi Arabia", keywords="perceptions", keywords="medical students", keywords="faculty", keywords="LLM", keywords="chatbot", keywords="qualitative study", keywords="thematic analysis", keywords="satisfaction", keywords="RAG retrieval-augmented generation", doi="10.2196/73698", url="https://mededu.jmir.org/2025/1/e73698" } @Article{info:doi/10.2196/62857, author="Yan, Zelin and Liu, Jingwen and Fan, Yihong and Lu, Shiyuan and Xu, Dingting and Yang, Yun and Wang, Honggang and Mao, Jie and Tseng, Hou-Chiang and Chang, Tao-Hsing and Chen, Yan", title="Ability of ChatGPT to Replace Doctors in Patient Education: Cross-Sectional Comparative Analysis of Inflammatory Bowel Disease", journal="J Med Internet Res", year="2025", month="Mar", day="31", volume="27", pages="e62857", keywords="AI-assisted", keywords="patient education", keywords="inflammatory bowel disease", keywords="artificial intelligence", keywords="ChatGPT", keywords="patient communities", keywords="social media", keywords="disease management", keywords="readability", keywords="online health information", keywords="conversational agents", abstract="Background: Although large language models (LLMs) such as ChatGPT show promise for providing specialized information, their quality requires further evaluation. This is especially true considering that these models are trained on internet text and the quality of health-related information available online varies widely. Objective: The aim of this study was to evaluate the performance of ChatGPT in the context of patient education for individuals with chronic diseases, comparing it with that of industry experts to elucidate its strengths and limitations. Methods: This evaluation was conducted in September 2023 by analyzing the responses of ChatGPT and specialist doctors to questions posed by patients with inflammatory bowel disease (IBD). We compared their performance in terms of subjective accuracy, empathy, completeness, and overall quality, as well as readability to support objective analysis. Results: In a series of 1578 binary choice assessments, ChatGPT was preferred in 48.4\% (95\% CI 45.9\%-50.9\%) of instances. There were 12 instances where ChatGPT's responses were unanimously preferred by all evaluators, compared with 17 instances for specialist doctors. In terms of overall quality, there was no significant difference between the responses of ChatGPT (3.98, 95\% CI 3.93-4.02) and those of specialist doctors (3.95, 95\% CI 3.90-4.00; t524=0.95, P=.34), both being considered ``good.'' Although differences in accuracy (t521=0.48, P=.63) and empathy (t511=2.19, P=.03) lacked statistical significance, the completeness of textual output (t509=9.27, P<.001) was a distinct advantage of the LLM (ChatGPT). In the sections of the questionnaire where patients and doctors responded together (Q223-Q242), ChatGPT demonstrated inferior performance (t36=2.91, P=.006). Regarding readability, no statistical difference was found between the responses of specialist doctors (median: 7th grade; Q1: 4th grade; Q3: 8th grade) and those of ChatGPT (median: 7th grade; Q1: 7th grade; Q3: 8th grade) according to the Mann-Whitney U test (P=.09). 
The overall quality of ChatGPT's output exhibited strong correlations with other subdimensions (with empathy: r=0.842; with accuracy: r=0.839; with completeness: r=0.795), and there was also a high correlation between the subdimensions of accuracy and completeness (r=0.762). Conclusions: ChatGPT demonstrated more stable performance across various dimensions. Its output of health information content is more structurally sound, addressing the issue of variability in the information from individual specialist doctors. ChatGPT's performance highlights its potential as an auxiliary tool for health information, despite limitations such as artificial intelligence hallucinations. It is recommended that patients be involved in the creation and evaluation of health information to enhance the quality and relevance of the information. ", doi="10.2196/62857", url="https://www.jmir.org/2025/1/e62857" } @Article{info:doi/10.2196/58375, author="Madrid, Julian and Diehl, Philipp and Selig, Mischa and Rolauffs, Bernd and Hans, Patricius Felix and Busch, Hans-J{\"o}rg and Scheef, Tobias and Benning, Leo", title="Performance of Plug-In Augmented ChatGPT and Its Ability to Quantify Uncertainty: Simulation Study on the German Medical Board Examination", journal="JMIR Med Educ", year="2025", month="Mar", day="21", volume="11", pages="e58375", keywords="medical education", keywords="artificial intelligence", keywords="generative AI", keywords="large language model", keywords="LLM", keywords="ChatGPT", keywords="GPT-4", keywords="board licensing examination", keywords="professional education", keywords="examination", keywords="student", keywords="experimental", keywords="bootstrapping", keywords="confidence interval", abstract="Background: GPT-4 is a large language model (LLM) trained and fine-tuned on an extensive dataset. After the public release of its predecessor in November 2022, the use of LLMs has seen a significant spike in interest, and a multitude of potential use cases have been proposed. In parallel, however, important limitations have been outlined. Particularly, current LLMs encounter limitations, especially in symbolic representation and accessing contemporary data. The recent version of GPT-4, alongside newly released plugin features, has been introduced to mitigate some of these limitations. Objective: Against this background, this work aims to investigate the performance of GPT-3.5, GPT-4, GPT-4 with plugins, and GPT-4 with plugins using pretranslated English text on the German medical board examination. Recognizing the critical importance of quantifying uncertainty for LLM applications in medicine, we furthermore assess this ability and develop a new metric termed ``confidence accuracy'' to evaluate it. Methods: We used GPT-3.5, GPT-4, GPT-4 with plugins, and GPT-4 with plugins and translation to answer questions from the German medical board examination. Additionally, we conducted an analysis to assess how the models justify their answers, the accuracy of their responses, and the error structure of their answers. Bootstrapping and CIs were used to evaluate the statistical significance of our findings. Results: This study demonstrated that available GPT models, as LLM examples, exceeded the minimum competency threshold established by the German medical board for medical students to obtain board certification to practice medicine. Moreover, the models could assess the uncertainty in their responses, albeit exhibiting overconfidence.
Additionally, this work unraveled certain justification and reasoning structures that emerge when GPT generates answers. Conclusions: The high performance of GPTs in answering medical questions positions it well for applications in academia and, potentially, clinical practice. Its capability to quantify uncertainty in answers suggests it could be a valuable artificial intelligence agent within the clinical decision-making loop. Nevertheless, significant challenges must be addressed before artificial intelligence agents can be robustly and safely implemented in the medical domain. ", doi="10.2196/58375", url="https://mededu.jmir.org/2025/1/e58375" } @Article{info:doi/10.2196/70222, author="Andalib, Saman and Spina, Aidin and Picton, Bryce and Solomon, S. Sean and Scolaro, A. John and Nelson, M. Ariana", title="Using AI to Translate and Simplify Spanish Orthopedic Medical Text: Instrument Validation Study", journal="JMIR AI", year="2025", month="Mar", day="21", volume="4", pages="e70222", keywords="large language models", keywords="LLM", keywords="patient education", keywords="translation", keywords="bilingual evaluation understudy", keywords="GPT-4", keywords="Google Translate", abstract="Background: Language barriers contribute significantly to health care disparities in the United States, where a sizable proportion of patients are exclusively Spanish speakers. In orthopedic surgery, such barriers impact both patients' comprehension of and patients' engagement with available resources. Studies have explored the utility of large language models (LLMs) for medical translation but have yet to robustly evaluate artificial intelligence (AI)--driven translation and simplification of orthopedic materials for Spanish speakers. Objective: This study used the bilingual evaluation understudy (BLEU) method to assess translation quality and investigated the ability of AI to simplify patient education materials (PEMs) in Spanish. Methods: PEMs (n=78) from the American Academy of Orthopaedic Surgery were translated from English to Spanish, using 2 LLMs (GPT-4 and Google Translate). The BLEU methodology was applied to compare AI translations with professionally human-translated PEMs. The Friedman test and Dunn multiple comparisons test were used to statistically quantify differences in translation quality. A readability analysis and feature analysis were subsequently performed to evaluate text simplification success and the impact of English text features on BLEU scores. The capability of an LLM to simplify medical language written in Spanish was also assessed. Results: As measured by BLEU scores, GPT-4 showed moderate success in translating PEMs into Spanish but was less successful than Google Translate. Simplified PEMs demonstrated improved readability when compared to original versions (P<.001) but were unable to reach the targeted grade level for simplification. The feature analysis revealed that the total number of syllables and average number of syllables per sentence had the highest impact on BLEU scores. GPT-4 was able to significantly reduce the complexity of medical text written in Spanish (P<.001). Conclusions: Although Google Translate outperformed GPT-4 in translation accuracy, LLMs, such as GPT-4, may provide significant utility in translating medical texts into Spanish and simplifying such texts. 
We recommend considering a dual approach---using Google Translate for translation and GPT-4 for simplification---to improve medical information accessibility and orthopedic surgery education among Spanish-speaking patients. ", doi="10.2196/70222", url="https://ai.jmir.org/2025/1/e70222" } @Article{info:doi/10.2196/58897, author="Tseng, Liang-Wei and Lu, Yi-Chin and Tseng, Liang-Chi and Chen, Yu-Chun and Chen, Hsing-Yu", title="Performance of ChatGPT-4 on Taiwanese Traditional Chinese Medicine Licensing Examinations: Cross-Sectional Study", journal="JMIR Med Educ", year="2025", month="Mar", day="19", volume="11", pages="e58897", keywords="artificial intelligence", keywords="AI language understanding tools", keywords="ChatGPT", keywords="natural language processing", keywords="machine learning", keywords="Chinese medicine license exam", keywords="Chinese medical licensing examination", keywords="medical education", keywords="traditional Chinese medicine", keywords="large language model", abstract="Background: The integration of artificial intelligence (AI), notably ChatGPT, into medical education, has shown promising results in various medical fields. Nevertheless, its efficacy in traditional Chinese medicine (TCM) examinations remains understudied. Objective: This study aims to (1) assess the performance of ChatGPT on the TCM licensing examination in Taiwan and (2) evaluate the model's explainability in answering TCM-related questions to determine its suitability as a TCM learning tool. Methods: We used the GPT-4 model to respond to 480 questions from the 2022 TCM licensing examination. This study compared the performance of the model against that of licensed TCM doctors using 2 approaches, namely direct answer selection and provision of explanations before answer selection. The accuracy and consistency of AI-generated responses were analyzed. Moreover, a breakdown of question characteristics was performed based on the cognitive level, depth of knowledge, types of questions, vignette style, and polarity of questions. Results: ChatGPT achieved an overall accuracy of 43.9\%, which was lower than that of 2 human participants (70\% and 78.4\%). The analysis did not reveal a significant correlation between the accuracy of the model and the characteristics of the questions. An in-depth examination indicated that errors predominantly resulted from a misunderstanding of TCM concepts (55.3\%), emphasizing the limitations of the model with regard to its TCM knowledge base and reasoning capability. Conclusions: Although ChatGPT shows promise as an educational tool, its current performance on TCM licensing examinations is lacking. This highlights the need for enhancing AI models with specialized TCM training and suggests a cautious approach to utilizing AI for TCM education. Future research should focus on model improvement and the development of tailored educational applications to support TCM learning. 
", doi="10.2196/58897", url="https://mededu.jmir.org/2025/1/e58897" } @Article{info:doi/10.2196/67696, author="Pastrak, Mila and Kajitani, Sten and Goodings, James Anthony and Drewek, Austin and LaFree, Andrew and Murphy, Adrian", title="Evaluation of ChatGPT Performance on Emergency Medicine Board Examination Questions: Observational Study", journal="JMIR AI", year="2025", month="Mar", day="12", volume="4", pages="e67696", keywords="artificial intelligence", keywords="ChatGPT-4", keywords="medical education", keywords="emergency medicine", keywords="examination", keywords="examination preparation", abstract="Background: The ever-evolving field of medicine has highlighted the potential for ChatGPT as an assistive platform. However, its use in medical board examination preparation and completion remains unclear. Objective: This study aimed to evaluate the performance of a custom-modified version of ChatGPT-4, tailored with emergency medicine board examination preparatory materials (Anki flashcard deck), compared to its default version and previous iteration (3.5). The goal was to assess the accuracy of ChatGPT-4 answering board-style questions and its suitability as a tool to aid students and trainees in standardized examination preparation. Methods: A comparative analysis was conducted using a random selection of 598 questions from the Rosh In-Training Examination Question Bank. The subjects of the study included three versions of ChatGPT: the Default, a Custom, and ChatGPT-3.5. The accuracy, response length, medical discipline subgroups, and underlying causes of error were analyzed. Results: The Custom version did not demonstrate a significant improvement in accuracy over the Default version (P=.61), although both significantly outperformed ChatGPT-3.5 (P<.001). The Default version produced significantly longer responses than the Custom version, with the mean (SD) values being 1371 (444) and 929 (408), respectively (P<.001). Subgroup analysis revealed no significant difference in the performance across different medical subdisciplines between the versions (P>.05 in all cases). Both the versions of ChatGPT-4 had similar underlying error types (P>.05 in all cases) and had a 99\% predicted probability of passing while ChatGPT-3.5 had an 85\% probability. Conclusions: The findings suggest that while newer versions of ChatGPT exhibit improved performance in emergency medicine board examination preparation, specific enhancement with a comprehensive Anki flashcard deck on the topic does not significantly impact accuracy. The study highlights the potential of ChatGPT-4 as a tool for medical education, capable of providing accurate support across a wide range of topics in emergency medicine in its default form. 
", doi="10.2196/67696", url="https://ai.jmir.org/2025/1/e67696" } @Article{info:doi/10.2196/59210, author="Monzon, Noahlana and Hays, Alan Franklin", title="Leveraging Generative Artificial Intelligence to Improve Motivation and Retrieval in Higher Education Learners", journal="JMIR Med Educ", year="2025", month="Mar", day="11", volume="11", pages="e59210", keywords="educational technology", keywords="retrieval practice", keywords="flipped classroom", keywords="cognitive engagement", keywords="personalized learning", keywords="generative artificial intelligence", keywords="higher education", keywords="university education", keywords="learners", keywords="instructors", keywords="curriculum structure", keywords="learning", keywords="technologies", keywords="innovation", keywords="academic misconduct", keywords="gamification", keywords="self-directed", keywords="socio-economic disparities", keywords="interactive approach", keywords="medical education", keywords="chatGPT", keywords="machine learning", keywords="AI", keywords="large language models", doi="10.2196/59210", url="https://mededu.jmir.org/2025/1/e59210" } @Article{info:doi/10.2196/66207, author="Zada, Troy and Tam, Natalie and Barnard, Francois and Van Sittert, Marlize and Bhat, Venkat and Rambhatla, Sirisha", title="Medical Misinformation in AI-Assisted Self-Diagnosis: Development of a Method (EvalPrompt) for Analyzing Large Language Models", journal="JMIR Form Res", year="2025", month="Mar", day="10", volume="9", pages="e66207", keywords="ChatGPT", keywords="health care", keywords="LLM", keywords="misinformation", keywords="self-diagnosis", keywords="large language model", abstract="Background: Rapid integration of large language models (LLMs) in health care is sparking global discussion about their potential to revolutionize health care quality and accessibility. At a time when improving health care quality and access remains a critical concern for countries worldwide, the ability of these models to pass medical examinations is often cited as a reason to use them for medical training and diagnosis. However, the impact of their inevitable use as a self-diagnostic tool and their role in spreading health care misinformation has not been evaluated. Objective: This study aims to assess the effectiveness of LLMs, particularly ChatGPT, from the perspective of an individual self-diagnosing to better understand the clarity, correctness, and robustness of the models. Methods: We propose the comprehensive testing methodology evaluation of LLM prompts (EvalPrompt). This evaluation methodology uses multiple-choice medical licensing examination questions to evaluate LLM responses. Experiment 1 prompts ChatGPT with open-ended questions to mimic real-world self-diagnosis use cases, and experiment 2 performs sentence dropout on the correct responses from experiment 1 to mimic self-diagnosis with missing information. Humans then assess the responses returned by ChatGPT for both experiments to evaluate the clarity, correctness, and robustness of ChatGPT. Results: In experiment 1, we found that ChatGPT-4.0 was deemed correct for 31\% (29/94) of the questions by both nonexperts and experts, with only 34\% (32/94) agreement between the 2 groups. Similarly, in experiment 2, which assessed robustness, 61\% (92/152) of the responses continued to be categorized as correct by all assessors. As a result, in comparison to a passing threshold of 60\%, ChatGPT-4.0 is considered incorrect and unclear, though robust. 
This indicates that sole reliance on ChatGPT-4.0 for self-diagnosis could increase the risk of individuals being misinformed. Conclusions: The results highlight the modest capabilities of LLMs, as their responses are often unclear and inaccurate. Any medical advice provided by LLMs should be cautiously approached due to the significant risk of misinformation. However, evidence suggests that LLMs are steadily improving and could potentially play a role in health care systems in the future. To address the issue of medical misinformation, there is a pressing need for the development of a comprehensive self-diagnosis dataset. This dataset could enhance the reliability of LLMs in medical applications by featuring more realistic prompt styles with minimal information across a broader range of medical fields. ", doi="10.2196/66207", url="https://formative.jmir.org/2025/1/e66207" } @Article{info:doi/10.2196/60431, author="Kammies, Chamandra and Archer, Elize and Engel-Hills, Penelope and Volschenk, Mariette", title="Exploring Curriculum Considerations to Prepare Future Radiographers for an AI-Assisted Health Care Environment: Protocol for Scoping Review", journal="JMIR Res Protoc", year="2025", month="Mar", day="6", volume="14", pages="e60431", keywords="artificial intelligence", keywords="machine learning", keywords="radiography", keywords="education", keywords="scoping review", abstract="Background: The use of artificial intelligence (AI) technologies in radiography practice is increasing. As this advanced technology becomes more embedded in radiography systems and clinical practice, the role of radiographers will evolve. In the context of these anticipated changes, it may be reasonable to expect modifications to the competencies and educational requirements of current and future practitioners to ensure successful AI adoption. Objective: The aim of this scoping review is to explore and synthesize the literature on the adjustments needed in the radiography curriculum to prepare radiography students for the demands of AI-assisted health care environments. Methods: Using the Joanna Briggs Institute methodology, an initial search was run in Scopus to determine whether the search strategy that was developed with a library specialist would capture the relevant literature by screening the title and abstract of the first 50 articles. Additional search terms identified in the articles were added to the search strategy. Next, EBSCOhost, PubMed, and Web of Science databases were searched. In total, 2 reviewers will independently review the title, abstract, and full-text articles according to the predefined inclusion and exclusion criteria, with conflicts resolved by a third reviewer. Results: The search results will be reported using the PRISMA-ScR (Preferred Reporting Items for Systematic Reviews and Meta-Analyses Extension for Scoping Reviews) checklist. The final scoping review will present the data analysis as findings in tabular form and through narrative descriptions. The final database searches were completed in October 2024 and yielded 2224 records. Title and abstract screening of 1930 articles is underway after removing 294 duplicates. The scoping review is expected to be finalized by the end of March 2025. Conclusions: A scoping review aims to systematically map the evidence on the adjustments needed in the radiography curriculum to prepare radiography students for the integration of AI technologies in the health care environment. 
It is relevant to map the evidence because increased integration of AI-based technologies in clinical practice has been noted and changes in practice must be underpinned by appropriate education and training. The findings in this study will provide a better understanding of how the radiography curriculum should adapt to meet the educational needs of current and future radiographers to ensure competent and safe practice in response to AI technologies. Trial Registration: Open Science Framework 3nx2a; https://osf.io/3nx2a International Registered Report Identifier (IRRID): PRR1-10.2196/60431 ", doi="10.2196/60431", url="https://www.researchprotocols.org/2025/1/e60431", url="http://www.ncbi.nlm.nih.gov/pubmed/40053777" } @Article{info:doi/10.2196/65108, author="Prazeres, Filipe", title="ChatGPT's Performance on Portuguese Medical Examination Questions: Comparative Analysis of ChatGPT-3.5 Turbo and ChatGPT-4o Mini", journal="JMIR Med Educ", year="2025", month="Mar", day="5", volume="11", pages="e65108", keywords="ChatGPT-3.5 Turbo", keywords="ChatGPT-4o mini", keywords="medical examination", keywords="European Portuguese", keywords="AI performance evaluation", keywords="Portuguese", keywords="evaluation", keywords="medical examination questions", keywords="examination question", keywords="chatbot", keywords="ChatGPT", keywords="model", keywords="artificial intelligence", keywords="AI", keywords="GPT", keywords="LLM", keywords="NLP", keywords="natural language processing", keywords="machine learning", keywords="large language model", abstract="Background: Advancements in ChatGPT are transforming medical education by providing new tools for assessment and learning, potentially enhancing evaluations for doctors and improving instructional effectiveness. Objective: This study evaluates the performance and consistency of ChatGPT-3.5 Turbo and ChatGPT-4o mini in solving European Portuguese medical examination questions (2023 National Examination for Access to Specialized Training; Prova Nacional de Acesso {\`a} Forma{\c{c}}{\~a}o Especializada [PNA]) and compares their performance to human candidates. Methods: ChatGPT-3.5 Turbo was tested on the first part of the examination (74 questions) on July 18, 2024, and ChatGPT-4o mini on the second part (74 questions) on July 19, 2024. Each model generated an answer using its natural language processing capabilities. To test consistency, each model was asked, ``Are you sure?'' after providing an answer. Differences between the first and second responses of each model were analyzed using the McNemar test with continuity correction. A single-parameter t test compared the models' performance to human candidates. Frequencies and percentages were used for categorical variables, and means and CIs for numerical variables. Statistical significance was set at P<.05. Results: ChatGPT-4o mini achieved an accuracy rate of 65\% (48/74) on the 2023 PNA examination, surpassing ChatGPT-3.5 Turbo. ChatGPT-4o mini outperformed medical candidates, while ChatGPT-3.5 Turbo had a more moderate performance. Conclusions: This study highlights the advancements and potential of ChatGPT models in medical education, emphasizing the need for careful implementation with teacher oversight and further research. 
", doi="10.2196/65108", url="https://mededu.jmir.org/2025/1/e65108" } @Article{info:doi/10.2196/62779, author="Doru, Berin and Maier, Christoph and Busse, Sophie Johanna and L{\"u}cke, Thomas and Sch{\"o}nhoff, Judith and Enax- Krumova, Elena and Hessler, Steffen and Berger, Maria and Tokic, Marianne", title="Detecting Artificial Intelligence--Generated Versus Human-Written Medical Student Essays: Semirandomized Controlled Study", journal="JMIR Med Educ", year="2025", month="Mar", day="3", volume="11", pages="e62779", keywords="artificial intelligence", keywords="ChatGPT", keywords="large language models", keywords="textual analysis", keywords="writing style", keywords="AI", keywords="chatbot", keywords="LLMs", keywords="detection", keywords="authorship", keywords="medical student", keywords="linguistic quality", keywords="decision-making", keywords="logical coherence", abstract="Background: Large language models, exemplified by ChatGPT, have reached a level of sophistication that makes distinguishing between human- and artificial intelligence (AI)--generated texts increasingly challenging. This has raised concerns in academia, particularly in medicine, where the accuracy and authenticity of written work are paramount. Objective: This semirandomized controlled study aims to examine the ability of 2 blinded expert groups with different levels of content familiarity---medical professionals and humanities scholars with expertise in textual analysis---to distinguish between longer scientific texts in German written by medical students and those generated by ChatGPT. Additionally, the study sought to analyze the reasoning behind their identification choices, particularly the role of content familiarity and linguistic features. Methods: Between May and August 2023, a total of 35 experts (medical: n=22; humanities: n=13) were each presented with 2 pairs of texts on different medical topics. Each pair had similar content and structure: 1 text was written by a medical student, and the other was generated by ChatGPT (version 3.5, March 2023). Experts were asked to identify the AI-generated text and justify their choice. These justifications were analyzed through a multistage, interdisciplinary qualitative analysis to identify relevant textual features. Before unblinding, experts rated each text on 6 characteristics: linguistic fluency and spelling/grammatical accuracy, scientific quality, logical coherence, expression of knowledge limitations, formulation of future research questions, and citation quality. Univariate tests and multivariate logistic regression analyses were used to examine associations between participants' characteristics, their stated reasons for author identification, and the likelihood of correctly determining a text's authorship. Results: Overall, in 48 out of 69 (70\%) decision rounds, participants accurately identified the AI-generated texts, with minimal difference between groups (medical: 31/43, 72\%; humanities: 17/26, 65\%; odds ratio [OR] 1.37, 95\% CI 0.5-3.9). While content errors had little impact on identification accuracy, stylistic features---particularly redundancy (OR 6.90, 95\% CI 1.01-47.1), repetition (OR 8.05, 95\% CI 1.25-51.7), and thread/coherence (OR 6.62, 95\% CI 1.25-35.2)---played a crucial role in participants' decisions to identify a text as AI-generated. 
Conclusions: The findings suggest that both medical and humanities experts were able to identify ChatGPT-generated texts in medical contexts, with their decisions largely based on linguistic attributes. The accuracy of identification appears to be independent of experts' familiarity with the text content. As the decision-making process primarily relies on linguistic attributes---such as stylistic features and text coherence---further quasi-experimental studies using texts from other academic disciplines should be conducted to determine whether instructions based on these features can enhance lecturers' ability to distinguish between student-authored and AI-generated work. ", doi="10.2196/62779", url="https://mededu.jmir.org/2025/1/e62779", url="http://www.ncbi.nlm.nih.gov/pubmed/40053752" } @Article{info:doi/10.2196/66478, author="Scherr, Riley and Spina, Aidin and Dao, Allen and Andalib, Saman and Halaseh, F. Faris and Blair, Sarah and Wiechmann, Warren and Rivera, Ronald", title="Novel Evaluation Metric and Quantified Performance of ChatGPT-4 Patient Management Simulations for Early Clinical Education: Experimental Study", journal="JMIR Form Res", year="2025", month="Feb", day="27", volume="9", pages="e66478", keywords="medical school simulations", keywords="AI in medical education", keywords="preclinical curriculum", keywords="ChatGPT", keywords="ChatGPT-4", keywords="medical simulation", keywords="simulation", keywords="multimedia", keywords="feedback", keywords="medical education", keywords="medical student", keywords="clinical education", keywords="pilot study", keywords="patient management", abstract="Background: Case studies have shown ChatGPT can run clinical simulations at the medical student level. However, no data have assessed ChatGPT's reliability in meeting desired simulation criteria such as medical accuracy, simulation formatting, and robust feedback mechanisms. Objective: This study aims to quantify ChatGPT's ability to consistently follow formatting instructions and create simulations for preclinical medical student learners according to principles of medical simulation and multimedia educational technology. Methods: Using ChatGPT-4 and a prevalidated starting prompt, the authors ran 360 separate simulations of an acute asthma exacerbation. A total of 180 simulations were given correct answers and 180 simulations were given incorrect answers. ChatGPT was evaluated for its ability to adhere to basic simulation parameters (stepwise progression, free response, interactivity), advanced simulation parameters (autonomous conclusion, delayed feedback, comprehensive feedback), and medical accuracy (vignette, treatment updates, feedback). Significance was determined with $\chi${\texttwosuperior} analyses using 95\% CIs for odds ratios. Results: In total, 100\% (n=360) of simulations met basic simulation parameters and were medically accurate. For advanced parameters, 55\% (200/360) of all simulations delayed feedback, while the Correct arm (157/180, 87\%) delayed feedback was significantly more than the Incorrect arm (43/180, 24\%; P<.001). A total of 79\% (285/360) of simulations concluded autonomously, and there was no difference between the Correct and Incorrect arms in autonomous conclusion (146/180, 81\% and 139/180, 77\%; P=.36). Overall, 78\% (282/360) of simulations gave comprehensive feedback, and there was no difference between the Correct and Incorrect arms in comprehensive feedback (137/180, 76\% and 145/180, 81\%; P=.31). 
ChatGPT-4 was not significantly more likely to conclude simulations autonomously (P=.34) and provide comprehensive feedback (P=.27) when feedback was delayed compared to when feedback was not delayed. Conclusions: These simulations have the potential to be a reliable educational tool for simple simulations and can be evaluated by a novel 9-part metric. Per this metric, ChatGPT simulations performed perfectly on medical accuracy and basic simulation parameters. It performed well on comprehensive feedback and autonomous conclusion. Delayed feedback depended on the accuracy of user inputs. A simulation meeting one advanced parameter was not more likely to meet all advanced parameters. Further work must be done to ensure consistent performance across a broader range of simulation scenarios. ", doi="10.2196/66478", url="https://formative.jmir.org/2025/1/e66478" } @Article{info:doi/10.2196/63400, author="Abouammoh, Noura and Alhasan, Khalid and Aljamaan, Fadi and Raina, Rupesh and Malki, H. Khalid and Altamimi, Ibraheem and Muaygil, Ruaim and Wahabi, Hayfaa and Jamal, Amr and Alhaboob, Ali and Assiri, Assad Rasha and Al-Tawfiq, A. Jaffar and Al-Eyadhy, Ayman and Soliman, Mona and Temsah, Mohamad-Hani", title="Perceptions and Earliest Experiences of Medical Students and Faculty With ChatGPT in Medical Education: Qualitative Study", journal="JMIR Med Educ", year="2025", month="Feb", day="20", volume="11", pages="e63400", keywords="ChatGPT", keywords="medical education", keywords="Saudi Arabia", keywords="perceptions", keywords="knowledge", keywords="medical students", keywords="faculty", keywords="chatbot", keywords="qualitative study", keywords="artificial intelligence", keywords="AI", keywords="AI-based tools", keywords="universities", keywords="thematic analysis", keywords="learning", keywords="satisfaction", abstract="Background: With the rapid development of artificial intelligence technologies, there is a growing interest in the potential use of artificial intelligence--based tools like ChatGPT in medical education. However, there is limited research on the initial perceptions and experiences of faculty and students with ChatGPT, particularly in Saudi Arabia. Objective: This study aimed to explore the earliest knowledge, perceived benefits, concerns, and limitations of using ChatGPT in medical education among faculty and students at a leading Saudi Arabian university. Methods: A qualitative exploratory study was conducted in April 2023, involving focused meetings with medical faculty and students with varying levels of ChatGPT experience. A thematic analysis was used to identify key themes and subthemes emerging from the discussions. Results: Participants demonstrated good knowledge of ChatGPT and its functions. The main themes were perceptions of ChatGPT use, potential benefits, and concerns about ChatGPT in research and medical education. The perceived benefits included collecting and summarizing information and saving time and effort. However, concerns and limitations centered around the potential lack of critical thinking in the information provided, the ambiguity of references, limitations of access, trust in the output of ChatGPT, and ethical concerns. Conclusions: This study provides valuable insights into the perceptions and experiences of medical faculty and students regarding the use of newly introduced large language models like ChatGPT in medical education. 
While the benefits of ChatGPT were recognized, participants also expressed concerns and limitations requiring further studies for effective integration into medical education, exploring the impact of ChatGPT on learning outcomes, student and faculty satisfaction, and the development of critical thinking skills. ", doi="10.2196/63400", url="https://mededu.jmir.org/2025/1/e63400", url="http://www.ncbi.nlm.nih.gov/pubmed/39977012" } @Article{info:doi/10.2196/66157, author="Potter, Alison and Munsch, Chris and Watson, Elaine and Hopkins, Emily and Kitromili, Sofia and O'Neill, Cameron Iain and Larbie, Judy and Niittymaki, Essi and Ramsay, Catriona and Burke, Joshua and Ralph, Neil", title="Identifying Research Priorities in Digital Education for Health Care: Umbrella Review and Modified Delphi Method Study", journal="J Med Internet Res", year="2025", month="Feb", day="19", volume="27", pages="e66157", keywords="digital education", keywords="health professions education", keywords="research priorities", keywords="umbrella review", keywords="Delphi", keywords="artificial intelligence", keywords="AI", abstract="Background: In recent years, the use of digital technology in the education of health care professionals has surged, partly driven by the COVID-19 pandemic. However, there is still a need for focused research to establish evidence of its effectiveness. Objective: This study aimed to define the gaps in the evidence for the efficacy of digital education and to identify priority areas where future research has the potential to contribute to our understanding and use of digital education. Methods: We used a 2-stage approach to identify research priorities. First, an umbrella review of the recent literature (published between 2020 and 2023) was performed to identify and build on existing work. Second, expert consensus on the priority research questions was obtained using a modified Delphi method. Results: A total of 8857 potentially relevant papers were identified. Using the PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) methodology, we included 217 papers for full review. All papers were either systematic reviews or meta-analyses. A total of 151 research recommendations were extracted from the 217 papers. These were analyzed, recategorized, and consolidated to create a final list of 63 questions. From these, a modified Delphi process with 42 experts was used to produce the top-five rated research priorities: (1) How do we measure the learning transfer from digital education into the clinical setting? (2) How can we optimize the use of artificial intelligence, machine learning, and deep learning to facilitate education and training? (3) What are the methodological requirements for high-quality rigorous studies assessing the outcomes of digital health education? (4) How does the design of digital education interventions (eg, format and modality) in health professionals' education and training curriculum affect learning outcomes? and (5) How should learning outcomes in the field of health professions' digital education be defined and standardized? Conclusions: This review provides a prioritized list of research gaps in digital education in health care, which will be of use to researchers, educators, education providers, and funding agencies. Additional proposals are discussed regarding the next steps needed to advance this agenda, aiming to promote meaningful and practical research on the use of digital technologies and drive excellence in health care education. 
", doi="10.2196/66157", url="https://www.jmir.org/2025/1/e66157", url="http://www.ncbi.nlm.nih.gov/pubmed/39969988" } @Article{info:doi/10.2196/66633, author="Chow, L. James C. and Li, Kay", title="Developing Effective Frameworks for Large Language Model--Based Medical Chatbots: Insights From Radiotherapy Education With ChatGPT", journal="JMIR Cancer", year="2025", month="Feb", day="18", volume="11", pages="e66633", keywords="artificial intelligence", keywords="AI", keywords="AI in medical education", keywords="radiotherapy chatbot", keywords="large language models", keywords="LLMs", keywords="medical chatbots", keywords="health care AI", keywords="ethical AI in health care", keywords="personalized learning", keywords="natural language processing", keywords="NLP", keywords="radiotherapy education", keywords="AI-driven learning tools", doi="10.2196/66633", url="https://cancer.jmir.org/2025/1/e66633" } @Article{info:doi/10.2196/58766, author="Ichikawa, Tsunagu and Olsen, Elizabeth and Vinod, Arathi and Glenn, Noah and Hanna, Karim and Lund, C. Gregg and Pierce-Talsma, Stacey", title="Generative Artificial Intelligence in Medical Education---Policies and Training at US Osteopathic Medical Schools: Descriptive Cross-Sectional Survey", journal="JMIR Med Educ", year="2025", month="Feb", day="11", volume="11", pages="e58766", keywords="artificial intelligence", keywords="medical education", keywords="faculty development", keywords="policy", keywords="AI", keywords="training", keywords="United States", keywords="school", keywords="university", keywords="college", keywords="institution", keywords="osteopathic", keywords="osteopathy", keywords="curriculum", keywords="student", keywords="faculty", keywords="administrator", keywords="survey", keywords="cross-sectional", abstract="Background: Interest has recently increased in generative artificial intelligence (GenAI), a subset of artificial intelligence that can create new content. Although the publicly available GenAI tools are not specifically trained in the medical domain, they have demonstrated proficiency in a wide range of medical assessments. The future integration of GenAI in medicine remains unknown. However, the rapid availability of GenAI with a chat interface and the potential risks and benefits are the focus of great interest. As with any significant medical advancement or change, medical schools must adapt their curricula to equip students with the skills necessary to become successful physicians. Furthermore, medical schools must ensure that faculty members have the skills to harness these new opportunities to increase their effectiveness as educators. How medical schools currently fulfill their responsibilities is unclear. Colleges of Osteopathic Medicine (COMs) in the United States currently train a significant proportion of the total number of medical students. These COMs are in academic settings ranging from large public research universities to small private institutions. Therefore, studying COMs will offer a representative sample of the current GenAI integration in medical education. Objective: This study aims to describe the policies and training regarding the specific aspect of GenAI in US COMs, targeting students, faculty, and administrators. Methods: Web-based surveys were sent to deans and Student Government Association (SGA) presidents of the main campuses of fully accredited US COMs. 
The dean survey included questions regarding current and planned policies and training related to GenAI for students, faculty, and administrators. The SGA president survey included only those questions related to current student policies and training. Results: Responses were received from 81\% (26/32) of COMs surveyed. This included 47\% (15/32) of the deans and 50\% (16/32) of the SGA presidents (with 5 COMs represented by both the deans and the SGA presidents). Most COMs did not have a policy on the student use of GenAI, as reported by the dean (14/15, 93\%) and the SGA president (14/16, 88\%). Of the COMs with no policy, 79\% (11/14) had no formal plans for policy development. Only 1 COM had training for students, which focused entirely on the ethics of using GenAI. Most COMs had no formal plans to provide mandatory (11/14, 79\%) or elective (11/15, 73\%) training. No COM had GenAI policies for faculty or administrators. Eighty percent had no formal plans for policy development. Furthermore, 33.3\% (5/15) of COMs had faculty or administrator GenAI training. Except for examination question development, there was no training to increase faculty or administrator capabilities and efficiency or to decrease their workload. Conclusions: The survey revealed that most COMs lack GenAI policies and training for students, faculty, and administrators. The few institutions with policies or training were extremely limited in scope. Most institutions without current training or policies had no formal plans for development. The lack of current policies and training initiatives suggests inadequate preparedness for integrating GenAI into the medical school environment, therefore, relegating the responsibility for ethical guidance and training to the individual COM member. ", doi="10.2196/58766", url="https://mededu.jmir.org/2025/1/e58766" } @Article{info:doi/10.2196/63887, author="Burisch, Christian and Bellary, Abhav and Breuckmann, Frank and Ehlers, Jan and Thal, C. Serge and Sellmann, Timur and G{\"o}dde, Daniel", title="ChatGPT-4 Performance on German Continuing Medical Education---Friend or Foe (Trick or Treat)? Protocol for a Randomized Controlled Trial", journal="JMIR Res Protoc", year="2025", month="Feb", day="6", volume="14", pages="e63887", keywords="ChatGPT", keywords="artificial intelligence", keywords="large language model", keywords="postgraduate education", keywords="continuing medical education", keywords="self-assessment program", abstract="Background: The increasing development and spread of artificial and assistive intelligence is opening up new areas of application not only in applied medicine but also in related fields such as continuing medical education (CME), which is part of the mandatory training program for medical doctors in Germany. This study aimed to determine whether medical laypersons can successfully conduct training courses specifically for physicians with the help of a large language model (LLM) such as ChatGPT-4. This study aims to qualitatively and quantitatively investigate the impact of using artificial intelligence (AI; specifically ChatGPT) on the acquisition of credit points in German postgraduate medical education. Objective: Using this approach, we wanted to test further possible applications of AI in the postgraduate medical education setting and obtain results for practical use. 
Depending on the results, the potential influence of LLMs such as ChatGPT-4 on CME will be discussed, for example, as part of a SWOT (strengths, weaknesses, opportunities, threats) analysis. Methods: We designed a randomized controlled trial, in which adult high school students attempt to solve CME tests across six medical specialties in three study arms in total with 18 CME training courses per study arm under different interventional conditions with varying amounts of permitted use of ChatGPT-4. Sample size calculation was performed including guess probability (20\% correct answers, SD=40\%; confidence level of 1--$\alpha$=.95/$\alpha$=.05; test power of 1--$\beta$=.95; P<.05). The study was registered at open scientific framework. Results: As of October 2024, the acquisition of data and students to participate in the trial is ongoing. Upon analysis of our acquired data, we predict our findings to be ready for publication as soon as early 2025. Conclusions: We aim to prove that the advances in AI, especially LLMs such as ChatGPT-4 have considerable effects on medical laypersons' ability to successfully pass CME tests. The implications that this holds on how the concept of continuous medical education requires reevaluation are yet to be contemplated. Trial Registration: OSF Registries 10.17605/OSF.IO/MZNUF; https://osf.io/mznuf International Registered Report Identifier (IRRID): PRR1-10.2196/63887 ", doi="10.2196/63887", url="https://www.researchprotocols.org/2025/1/e63887" } @Article{info:doi/10.2196/58161, author="Gazquez-Garcia, Javier and S{\'a}nchez-Bocanegra, Luis Carlos and Sevillano, Luis Jose", title="AI in the Health Sector: Systematic Review of Key Skills for Future Health Professionals", journal="JMIR Med Educ", year="2025", month="Feb", day="5", volume="11", pages="e58161", keywords="artificial intelligence", keywords="healthcare competencies", keywords="systematic review", keywords="healthcare education", keywords="AI regulation", abstract="Background: Technological advancements have significantly reshaped health care, introducing digital solutions that enhance diagnostics and patient care. Artificial intelligence (AI) stands out, offering unprecedented capabilities in data analysis, diagnostic support, and personalized medicine. However, effectively integrating AI into health care necessitates specialized competencies among professionals, an area still in its infancy in terms of comprehensive literature and formalized training programs. Objective: This systematic review aims to consolidate the essential skills and knowledge health care professionals need to integrate AI into their clinical practice effectively, according to the published literature. Methods: We conducted a systematic review, across databases PubMed, Scopus, and Web of Science, of peer-reviewed literature that directly explored the required skills for health care professionals to integrate AI into their practice, published in English or Spanish from 2018 onward. Studies that did not refer to specific skills or training in digital health were not included, discarding those that did not directly contribute to understanding the competencies necessary to integrate AI into health care practice. Bias in the examined works was evaluated following Cochrane's domain-based recommendations. Results: The initial database search yielded a total of 2457 articles. After deleting duplicates and screening titles and abstracts, 37 articles were selected for full-text review. 
Out of these, only 7 met all the inclusion criteria for this systematic review. The review identified a diverse range of skills and competencies that we categorized into 14 key areas based on their frequency of appearance in the selected studies, including AI fundamentals, data analytics and management, and ethical considerations. Conclusions: Despite the broadening of search criteria to capture the evolving nature of AI in health care, the review underscores a significant gap in focused studies on the required competencies. Moreover, the review highlights the critical role of regulatory bodies such as the US Food and Drug Administration in facilitating the adoption of AI technologies by establishing trust and standardizing algorithms. Key areas were identified for developing competencies among health care professionals for the implementation of AI, including AI fundamentals knowledge (more focused on assessing the accuracy, reliability, and validity of AI algorithms than on more technical abilities such as programming or mathematics), data analysis skills (including data acquisition, cleaning, visualization, management, and governance), and ethical and legal considerations. In an AI-enhanced health care landscape, the ability to humanize patient care through effective communication is paramount. This balance ensures that while AI streamlines tasks and potentially increases patient interaction time, health care professionals maintain a focus on compassionate care, thereby leveraging AI to enhance, rather than detract from, the patient experience. ", doi="10.2196/58161", url="https://mededu.jmir.org/2025/1/e58161" } @Article{info:doi/10.2196/63065, author="Elhassan, Elwaleed Safia and Sajid, Raihan Muhammad and Syed, Mariam Amina and Fathima, Afreen Sidrah and Khan, Shehroz Bushra and Tamim, Hala", title="Assessing Familiarity, Usage Patterns, and Attitudes of Medical Students Toward ChatGPT and Other Chat-Based AI Apps in Medical Education: Cross-Sectional Questionnaire Study", journal="JMIR Med Educ", year="2025", month="Jan", day="30", volume="11", pages="e63065", keywords="ChatGPT", keywords="artificial intelligence", keywords="large language model", keywords="medical students", keywords="ethics", keywords="chat-based", keywords="AI apps", keywords="medical education", keywords="social media", keywords="attitude", keywords="AI", abstract="Background: There has been a rise in the popularity of ChatGPT and other chat-based artificial intelligence (AI) apps in medical education. Despite data being available from other parts of the world, there is a significant lack of information on this topic in medical education and research, particularly in Saudi Arabia. Objective: The primary objective of the study was to examine the familiarity, usage patterns, and attitudes of Alfaisal University medical students toward ChatGPT and other chat-based AI apps in medical education. Methods: This was a cross-sectional study conducted from October 8, 2023, through November 22, 2023. A questionnaire was distributed through social media channels to medical students at Alfaisal University who were 18 years or older. Current Alfaisal University medical students in years 1 through 6, of both genders, were exclusively targeted by the questionnaire. The study was approved by the Alfaisal University Institutional Review Board. A $\chi$2 test was conducted to assess the relationships between gender, year of study, familiarity, and reasons for usage. 
Results: A total of 293 responses were received, of which 95 (32.4\%) were from men and 198 (67.6\%) were from women. There were 236 (80.5\%) responses from preclinical students and 57 (19.5\%) from clinical students. Overall, males (n=93, 97.9\%) showed more familiarity with ChatGPT compared to females (n=180, 90.09\%; P=.03). Additionally, males used Google Bard and Microsoft Bing ChatGPT more than females (P<.001). Clinical-year students used ChatGPT significantly more for general writing purposes compared to preclinical students (P=.005). Additionally, 136 (46.4\%) students believed that using ChatGPT and other chat-based AI apps for coursework was ethical, 86 (29.4\%) were neutral, and 71 (24.2\%) considered it unethical (all Ps>.05). Conclusions: Familiarity with and usage of ChatGPT and other chat-based AI apps were common among the students of Alfaisal University. The usage patterns of these apps differ between males and females and between preclinical and clinical-year students. ", doi="10.2196/63065", url="https://mededu.jmir.org/2025/1/e63065" } @Article{info:doi/10.2196/63775, author="Li, Rui and Wu, Tong", title="Evolution of Artificial Intelligence in Medical Education From 2000 to 2024: Bibliometric Analysis", journal="Interact J Med Res", year="2025", month="Jan", day="30", volume="14", pages="e63775", keywords="artificial intelligence", keywords="medical education", keywords="bibliometric", keywords="citation trends", keywords="academic pattern", keywords="VOSviewer", keywords="Citespace", keywords="AI", abstract="Background: Incorporating artificial intelligence (AI) into medical education has gained significant attention for its potential to enhance teaching and learning outcomes. However, the field lacks a comprehensive study depicting the academic performance and status of AI in the medical education domain. Objective: This study aims to analyze the social patterns, productive contributors, knowledge structure, and clusters since the 21st century. Methods: Documents were retrieved from the Web of Science Core Collection database from 2000 to 2024. VOSviewer, Incites, and Citespace were used to analyze the bibliometric metrics, which were categorized by country, institution, authors, journals, and keywords. The variables analyzed encompassed counts, citations, H-index, impact factor, and collaboration metrics. Results: Altogether, 7534 publications were initially retrieved and 2775 were included for analysis. The annual count and citation of papers exhibited exponential trends since 2018. The United States emerged as the lead contributor due to its high productivity and recognition levels. Stanford University, Johns Hopkins University, National University of Singapore, Mayo Clinic, University of Arizona, and University of Toronto were representative institutions in their respective fields. Cureus, JMIR Medical Education, Medical Teacher, and BMC Medical Education ranked as the top four most productive journals. The resulting heat map highlighted several high-frequency keywords, including performance, education, AI, and model. The citation burst time of terms revealed that AI technologies shifted from image processing (2000), augmented reality (2013), and virtual reality (2016) to decision-making (2020) and model (2021). Keywords such as mortality and robotic surgery persisted into 2023, suggesting the ongoing recognition and interest in these areas. 
Conclusions: This study provides valuable insights and guidance for researchers who are interested in educational technology, as well as recommendations for pioneering institutions and journal submissions. Along with the rapid growth of AI, medical education is expected to benefit even more. ", doi="10.2196/63775", url="https://www.i-jmr.org/2025/1/e63775" } @Article{info:doi/10.2196/67197, author="Taira, Kazuya and Itaya, Takahiro and Yada, Shuntaro and Hiyama, Kirara and Hanada, Ayame", title="Impact of Attached File Formats on the Performance of ChatGPT-4 on the Japanese National Nursing Examination: Evaluation Study", journal="JMIR Nursing", year="2025", month="Jan", day="22", volume="8", pages="e67197", keywords="nursing examination", keywords="machine learning", keywords="ML", keywords="artificial intelligence", keywords="AI", keywords="large language models", keywords="ChatGPT", keywords="generative AI", abstract="Abstract: This research letter discusses the impact of different file formats on ChatGPT-4's performance on the Japanese National Nursing Examination, highlighting the need for standardized reporting protocols to enhance the integration of artificial intelligence in nursing education and practice. ", doi="10.2196/67197", url="https://nursing.jmir.org/2025/1/e67197" } @Article{info:doi/10.2196/64284, author="Wei, Boxiong", title="Performance Evaluation and Implications of Large Language Models in Radiology Board Exams: Prospective Comparative Analysis", journal="JMIR Med Educ", year="2025", month="Jan", day="16", volume="11", pages="e64284", keywords="large language models", keywords="LLM", keywords="artificial intelligence", keywords="AI", keywords="GPT-4", keywords="radiology exams", keywords="medical education", keywords="diagnostics", keywords="medical training", keywords="radiology", keywords="ultrasound", abstract="Background: Artificial intelligence advancements have enabled large language models to significantly impact radiology education and diagnostic accuracy. Objective: This study evaluates the performance of mainstream large language models, including GPT-4, Claude, Bard, Tongyi Qianwen, and Gemini Pro, in radiology board exams. Methods: A comparative analysis of 150 multiple-choice questions from radiology board exams without images was conducted. Models were assessed on their accuracy for text-based questions and were categorized by cognitive levels and medical specialties using $\chi$2 tests and ANOVA. Results: GPT-4 achieved the highest accuracy (83.3\%, 125/150), significantly outperforming all other models. Specifically, Claude achieved an accuracy of 62\% (93/150; P<.001), Bard 54.7\% (82/150; P<.001), Tongyi Qianwen 70.7\% (106/150; P=.009), and Gemini Pro 55.3\% (83/150; P<.001). The odds ratios compared to GPT-4 were 0.33 (95\% CI 0.18-0.60) for Claude, 0.24 (95\% CI 0.13-0.44) for Bard, and 0.25 (95\% CI 0.14-0.45) for Gemini Pro. Tongyi Qianwen performed relatively well with an accuracy of 70.7\% (106/150; P=.02) and had an odds ratio of 0.48 (95\% CI 0.27-0.87) compared to GPT-4. Performance varied across question types and specialties, with GPT-4 excelling in both lower-order and higher-order questions, while Claude and Bard struggled with complex diagnostic questions. Conclusions: GPT-4 and Tongyi Qianwen show promise in medical education and training. The study emphasizes the need for domain-specific training datasets to enhance large language models' effectiveness in specialized fields like radiology. 
", doi="10.2196/64284", url="https://mededu.jmir.org/2025/1/e64284" } @Article{info:doi/10.2196/51319, author="Kim, JaeYong and Vajravelu, Narayan Bathri", title="Assessing the Current Limitations of Large Language Models in Advancing Health Care Education", journal="JMIR Form Res", year="2025", month="Jan", day="16", volume="9", pages="e51319", keywords="large language model", keywords="generative pretrained transformer", keywords="health care education", keywords="health care delivery", keywords="artificial intelligence", keywords="LLM", keywords="ChatGPT", keywords="AI", doi="10.2196/51319", url="https://formative.jmir.org/2025/1/e51319" } @Article{info:doi/10.2196/58898, author="Kaewboonlert, Naritsaret and Poontananggul, Jiraphon and Pongsuwan, Natthipong and Bhakdisongkhram, Gun", title="Factors Associated With the Accuracy of Large Language Models in Basic Medical Science Examinations: Cross-Sectional Study", journal="JMIR Med Educ", year="2025", month="Jan", day="13", volume="11", pages="e58898", keywords="accuracy", keywords="performance", keywords="artificial intelligence", keywords="AI", keywords="ChatGPT", keywords="large language model", keywords="LLM", keywords="difficulty index", keywords="basic medical science examination", keywords="cross-sectional study", keywords="medical education", keywords="datasets", keywords="assessment", keywords="medical science", keywords="tool", keywords="Google", abstract="Background: Artificial intelligence (AI) has become widely applied across many fields, including medical education. Content validation and its answers are based on training datasets and the optimization of each model. The accuracy of large language model (LLMs) in basic medical examinations and factors related to their accuracy have also been explored. Objective: We evaluated factors associated with the accuracy of LLMs (GPT-3.5, GPT-4, Google Bard, and Microsoft Bing) in answering multiple-choice questions from basic medical science examinations. Methods: We used questions that were closely aligned with the content and topic distribution of Thailand's Step 1 National Medical Licensing Examination. Variables such as the difficulty index, discrimination index, and question characteristics were collected. These questions were then simultaneously input into ChatGPT (with GPT-3.5 and GPT-4), Microsoft Bing, and Google Bard, and their responses were recorded. The accuracy of these LLMs and the associated factors were analyzed using multivariable logistic regression. This analysis aimed to assess the effect of various factors on model accuracy, with results reported as odds ratios (ORs). Results: The study revealed that GPT-4 was the top-performing model, with an overall accuracy of 89.07\% (95\% CI 84.76\%?92.41\%), significantly outperforming the others (P<.001). Microsoft Bing followed with an accuracy of 83.69\% (95\% CI 78.85\%?87.80\%), GPT-3.5 at 67.02\% (95\% CI 61.20\%?72.48\%), and Google Bard at 63.83\% (95\% CI 57.92\%?69.44\%). The multivariable logistic regression analysis showed a correlation between question difficulty and model performance, with GPT-4 demonstrating the strongest association. Interestingly, no significant correlation was found between model accuracy and question length, negative wording, clinical scenarios, or the discrimination index for most models, except for Google Bard, which showed varying correlations. 
Conclusions: The GPT-4 and Microsoft Bing models demonstrated comparable accuracy to each other, and both were superior to GPT-3.5 and Google Bard in the domain of basic medical science. The accuracy of these models was significantly influenced by the item's difficulty index, indicating that the LLMs are more accurate when answering easier questions. This suggests that the more accurate models, such as GPT-4 and Bing, can be valuable tools for understanding and learning basic medical science concepts. ", doi="10.2196/58898", url="https://mededu.jmir.org/2025/1/e58898" } @Article{info:doi/10.2196/62669, author="Rjoop, Anwar and Al-Qudah, Mohammad and Alkhasawneh, Raja and Bataineh, Nesreen and Abdaljaleel, Maram and Rjoub, A. Moayad and Alkhateeb, Mustafa and Abdelraheem, Mohammad and Al-Omari, Salem and Bani-Mari, Omar and Alkabalan, Anas and Altulaih, Saoud and Rjoub, Iyad and Alshimi, Rula", title="Awareness and Attitude Toward Artificial Intelligence Among Medical Students and Pathology Trainees: Survey Study", journal="JMIR Med Educ", year="2025", month="Jan", day="10", volume="11", pages="e62669", keywords="artificial intelligence", keywords="AI", keywords="deep learning", keywords="medical schools", keywords="pathology", keywords="Jordan", keywords="medical education", keywords="awareness", keywords="attitude", keywords="medical students", keywords="pathology trainees", keywords="national survey study", keywords="medical practice", keywords="training", keywords="web-based survey", keywords="survey", keywords="questionnaire", abstract="Background: Artificial intelligence (AI) is set to shape the future of medical practice. The perspective and understanding of medical students are critical for guiding the development of educational curricula and training. Objective: This study aims to assess and compare medical AI-related attitudes among medical students in general medicine and in one of the visually oriented fields (pathology), along with illuminating the anticipated role of AI in the rapidly evolving landscape of AI-enhanced health care. Methods: This was a cross-sectional study that used a web-based survey composed of a closed-ended questionnaire. The survey addressed medical students at all educational levels across the 5 public medical schools, along with pathology residents in 4 residency programs in Jordan. Results: A total of 394 respondents participated (328 medical students and 66 pathology residents). The majority of respondents (272/394, 69\%) were already aware of AI and deep learning in medicine, mainly relying on websites for information on AI, while only 14\% (56/394) were aware of AI through medical schools. There was a statistically significant difference in awareness among respondents who consider themselves tech experts compared with those who do not (P=.03). More than half of the respondents believed that AI could be used to diagnose diseases automatically (213/394, 54.1\% agreement), with medical students agreeing more than pathology residents (P=.04). However, more than one-third expressed fear about recent AI developments (167/394, 42.4\% agreed). Two-thirds of respondents disagreed that their medical schools had educated them about AI and its potential use (261/394, 66.2\% disagreed), while 46.2\% (182/394) expressed interest in learning about AI in medicine. In terms of pathology-specific questions, 75.4\% (297/394) agreed that AI could be used to identify pathologies in slide examinations automatically. 
There was a significant difference between medical students and pathology residents in their agreement (P=.001). Overall, medical students and pathology trainees had similar responses. Conclusions: AI education should be introduced into medical school curricula to improve medical students' understanding and attitudes. Students agreed that they need to learn about AI's applications, potential hazards, and legal and ethical implications. This is the first study to analyze medical students' views and awareness of AI in Jordan, as well as the first to include pathology residents' perspectives. The findings are consistent with earlier research internationally. In comparison with prior research, these attitudes are similar in low-income and industrialized countries, highlighting the need for a global strategy to introduce AI instruction to medical students everywhere in this era of rapidly expanding technology. ", doi="10.2196/62669", url="https://mededu.jmir.org/2025/1/e62669" } @Article{info:doi/10.2196/63731, author="Zhu, Shiben and Hu, Wanqin and Yang, Zhi and Yan, Jiani and Zhang, Fang", title="Qwen-2.5 Outperforms Other Large Language Models in the Chinese National Nursing Licensing Examination: Retrospective Cross-Sectional Comparative Study", journal="JMIR Med Inform", year="2025", month="Jan", day="10", volume="13", pages="e63731", keywords="large language models", keywords="LLMs", keywords="Chinese National Nursing Licensing Examination", keywords="ChatGPT", keywords="Qwen-2.5", keywords="multiple-choice questions", abstract="Background: Large language models (LLMs) have been proposed as valuable tools in medical education and practice. The Chinese National Nursing Licensing Examination (CNNLE) presents unique challenges for LLMs due to its requirement for both deep domain-specific nursing knowledge and the ability to make complex clinical decisions, which differentiates it from more general medical examinations. However, their potential application in the CNNLE remains unexplored. Objective: This study aims to evaluate the accuracy of 7 LLMs, including GPT-3.5, GPT-4.0, GPT-4o, Copilot, ERNIE Bot-3.5, SPARK, and Qwen-2.5, on the CNNLE, focusing on their ability to handle domain-specific nursing knowledge and clinical decision-making. We also explore whether combining their outputs using machine learning techniques can improve their overall accuracy. Methods: This retrospective cross-sectional study analyzed all 1200 multiple-choice questions from the CNNLE conducted between 2019 and 2023. Seven LLMs were evaluated on these multiple-choice questions, and 9 machine learning models, including Logistic Regression, Support Vector Machine, Multilayer Perceptron, k-nearest neighbors, Random Forest, LightGBM, AdaBoost, XGBoost, and CatBoost, were used to optimize overall performance through ensemble techniques. Results: Qwen-2.5 achieved the highest overall accuracy of 88.9\%, followed by GPT-4o (80.7\%), ERNIE Bot-3.5 (78.1\%), GPT-4.0 (70.3\%), SPARK (65.0\%), and GPT-3.5 (49.5\%). Qwen-2.5 demonstrated superior accuracy in the Practical Skills section compared with the Professional Practice section across most years. It also performed well in brief clinical case summaries and questions involving shared clinical scenarios. When the outputs of the 7 LLMs were combined using 9 machine learning models, XGBoost yielded the best performance, increasing accuracy to 90.8\%. 
XGBoost also achieved an area under the curve of 0.961, sensitivity of 0.905, specificity of 0.978, F1-score of 0.901, positive predictive value of 0.901, and negative predictive value of 0.977. Conclusions: This study is the first to evaluate the performance of 7 LLMs on the CNNLE and to show that integrating their outputs via machine learning significantly boosted accuracy, reaching 90.8\%. These findings demonstrate the transformative potential of LLMs in revolutionizing health care education and call for further research to refine their capabilities and expand their impact on examination preparation and professional training. ", doi="10.2196/63731", url="https://medinform.jmir.org/2025/1/e63731" } @Article{info:doi/10.2196/63924, author="Zhang, Yong and Lu, Xiao and Luo, Yan and Zhu, Ying and Ling, Wenwu", title="Performance of Artificial Intelligence Chatbots on Ultrasound Examinations: Cross-Sectional Comparative Analysis", journal="JMIR Med Inform", year="2025", month="Jan", day="9", volume="13", pages="e63924", keywords="chatbots", keywords="ChatGPT", keywords="ERNIE Bot", keywords="performance", keywords="accuracy rates", keywords="ultrasound", keywords="language", keywords="examination", abstract="Background: Artificial intelligence chatbots are being increasingly used for medical inquiries, particularly in the field of ultrasound medicine. However, their performance varies and is influenced by factors such as language, question type, and topic. Objective: This study aimed to evaluate the performance of ChatGPT and ERNIE Bot in answering ultrasound-related medical examination questions, providing insights for users and developers. Methods: We curated 554 questions from ultrasound medicine examinations, covering various question types and topics. The questions were posed in both English and Chinese. Objective questions were scored based on accuracy rates, whereas subjective questions were rated by 5 experienced doctors using a Likert scale. The data were analyzed in Excel. Results: Of the 554 questions included in this study, single-choice questions comprised the largest share (354/554, 64\%), followed by short answers (69/554, 12\%) and noun explanations (63/554, 11\%). The accuracy rates for objective questions ranged from 8.33\% to 80\%, with true or false questions scoring highest. Subjective questions received acceptability rates ranging from 47.62\% to 75.36\%. ERNIE Bot was superior to ChatGPT in many aspects (P<.05). Both models showed a performance decline in English, but ERNIE Bot's decline was less significant. The models performed better in terms of basic knowledge, ultrasound methods, and diseases than in terms of ultrasound signs and diagnosis. Conclusions: Chatbots can provide valuable ultrasound-related answers, but performance differs by model and is influenced by language, question type, and topic. In general, ERNIE Bot outperforms ChatGPT. Users and developers should understand model performance characteristics and select appropriate models for different questions and languages to optimize chatbot use. 
", doi="10.2196/63924", url="https://medinform.jmir.org/2025/1/e63924" } @Article{info:doi/10.2196/63865, author="Bland, Tyler", title="Enhancing Medical Student Engagement Through Cinematic Clinical Narratives: Multimodal Generative AI--Based Mixed Methods Study", journal="JMIR Med Educ", year="2025", month="Jan", day="6", volume="11", pages="e63865", keywords="artificial intelligence", keywords="cinematic clinical narratives", keywords="cinemeducation", keywords="medical education", keywords="narrative learning", keywords="AI", keywords="medical student", keywords="pharmacology", keywords="preclinical education", keywords="long-term retention", keywords="AI tools", keywords="GPT-4", keywords="image", keywords="applicability", abstract="Background: Medical students often struggle to engage with and retain complex pharmacology topics during their preclinical education. Traditional teaching methods can lead to passive learning and poor long-term retention of critical concepts. Objective: This study aims to enhance the teaching of clinical pharmacology in medical school by using a multimodal generative artificial intelligence (genAI) approach to create compelling, cinematic clinical narratives (CCNs). Methods: We transformed a standard clinical case into an engaging, interactive multimedia experience called ``Shattered Slippers.'' This CCN used various genAI tools for content creation: GPT-4 for developing the storyline, Leonardo.ai and Stable Diffusion for generating images, Eleven Labs for creating audio narrations, and Suno for composing a theme song. The CCN integrated narrative styles and pop culture references to enhance student engagement. It was applied in teaching first-year medical students about immune system pharmacology. Student responses were assessed through the Situational Interest Survey for Multimedia and examination performance. The target audience comprised first-year medical students (n=40), with 18 responding to the Situational Interest Survey for Multimedia survey (n=18). Results: The study revealed a marked preference for the genAI-enhanced CCNs over traditional teaching methods. Key findings include the majority of surveyed students preferring the CCN over traditional clinical cases (14/18), as well as high average scores for triggered situational interest (mean 4.58, SD 0.53), maintained interest (mean 4.40, SD 0.53), maintained-feeling interest (mean 4.38, SD 0.51), and maintained-value interest (mean 4.42, SD 0.54). Students achieved an average score of 88\% on examination questions related to the CCN material, indicating successful learning and retention. Qualitative feedback highlighted increased engagement, improved recall, and appreciation for the narrative style and pop culture references. Conclusions: This study demonstrates the potential of using a multimodal genAI-driven approach to create CCNs in medical education. The ``Shattered Slippers'' case effectively enhanced student engagement and promoted knowledge retention in complex pharmacological topics. This innovative method suggests a novel direction for curriculum development that could improve learning outcomes and student satisfaction in medical education. Future research should explore the long-term retention of knowledge and the applicability of learned material in clinical settings, as well as the potential for broader implementation of this approach across various medical education contexts. 
", doi="10.2196/63865", url="https://mededu.jmir.org/2025/1/e63865" } @Article{info:doi/10.2196/58426, author="Wang, Heng and Zheng, Danni and Wang, Mengying and Ji, Hong and Han, Jiangli and Wang, Yan and Shen, Ning and Qiao, Jie", title="Artificial Intelligence--Powered Training Database for Clinical Thinking: App Development Study", journal="JMIR Form Res", year="2025", month="Jan", day="3", volume="9", pages="e58426", keywords="artificial intelligence", keywords="clinical thinking ability", keywords="virtual medical records", keywords="distance education", keywords="medical education", keywords="online learning", abstract="Background: With the development of artificial intelligence (AI), medicine has entered the era of intelligent medicine, and various aspects, such as medical education and talent cultivation, are also being redefined. The cultivation of clinical thinking abilities poses a formidable challenge even for seasoned clinical educators, as offline training modalities often fall short in bridging the divide between current practice and the desired ideal. Consequently, there arises an imperative need for the expeditious development of a web-based database, tailored to empower physicians in their quest to learn and hone their clinical reasoning skills. Objective: This study aimed to introduce an app named ``XueYiKu,'' which includes consultations, physical examinations, auxiliary examinations, and diagnosis, incorporating AI and actual complete hospital medical records to build an online-learning platform using human-computer interaction. Methods: The ``XueYiKu'' app was designed as a contactless, self-service, trial-and-error system application based on actual complete hospital medical records and natural language processing technology to comprehensively assess the ``clinical competence'' of residents at different stages. Case extraction was performed at a hospital's case data center, and the best-matching cases were differentiated through natural language processing, word segmentation, synonym conversion, and sorting. More than 400 teaching cases covering 65 kinds of diseases were released for students to learn, and the subjects covered internal medicine, surgery, gynecology and obstetrics, and pediatrics. The difficulty of learning cases was divided into four levels in ascending order. Moreover, the learning and teaching effects were evaluated using 6 dimensions covering systematicness, agility, logic, knowledge expansion, multidimensional evaluation indicators, and preciseness. Results: From the app's first launch on the Android platform in May 2019 to the last version updated in May 2023, the total number of teacher and student users was 6209 and 1180, respectively. The top 3 subjects most frequently learned were respirology (n=606, 24.1\%), general surgery (n=506, 20.1\%), and urinary surgery (n=390, 15.5\%). For diseases, pneumonia was the most frequently learned, followed by cholecystolithiasis (n=216, 14.1\%), benign prostate hyperplasia (n=196, 12.8\%), and bladder tumor (n=193, 12.6\%). Among 479 students, roughly a third (n=168, 35.1\%) scored in the 60 to 80 range, and half of them scored over 80 points (n=238, 49.7\%). The app enabled medical students' learning to become more active and self-motivated, with a variety of formats, and provided real-time feedback through assessments on the platform. 
The learning effect was satisfactory overall and provided an important precedent for establishing scientific models and methods for assessing clinical thinking skills in the future. Conclusions: The integration of AI and medical education will undoubtedly assist in the restructuring of education processes; promote the evolution of the education ecosystem; and provide new convenient ways for independent learning, interactive communication, and educational resource sharing. ", doi="10.2196/58426", url="https://formative.jmir.org/2025/1/e58426" } @Article{info:doi/10.2196/59435, author="Wang, Chenxu and Li, Shuhan and Lin, Nuoxi and Zhang, Xinyu and Han, Ying and Wang, Xiandi and Liu, Di and Tan, Xiaomei and Pu, Dan and Li, Kang and Qian, Guangwu and Yin, Rong", title="Application of Large Language Models in Medical Training Evaluation---Using ChatGPT as a Standardized Patient: Multimetric Assessment", journal="J Med Internet Res", year="2025", month="Jan", day="1", volume="27", pages="e59435", keywords="ChatGPT", keywords="artificial intelligence", keywords="standardized patient", keywords="health care", keywords="prompt engineering", keywords="accuracy", keywords="large language models", keywords="performance evaluation", keywords="medical training", keywords="inflammatory bowel disease", abstract="Background: With the increasing interest in the application of large language models (LLMs) in the medical field, the feasibility of their potential use as standardized patients in medical assessment is rarely evaluated. Specifically, we delved into the potential of using ChatGPT, a representative LLM, in transforming medical education by serving as a cost-effective alternative to standardized patients, specifically for history-taking tasks. Objective: The study aims to explore ChatGPT's viability and performance as a standardized patient, using prompt engineering to refine its accuracy and use in medical assessments. Methods: A 2-phase experiment was conducted. The first phase assessed feasibility by simulating conversations about inflammatory bowel disease (IBD) across 3 quality groups (good, medium, and bad). Responses were categorized based on their relevance and accuracy. Each group consisted of 30 runs, with responses scored to determine whether they were related to the inquiries. For the second phase, we evaluated ChatGPT's performance against specific criteria, focusing on its anthropomorphism, clinical accuracy, and adaptability. Adjustments were made to prompts based on ChatGPT's response shortcomings, with a comparative analysis of ChatGPT's performance between original and revised prompts. A total of 300 runs were conducted and compared against standard reference scores. Finally, the generalizability of the revised prompt was tested using other scripts for another 60 runs, together with the exploration of the impact of the used language on the performance of the chatbot. Results: The feasibility test confirmed ChatGPT's ability to simulate a standardized patient effectively, differentiating among poor, medium, and good medical inquiries with varying degrees of accuracy. Score differences between the poor (74.7, SD 5.44) and medium (82.67, SD 5.30) inquiry groups (P<.001) and between the poor and good (85, SD 3.27) inquiry groups (P<.001) were significant at a significance level ($\alpha$) of .05, while the score differences between the medium and good inquiry groups were not statistically significant (P=.16). 
The revised prompt significantly improved ChatGPT's realism, clinical accuracy, and adaptability, leading to a marked reduction in scoring discrepancies. The score accuracy of ChatGPT improved 4.926 times compared to unrevised prompts. The score difference percentage dropped from 29.83\% to 6.06\%, with a drop in SD from 0.55 to 0.068. The performance of the chatbot on a separate script was acceptable, with an average score difference percentage of 3.21\%. Moreover, the performance differences between test groups using various language combinations were found to be insignificant. Conclusions: ChatGPT, as a representative LLM, is a viable tool for simulating standardized patients in medical assessments, with the potential to enhance medical training. By incorporating proper prompts, ChatGPT's scoring accuracy and response realism significantly improved, approaching the feasibility of actual clinical use. Also, the influence of the adopted language on the chatbot's output was nonsignificant. ", doi="10.2196/59435", url="https://www.jmir.org/2025/1/e59435" } @Article{info:doi/10.2196/63129, author="Miyazaki, Yuki and Hata, Masahiro and Omori, Hisaki and Hirashima, Atsuya and Nakagawa, Yuta and Eto, Mitsuhiro and Takahashi, Shun and Ikeda, Manabu", title="Performance of ChatGPT-4o on the Japanese Medical Licensing Examination: Evaluation of Accuracy in Text-Only and Image-Based Questions", journal="JMIR Med Educ", year="2024", month="Dec", day="24", volume="10", pages="e63129", keywords="medical education", keywords="artificial intelligence", keywords="clinical decision-making", keywords="GPT-4o", keywords="medical licensing examination", keywords="Japan", keywords="images", keywords="accuracy", keywords="AI technology", keywords="application", keywords="decision-making", keywords="image-based", keywords="reliability", keywords="ChatGPT", doi="10.2196/63129", url="https://mededu.jmir.org/2024/1/e63129" } @Article{info:doi/10.2196/60312, author="Ogundiya, Oluwadamilola and Rahman, Jasmine Thahmina and Valnarov-Boulter, Ioan and Young, Michael Tim", title="Looking Back on Digital Medical Education Over the Last 25 Years and Looking to the Future: Narrative Review", journal="J Med Internet Res", year="2024", month="Dec", day="19", volume="26", pages="e60312", keywords="digital health", keywords="digital medical education", keywords="health education", keywords="medical education", keywords="mobile phone", keywords="artificial intelligence", keywords="AI", abstract="Background: The last 25 years have seen enormous progression in digital technologies across the whole of the health service, including health education. The rapid evolution and use of web-based and digital techniques have been significantly transforming this field since the beginning of the new millennium. These advancements continue to progress swiftly, even more so after the COVID-19 pandemic. Objective: This narrative review aims to outline and discuss the developments that have taken place in digital medical education across the defined time frame. In addition, evidence for potential opportunities and challenges facing digital medical education in the near future was collated for analysis. Methods: Literature reviews were conducted using PubMed, Web of Science Core Collection, Scopus, Google Scholar, and Embase. The participants and learners in this study included medical students, physicians in training or continuing professional development, nurses, paramedics, and patients. 
Results: Evidence of the significant steps in the development of digital medical education in the past 25 years was presented and analyzed in terms of application, impact, and implications for the future. The results were grouped into the following themes for discussion: learning management systems; telemedicine (in digital medical education); mobile health; big data analytics; the metaverse, augmented reality, and virtual reality; the COVID-19 pandemic; artificial intelligence; and ethics and cybersecurity. Conclusions: Major changes and developments in digital medical education have occurred from around the start of the new millennium. Key steps in this journey include technical developments in teleconferencing and learning management systems, along with a marked increase in mobile device use for accessing learning over this time. While the pace of evolution in digital medical education accelerated during the COVID-19 pandemic, further rapid progress has continued since the resolution of the pandemic. Many of these changes are currently being widely used in health education and other fields, such as augmented reality, virtual reality, and artificial intelligence, providing significant future potential. The opportunities these technologies offer must be balanced against the associated challenges in areas such as cybersecurity, the integrity of web-based assessments, ethics, and issues of digital privacy to ensure that digital medical education continues to thrive in the future. ", doi="10.2196/60312", url="https://www.jmir.org/2024/1/e60312" } @Article{info:doi/10.2196/57592, author="Roos, Jonas and Martin, Ron and Kaczmarczyk, Robert", title="Evaluating Bard Gemini Pro and GPT-4 Vision Against Student Performance in Medical Visual Question Answering: Comparative Case Study", journal="JMIR Form Res", year="2024", month="Dec", day="17", volume="8", pages="e57592", keywords="medical education", keywords="visual question answering", keywords="image analysis", keywords="large language model", keywords="LLM", keywords="student", keywords="performance", keywords="comparative", keywords="case study", keywords="artificial intelligence", keywords="AI", keywords="ChatGPT", keywords="effectiveness", keywords="diagnostic", keywords="training", keywords="accuracy", keywords="utility", keywords="image-based", keywords="question", keywords="image", keywords="AMBOSS", keywords="English", keywords="German", keywords="question and answer", keywords="Python", keywords="AI in health care", keywords="health care", abstract="Background: The rapid development of large language models (LLMs) such as OpenAI's ChatGPT has significantly impacted medical research and education. These models have shown potential in fields ranging from radiological imaging interpretation to medical licensing examination assistance. Recently, LLMs have been enhanced with image recognition capabilities. Objective: This study aims to critically examine the effectiveness of these LLMs in medical diagnostics and training by assessing their accuracy and utility in answering image-based questions from medical licensing examinations. Methods: This study analyzed 1070 image-based multiple-choice questions from the AMBOSS learning platform, divided into 605 in English and 465 in German. Customized prompts in both languages directed the models to interpret medical images and provide the most likely diagnosis. 
Student performance data were obtained from AMBOSS, including metrics such as the ``student passed mean'' and ``majority vote.'' Statistical analysis was conducted using Python (Python Software Foundation), with key libraries for data manipulation and visualization. Results: GPT-4 1106 Vision Preview (OpenAI) outperformed Bard Gemini Pro (Google), correctly answering 56.9\% (609/1070) of questions compared to Bard's 44.6\% (477/1070), a statistically significant difference ($\chi$2=32.1, P<.001). However, GPT-4 1106 left 16.1\% (172/1070) of questions unanswered, significantly higher than Bard's 4.1\% (44/1070; $\chi$2=83.1, P<.001). When considering only answered questions, GPT-4 1106's accuracy increased to 67.8\% (609/898), surpassing both Bard (477/1026, 46.5\%; $\chi$2=87.7, P<.001) and the student passed mean of 63\% (674/1070, SE 1.48\%; $\chi$2=4.8, P=.03). Language-specific analysis revealed both models performed better in German than English, with GPT-4 1106 showing greater accuracy in German (282/465, 60.65\% vs 327/605, 54.1\%; $\chi$2=4.4, P=.04) and Bard Gemini Pro exhibiting a similar trend (255/465, 54.8\% vs 222/605, 36.7\%; $\chi$2=34.3, P<.001). The student majority vote achieved an overall accuracy of 94.5\% (1011/1070), significantly outperforming both artificial intelligence models (GPT-4 1106: $\chi$2=408.5, P<.001; Bard Gemini Pro: $\chi$2=626.6, P<.001). Conclusions: Our study shows that GPT-4 1106 Vision Preview and Bard Gemini Pro have potential in medical visual question-answering tasks and to serve as a support for students. However, their performance varies depending on the language used, with a preference for German. They also have limitations in responding to non-English content. The accuracy rates, particularly when compared to student responses, highlight the potential of these models in medical education, yet the need for further optimization and understanding of their limitations in diverse linguistic contexts remains critical. ", doi="10.2196/57592", url="https://formative.jmir.org/2024/1/e57592" } @Article{info:doi/10.2196/51435, author="Dzuali, Fiatsogbe and Seiger, Kira and Novoa, Roberto and Aleshin, Maria and Teng, Joyce and Lester, Jenna and Daneshjou, Roxana", title="ChatGPT May Improve Access to Language-Concordant Care for Patients With Non--English Language Preferences", journal="JMIR Med Educ", year="2024", month="Dec", day="10", volume="10", pages="e51435", keywords="ChatGPT", keywords="artificial intelligence", keywords="language", keywords="translation", keywords="health care disparity", keywords="natural language model", keywords="survey", keywords="patient education", keywords="preference", keywords="human language", keywords="language-concordant care", doi="10.2196/51435", url="https://mededu.jmir.org/2024/1/e51435" } @Article{info:doi/10.2196/57451, author="Jin, Kyung Hye and Kim, EunYoung", title="Performance of GPT-3.5 and GPT-4 on the Korean Pharmacist Licensing Examination: Comparison Study", journal="JMIR Med Educ", year="2024", month="Dec", day="4", volume="10", pages="e57451", keywords="GPT-3.5", keywords="GPT-4", keywords="Korean", keywords="Korean Pharmacist Licensing Examination", keywords="KPLE", abstract="Background: ChatGPT, a recently developed artificial intelligence chatbot and a notable large language model, has demonstrated improved performance on medical field examinations. However, there is currently little research on its efficacy in languages other than English or in pharmacy-related examinations. 
Objective: This study aimed to evaluate the performance of GPT models on the Korean Pharmacist Licensing Examination (KPLE). Methods: We evaluated the percentage of correct answers provided by 2 different versions of ChatGPT (GPT-3.5 and GPT-4) for all multiple-choice single-answer KPLE questions, excluding image-based questions. In total, 320, 317, and 323 questions from the 2021, 2022, and 2023 KPLEs, respectively, were included in the final analysis, which consisted of 4 units: Biopharmacy, Industrial Pharmacy, Clinical and Practical Pharmacy, and Medical Health Legislation. Results: The 3-year average percentage of correct answers was 86.5\% (830/960) for GPT-4 and 60.7\% (583/960) for GPT-3.5. GPT model accuracy was highest in Biopharmacy (GPT-3.5 77/96, 80.2\% in 2022; GPT-4 87/90, 96.7\% in 2021) and lowest in Medical Health Legislation (GPT-3.5 8/20, 40\% in 2022; GPT-4 12/20, 60\% in 2022). Additionally, when comparing the performance of artificial intelligence with that of human participants, pharmacy students outperformed GPT-3.5 but not GPT-4. Conclusions: In the last 3 years, GPT models have performed very close to or exceeded the passing threshold for the KPLE. This study demonstrates the potential of large language models in the pharmacy domain; however, extensive research is needed to evaluate their reliability and ensure their secure application in pharmacy contexts due to several inherent challenges. Addressing these limitations could make GPT models more effective auxiliary tools for pharmacy education. ", doi="10.2196/57451", url="https://mededu.jmir.org/2024/1/e57451" } @Article{info:doi/10.2196/63188, author="Luo, Yuan and Miao, Yiqun and Zhao, Yuhan and Li, Jiawei and Chen, Yuling and Yue, Yuexue and Wu, Ying", title="Comparing the Accuracy of Two Generated Large Language Models in Identifying Health-Related Rumors or Misconceptions and the Applicability in Health Science Popularization: Proof-of-Concept Study", journal="JMIR Form Res", year="2024", month="Dec", day="2", volume="8", pages="e63188", keywords="rumor", keywords="misconception", keywords="health science popularization", keywords="health education", keywords="large language model", keywords="LLM", keywords="applicability", keywords="accuracy", keywords="effectiveness", keywords="health related", keywords="education", keywords="health science", keywords="proof of concept", abstract="Background: Health-related rumors and misconceptions are spreading at an alarming rate, fueled by the rapid development of the internet and the exponential growth of social media platforms. This phenomenon has become a pressing global concern, as the dissemination of false information can have severe consequences, including widespread panic, social instability, and even public health crises. Objective: The aim of the study is to compare the accuracy of rumor identification and the effectiveness of health science popularization between 2 generated large language models in Chinese (GPT-4 by OpenAI and Enhanced Representation through Knowledge Integration Bot [ERNIE Bot] 4.0 by Baidu). Methods: In total, 20 health rumors and misconceptions, along with 10 health truths, were randomly inputted into GPT-4 and ERNIE Bot 4.0. We prompted them to determine whether the statements were rumors or misconceptions and provide explanations for their judgment. Further, we asked them to generate a health science popularization essay. We evaluated the outcomes in terms of accuracy, effectiveness, readability, and applicability. 
Accuracy was assessed by the rate of correctly identifying health-related rumors, misconceptions, and truths. Effectiveness was determined by the accuracy of the generated explanation, which was assessed collaboratively by 2 research team members with a PhD in nursing. Readability was calculated by the readability formula of Chinese health education materials. Applicability was evaluated by the Chinese Suitability Assessment of Materials. Results: GPT-4 and ERNIE Bot 4.0 correctly identified all health rumors and misconceptions (100\% accuracy rate). For truths, the accuracy rate was 70\% (7/10) and 100\% (10/10), respectively. Both mostly provided widely recognized viewpoints without obvious errors. The average readability score for the health essays was 2.92 (SD 0.85) for GPT-4 and 3.02 (SD 0.84) for ERNIE Bot 4.0 (P=.65). For applicability, except for the content and cultural appropriateness category, significant differences were observed in the total score and scores in other dimensions between them (P<.05). Conclusions: ERNIE Bot 4.0 demonstrated similar accuracy to GPT-4 in identifying Chinese rumors. Both provided widely accepted views, despite some inaccuracies. These insights enhance understanding and correct misunderstandings. For health essays, educators can learn from readable language styles of GLLMs. Finally, ERNIE Bot 4.0 aligns with Chinese expression habits, making it a good choice for a better Chinese reading experience. ", doi="10.2196/63188", url="https://formative.jmir.org/2024/1/e63188" } @Article{info:doi/10.2196/51433, author="Ehrett, Carl and Hegde, Sudeep and Andre, Kwame and Liu, Dixizi and Wilson, Timothy", title="Leveraging Open-Source Large Language Models for Data Augmentation in Hospital Staff Surveys: Mixed Methods Study", journal="JMIR Med Educ", year="2024", month="Nov", day="19", volume="10", pages="e51433", keywords="data augmentation", keywords="large language models", keywords="medical education", keywords="natural language processing", keywords="data security", keywords="ethics", keywords="AI", keywords="artificial intelligence", keywords="data privacy", keywords="medical staff", abstract="Background: Generative large language models (LLMs) have the potential to revolutionize medical education by generating tailored learning materials, enhancing teaching efficiency, and improving learner engagement. However, the application of LLMs in health care settings, particularly for augmenting small datasets in text classification tasks, remains underexplored, particularly for cost- and privacy-conscious applications that do not permit the use of third-party services such as OpenAI's ChatGPT. Objective: This study aims to explore the use of open-source LLMs, such as Large Language Model Meta AI (LLaMA) and Alpaca models, for data augmentation in a specific text classification task related to hospital staff surveys. Methods: The surveys were designed to elicit narratives of everyday adaptation by frontline radiology staff during the initial phase of the COVID-19 pandemic. A 2-step process of data augmentation and text classification was conducted. The study generated synthetic data similar to the survey reports using 4 generative LLMs for data augmentation. A different set of 3 classifier LLMs was then used to classify the augmented text for thematic categories. The study evaluated performance on the classification task. 
Results: The overall best-performing combination of LLMs, temperature, classifier, and number of synthetic data cases was augmentation with LLaMA 7B at temperature 0.7 with 100 augments, using Robustly Optimized BERT Pretraining Approach (RoBERTa) for the classification task, achieving an average area under the receiver operating characteristic (AUC) curve of 0.87 (SD 0.02; ie, 1 SD). The results demonstrate that open-source LLMs can enhance text classifiers' performance for small datasets in health care contexts, providing promising pathways for improving medical education processes and patient care practices. Conclusions: The study demonstrates the value of data augmentation with open-source LLMs, highlights the importance of privacy and ethical considerations when using LLMs, and suggests future directions for research in this field. ", doi="10.2196/51433", url="https://mededu.jmir.org/2024/1/e51433" } @Article{info:doi/10.2196/54297, author="Zhou, You and Li, Si-Jia and Tang, Xing-Yi and He, Yi-Chen and Ma, Hao-Ming and Wang, Ao-Qi and Pei, Run-Yuan and Piao, Mei-Hua", title="Using ChatGPT in Nursing: Scoping Review of Current Opinions", journal="JMIR Med Educ", year="2024", month="Nov", day="19", volume="10", pages="e54297", keywords="ChatGPT", keywords="large language model", keywords="nursing", keywords="artificial intelligence", keywords="scoping review", keywords="generative AI", keywords="nursing education", abstract="Background: Since the release of ChatGPT in November 2022, this emerging technology has garnered a lot of attention in various fields, and nursing is no exception. However, to date, no study has comprehensively summarized the status and opinions of using ChatGPT across different nursing fields. Objective: We aim to synthesize the status and opinions of using ChatGPT according to different nursing fields, as well as assess ChatGPT's strengths, weaknesses, and the potential impacts it may cause. Methods: This scoping review was conducted following the framework of Arksey and O'Malley and guided by the PRISMA-ScR (Preferred Reporting Items for Systematic Reviews and Meta-Analyses extension for Scoping Reviews). A comprehensive literature search was conducted in 4 web-based databases (PubMed, Embase, Web of Science, and CINAHL) to identify studies reporting the opinions of using ChatGPT in nursing fields from 2022 to September 3, 2023. The references of the included studies were screened manually to further identify relevant studies. Two authors independently conducted study screening, eligibility assessments, and data extraction. Results: A total of 30 studies were included. The United States (7 studies), Canada (5 studies), and China (4 studies) were the countries with the most publications. In terms of fields of concern, studies mainly focused on ``ChatGPT and nursing education'' (20 studies), ``ChatGPT and nursing practice'' (10 studies), and ``ChatGPT and nursing research, writing, and examination'' (6 studies). Six studies addressed the use of ChatGPT in multiple nursing fields. Conclusions: As an emerging artificial intelligence technology, ChatGPT has great potential to revolutionize nursing education, nursing practice, and nursing research. However, researchers, institutions, and administrations still need to critically examine its accuracy, safety, and privacy, as well as academic misconduct and potential ethical issues that it may lead to before applying ChatGPT to practice. 
", doi="10.2196/54297", url="https://mededu.jmir.org/2024/1/e54297" } @Article{info:doi/10.2196/56762, author="Ros-Arlanz{\'o}n, Pablo and Perez-Sempere, Angel", title="Evaluating AI Competence in Specialized Medicine: Comparative Analysis of ChatGPT and Neurologists in a Neurology Specialist Examination in Spain", journal="JMIR Med Educ", year="2024", month="Nov", day="14", volume="10", pages="e56762", keywords="artificial intelligence", keywords="ChatGPT", keywords="clinical decision-making", keywords="medical education", keywords="medical knowledge assessment", keywords="OpenAI", abstract="Background: With the rapid advancement of artificial intelligence (AI) in various fields, evaluating its application in specialized medical contexts becomes crucial. ChatGPT, a large language model developed by OpenAI, has shown potential in diverse applications, including medicine. Objective: This study aims to compare the performance of ChatGPT with that of attending neurologists in a real neurology specialist examination conducted in the Valencian Community, Spain, assessing the AI's capabilities and limitations in medical knowledge. Methods: We conducted a comparative analysis using the 2022 neurology specialist examination results from 120 neurologists and responses generated by ChatGPT versions 3.5 and 4. The examination consisted of 80 multiple-choice questions, with a focus on clinical neurology and health legislation. Questions were classified according to Bloom's Taxonomy. Statistical analysis of performance, including the $\kappa$ coefficient for response consistency, was performed. Results: Human participants exhibited a median score of 5.91 (IQR: 4.93-6.76), with 32 neurologists failing to pass. ChatGPT-3.5 ranked 116th out of 122, answering 54.5\% of questions correctly (score 3.94). ChatGPT-4 showed marked improvement, ranking 17th with 81.8\% of correct answers (score 7.57), surpassing several human specialists. No significant variations were observed in the performance on lower-order questions versus higher-order questions. Additionally, ChatGPT-4 demonstrated increased interrater reliability, as reflected by a higher $\kappa$ coefficient of 0.73, compared to ChatGPT-3.5's coefficient of 0.69. Conclusions: This study underscores the evolving capabilities of AI in medical knowledge assessment, particularly in specialized fields. ChatGPT-4's performance, outperforming the median score of human participants in a rigorous neurology examination, represents a significant milestone in AI development, suggesting its potential as an effective tool in specialized medical education and assessment. 
", doi="10.2196/56762", url="https://mededu.jmir.org/2024/1/e56762" } @Article{info:doi/10.2196/60226, author="Ming, Shuai and Yao, Xi and Guo, Xiaohong and Guo, Qingge and Xie, Kunpeng and Chen, Dandan and Lei, Bo", title="Performance of ChatGPT in Ophthalmic Registration and Clinical Diagnosis: Cross-Sectional Study", journal="J Med Internet Res", year="2024", month="Nov", day="14", volume="26", pages="e60226", keywords="artificial intelligence", keywords="chatbot", keywords="ChatGPT", keywords="ophthalmic registration", keywords="clinical diagnosis", keywords="AI", keywords="cross-sectional study", keywords="eye disease", keywords="eye disorder", keywords="ophthalmology", keywords="health care", keywords="outpatient registration", keywords="clinical", keywords="decision-making", keywords="generative AI", keywords="vision impairment", abstract="Background: Artificial intelligence (AI) chatbots such as ChatGPT are expected to impact vision health care significantly. Their potential to optimize the consultation process and diagnostic capabilities across range of ophthalmic subspecialties have yet to be fully explored. Objective: This study aims to investigate the performance of AI chatbots in recommending ophthalmic outpatient registration and diagnosing eye diseases within clinical case profiles. Methods: This cross-sectional study used clinical cases from Chinese Standardized Resident Training--Ophthalmology (2nd Edition). For each case, 2 profiles were created: patient with history (Hx) and patient with history and examination (Hx+Ex). These profiles served as independent queries for GPT-3.5 and GPT-4.0 (accessed from March 5 to 18, 2024). Similarly, 3 ophthalmic residents were posed the same profiles in a questionnaire format. The accuracy of recommending ophthalmic subspecialty registration was primarily evaluated using Hx profiles. The accuracy of the top-ranked diagnosis and the accuracy of the diagnosis within the top 3 suggestions (do-not-miss diagnosis) were assessed using Hx+Ex profiles. The gold standard for judgment was the published, official diagnosis. Characteristics of incorrect diagnoses by ChatGPT were also analyzed. Results: A total of 208 clinical profiles from 12 ophthalmic subspecialties were analyzed (104 Hx and 104 Hx+Ex profiles). For Hx profiles, GPT-3.5, GPT-4.0, and residents showed comparable accuracy in registration suggestions (66/104, 63.5\%; 81/104, 77.9\%; and 72/104, 69.2\%, respectively; P=.07), with ocular trauma, retinal diseases, and strabismus and amblyopia achieving the top 3 accuracies. For Hx+Ex profiles, both GPT-4.0 and residents demonstrated higher diagnostic accuracy than GPT-3.5 (62/104, 59.6\% and 63/104, 60.6\% vs 41/104, 39.4\%; P=.003 and P=.001, respectively). Accuracy for do-not-miss diagnoses also improved (79/104, 76\% and 68/104, 65.4\% vs 51/104, 49\%; P<.001 and P=.02, respectively). The highest diagnostic accuracies were observed in glaucoma; lens diseases; and eyelid, lacrimal, and orbital diseases. GPT-4.0 recorded fewer incorrect top-3 diagnoses (25/42, 60\% vs 53/63, 84\%; P=.005) and more partially correct diagnoses (21/42, 50\% vs 7/63 11\%; P<.001) than GPT-3.5, while GPT-3.5 had more completely incorrect (27/63, 43\% vs 7/42, 17\%; P=.005) and less precise diagnoses (22/63, 35\% vs 5/42, 12\%; P=.009). Conclusions: GPT-3.5 and GPT-4.0 showed intermediate performance in recommending ophthalmic subspecialties for registration. 
While GPT-3.5 underperformed, GPT-4.0 approached and numerically surpassed residents in differential diagnosis. AI chatbots show promise in facilitating ophthalmic patient registration. However, their integration into diagnostic decision-making requires more validation. ", doi="10.2196/60226", url="https://www.jmir.org/2024/1/e60226" } @Article{info:doi/10.2196/63430, author="Bicknell, T. Brenton and Butler, Danner and Whalen, Sydney and Ricks, James and Dixon, J. Cory and Clark, B. Abigail and Spaedy, Olivia and Skelton, Adam and Edupuganti, Neel and Dzubinski, Lance and Tate, Hudson and Dyess, Garrett and Lindeman, Brenessa and Lehmann, Soleymani Lisa", title="ChatGPT-4 Omni Performance in USMLE Disciplines and Clinical Skills: Comparative Analysis", journal="JMIR Med Educ", year="2024", month="Nov", day="6", volume="10", pages="e63430", keywords="large language model", keywords="ChatGPT", keywords="medical education", keywords="USMLE", keywords="AI in medical education", keywords="medical student resources", keywords="educational technology", keywords="artificial intelligence in medicine", keywords="clinical skills", keywords="LLM", keywords="medical licensing examination", keywords="medical students", keywords="United States Medical Licensing Examination", keywords="ChatGPT 4 Omni", keywords="ChatGPT 4", keywords="ChatGPT 3.5", abstract="Background: Recent studies, including those by the National Board of Medical Examiners, have highlighted the remarkable capabilities of recent large language models (LLMs) such as ChatGPT in passing the United States Medical Licensing Examination (USMLE). However, there is a gap in detailed analysis of LLM performance in specific medical content areas, thus limiting an assessment of their potential utility in medical education. Objective: This study aimed to assess and compare the accuracy of successive ChatGPT versions (GPT-3.5, GPT-4, and GPT-4 Omni) in USMLE disciplines, clinical clerkships, and the clinical skills of diagnostics and management. Methods: This study used 750 clinical vignette-based multiple-choice questions to characterize the performance of successive ChatGPT versions (ChatGPT 3.5 [GPT-3.5], ChatGPT 4 [GPT-4], and ChatGPT 4 Omni [GPT-4o]) across USMLE disciplines, clinical clerkships, and in clinical skills (diagnostics and management). Accuracy was assessed using a standardized protocol, with statistical analyses conducted to compare the models' performances. Results: GPT-4o achieved the highest accuracy across 750 multiple-choice questions at 90.4\%, outperforming GPT-4 and GPT-3.5, which scored 81.1\% and 60.0\%, respectively. GPT-4o's highest performances were in social sciences (95.5\%), behavioral and neuroscience (94.2\%), and pharmacology (93.2\%). In clinical skills, GPT-4o's diagnostic accuracy was 92.7\% and management accuracy was 88.8\%, significantly higher than its predecessors. Notably, both GPT-4o and GPT-4 significantly outperformed the medical student average accuracy of 59.3\% (95\% CI 58.3?60.3). Conclusions: GPT-4o's performance in USMLE disciplines, clinical clerkships, and clinical skills indicates substantial improvements over its predecessors, suggesting significant potential for the use of this technology as an educational aid for medical students. 
These findings underscore the need for careful consideration when integrating LLMs into medical education, emphasizing the importance of structured curricula to guide their appropriate use and the need for ongoing critical analyses to ensure their reliability and effectiveness. ", doi="10.2196/63430", url="https://mededu.jmir.org/2024/1/e63430" } @Article{info:doi/10.2196/51446, author="Alli, Rabia Sauliha and Hossain, Qahh?r Soaad and Das, Sunit and Upshur, Ross", title="The Potential of Artificial Intelligence Tools for Reducing Uncertainty in Medicine and Directions for Medical Education", journal="JMIR Med Educ", year="2024", month="Nov", day="4", volume="10", pages="e51446", keywords="artificial intelligence", keywords="machine learning", keywords="uncertainty", keywords="clinical decision-making", keywords="medical education", keywords="generative AI", keywords="generative artificial intelligence", doi="10.2196/51446", url="https://mededu.jmir.org/2024/1/e51446" } @Article{info:doi/10.2196/57132, author="Tao, Wenjuan and Yang, Jinming and Qu, Xing", title="Utilization of, Perceptions on, and Intention to Use AI Chatbots Among Medical Students in China: National Cross-Sectional Study", journal="JMIR Med Educ", year="2024", month="Oct", day="28", volume="10", pages="e57132", keywords="medical education", keywords="artificial intelligence", keywords="UTAUT model", keywords="utilization", keywords="medical students", keywords="cross-sectional study", keywords="AI chatbots", keywords="China", keywords="acceptance", keywords="electronic survey", keywords="social media", keywords="medical information", keywords="risk", keywords="training", keywords="support", abstract="Background: Artificial intelligence (AI) chatbots are poised to have a profound impact on medical education. Medical students, as early adopters of technology and future health care providers, play a crucial role in shaping the future of health care. However, little is known about the utilization of, perceptions on, and intention to use AI chatbots among medical students in China. Objective: This study aims to explore the utilization of, perceptions on, and intention to use generative AI chatbots among medical students in China, using the Unified Theory of Acceptance and Use of Technology (UTAUT) framework. By conducting a national cross-sectional survey, we sought to identify the key determinants that influence medical students' acceptance of AI chatbots, thereby providing a basis for enhancing their integration into medical education. Understanding these factors is crucial for educators, policy makers, and technology developers to design and implement effective AI-driven educational tools that align with the needs and expectations of future health care professionals. Methods: A web-based electronic survey questionnaire was developed and distributed via social media to medical students across the country. The UTAUT was used as a theoretical framework to design the questionnaire and analyze the data. The relationship between behavioral intention to use AI chatbots and UTAUT predictors was examined using multivariable regression. Results: A total of 693 participants were from 57 universities covering 21 provinces or municipalities in China. Only a minority (199/693, 28.72\%) reported using AI chatbots for studying, with ChatGPT (129/693, 18.61\%) being the most commonly used. 
Most of the participants used AI chatbots for quickly obtaining medical information and knowledge (631/693, 91.05\%) and increasing learning efficiency (594/693, 85.71\%). Utilization behavior, social influence, facilitating conditions, perceived risk, and personal innovativeness showed significant positive associations with the behavioral intention to use AI chatbots (all P values were <.05). Conclusions: Chinese medical students hold positive perceptions toward and high intentions to use AI chatbots, but there are gaps between intention and actual adoption. This highlights the need for strategies to improve access, training, and support and provide peer usage examples to fully harness the potential benefits of chatbot technology. ", doi="10.2196/57132", url="https://mededu.jmir.org/2024/1/e57132" } @Article{info:doi/10.2196/51411, author="Wang, Shuang and Yang, Liuying and Li, Min and Zhang, Xinghe and Tai, Xiantao", title="Medical Education and Artificial Intelligence: Web of Science--Based Bibliometric Analysis (2013-2022)", journal="JMIR Med Educ", year="2024", month="Oct", day="10", volume="10", pages="e51411", keywords="artificial intelligence", keywords="medical education", keywords="bibliometric analysis", keywords="CiteSpace", keywords="VOSviewer", abstract="Background: Incremental advancements in artificial intelligence (AI) technology have facilitated its integration into various disciplines. In particular, the infusion of AI into medical education has emerged as a significant trend, with noteworthy research findings. Consequently, a comprehensive review and analysis of the current research landscape of AI in medical education is warranted. Objective: This study aims to conduct a bibliometric analysis of pertinent papers, spanning the years 2013-2022, using CiteSpace and VOSviewer. The study visually represents the existing research status and trends of AI in medical education. Methods: Articles related to AI and medical education, published between 2013 and 2022, were systematically searched in the Web of Science core database. Two reviewers scrutinized the initially retrieved papers, based on their titles and abstracts, to eliminate papers unrelated to the topic. The selected papers were then analyzed and visualized for country, institution, author, reference, and keywords using CiteSpace and VOSviewer. Results: A total of 195 papers pertaining to AI in medical education were identified from 2013 to 2022. The annual publications demonstrated an increasing trend over time. The United States emerged as the most active country in this research arena, and Harvard Medical School and the University of Toronto were the most active institutions. Prolific authors in this field included Vincent Bissonnette, Charlotte Blacketer, Rolando F Del Maestro, Nicole Ledows, Nykan Mirchi, Alexander Winkler-Schwartz, and Recai Yilamaz. The paper with the highest citation was ``Medical Students' Attitude Towards Artificial Intelligence: A Multicentre Survey.'' Keyword analysis revealed that ``radiology,'' ``medical physics,'' ``ehealth,'' ``surgery,'' and ``specialty'' were the primary focus, whereas ``big data'' and ``management'' emerged as research frontiers. Conclusions: The study underscores the promising potential of AI in medical education research. Current research directions encompass radiology, medical information management, and other aspects. Technological progress is expected to broaden these directions further. 
There is an urgent need to bolster interregional collaboration and enhance research quality. These findings offer valuable insights for researchers to identify perspectives and guide future research directions. ", doi="10.2196/51411", url="https://mededu.jmir.org/2024/1/e51411" } @Article{info:doi/10.2196/57157, author="Miao, Jing and Thongprayoon, Charat and Garcia Valencia, Oscar and Craici, M. Iasmina and Cheungpasitporn, Wisit", title="Navigating Nephrology's Decline Through a GPT-4 Analysis of Internal Medicine Specialties in the United States: Qualitative Study", journal="JMIR Med Educ", year="2024", month="Oct", day="10", volume="10", pages="e57157", keywords="artificial intelligence", keywords="ChatGPT", keywords="nephrology fellowship training", keywords="fellowship matching", keywords="medical education", keywords="AI", keywords="nephrology", keywords="fellowship", keywords="United States", keywords="factor", keywords="chatbots", keywords="intellectual", keywords="complexity", keywords="work-life balance", keywords="procedural involvement", keywords="opportunity", keywords="career demand", keywords="financial compensation", abstract="Background: The 2024 Nephrology fellowship match data show the declining interest in nephrology in the United States, with an 11\% drop in candidates and a mere 66\% (321/488) of positions filled. Objective: The study aims to discern the factors influencing this trend using ChatGPT, a leading chatbot model, for insights into the comparative appeal of nephrology versus other internal medicine specialties. Methods: Using the GPT-4 model, the study compared nephrology with 13 other internal medicine specialties, evaluating each on 7 criteria including intellectual complexity, work-life balance, procedural involvement, research opportunities, patient relationships, career demand, and financial compensation. Each criterion was assigned scores from 1 to 10, with the cumulative score determining the ranking. The approach included counteracting potential bias by instructing GPT-4 to favor other specialties over nephrology in reverse scenarios. Results: GPT-4 ranked nephrology only above sleep medicine. While nephrology scored higher than hospice and palliative medicine, it fell short in key criteria such as work-life balance, patient relationships, and career demand. When examining the percentage of filled positions in the 2024 appointment year match, nephrology's filled rate was 66\%, only higher than the 45\% (155/348) filled rate of geriatric medicine. Nephrology's score decreased by 4\%-14\% in 5 criteria including intellectual challenge and complexity, procedural involvement, career opportunity and demand, research and academic opportunities, and financial compensation. Conclusions: ChatGPT does not favor nephrology over most internal medicine specialties, highlighting its diminishing appeal as a career choice. This trend raises significant concerns, especially considering the overall physician shortage, and prompts a reevaluation of factors affecting specialty choice among medical residents. 
", doi="10.2196/57157", url="https://mededu.jmir.org/2024/1/e57157" } @Article{info:doi/10.2196/56128, author="Goodings, James Anthony and Kajitani, Sten and Chhor, Allison and Albakri, Ahmad and Pastrak, Mila and Kodancha, Megha and Ives, Rowan and Lee, Bin Yoo and Kajitani, Kari", title="Assessment of ChatGPT-4 in Family Medicine Board Examinations Using Advanced AI Learning and Analytical Methods: Observational Study", journal="JMIR Med Educ", year="2024", month="Oct", day="8", volume="10", pages="e56128", keywords="ChatGPT-4", keywords="Family Medicine Board Examination", keywords="artificial intelligence in medical education", keywords="AI performance assessment", keywords="prompt engineering", keywords="ChatGPT", keywords="artificial intelligence", keywords="AI", keywords="medical education", keywords="assessment", keywords="observational", keywords="analytical method", keywords="data analysis", keywords="examination", abstract="Background: This research explores the capabilities of ChatGPT-4 in passing the American Board of Family Medicine (ABFM) Certification Examination. Addressing a gap in existing literature, where earlier artificial intelligence (AI) models showed limitations in medical board examinations, this study evaluates the enhanced features and potential of ChatGPT-4, especially in document analysis and information synthesis. Objective: The primary goal is to assess whether ChatGPT-4, when provided with extensive preparation resources and when using sophisticated data analysis, can achieve a score equal to or above the passing threshold for the Family Medicine Board Examinations. Methods: In this study, ChatGPT-4 was embedded in a specialized subenvironment, ``AI Family Medicine Board Exam Taker,'' designed to closely mimic the conditions of the ABFM Certification Examination. This subenvironment enabled the AI to access and analyze a range of relevant study materials, including a primary medical textbook and supplementary web-based resources. The AI was presented with a series of ABFM-type examination questions, reflecting the breadth and complexity typical of the examination. Emphasis was placed on assessing the AI's ability to interpret and respond to these questions accurately, leveraging its advanced data processing and analysis capabilities within this controlled subenvironment. Results: In our study, ChatGPT-4's performance was quantitatively assessed on 300 practice ABFM examination questions. The AI achieved a correct response rate of 88.67\% (95\% CI 85.08\%-92.25\%) for the Custom Robot version and 87.33\% (95\% CI 83.57\%-91.10\%) for the Regular version. Statistical analysis, including the McNemar test (P=.45), indicated no significant difference in accuracy between the 2 versions. In addition, the chi-square test for error-type distribution (P=.32) revealed no significant variation in the pattern of errors across versions. These results highlight ChatGPT-4's capacity for high-level performance and consistency in responding to complex medical examination questions under controlled conditions. Conclusions: The study demonstrates that ChatGPT-4, particularly when equipped with specialized preparation and when operating in a tailored subenvironment, shows promising potential in handling the intricacies of medical board examinations. While its performance is comparable with the expected standards for passing the ABFM Certification Examination, further enhancements in AI technology and tailored training methods could push these capabilities to new heights. 
This exploration opens avenues for integrating AI tools such as ChatGPT-4 in medical education and assessment, emphasizing the importance of continuous advancement and specialized training in medical applications of AI. ", doi="10.2196/56128", url="https://mededu.jmir.org/2024/1/e56128" } @Article{info:doi/10.2196/51383, author="Choi, K. Yong and Lin, Shih-Yin and Fick, Marie Donna and Shulman, W. Richard and Lee, Sangil and Shrestha, Priyanka and Santoso, Kate", title="Optimizing ChatGPT's Interpretation and Reporting of Delirium Assessment Outcomes: Exploratory Study", journal="JMIR Form Res", year="2024", month="Oct", day="1", volume="8", pages="e51383", keywords="generative artificial intelligence", keywords="generative AI", keywords="large language models", keywords="ChatGPT", keywords="delirium detection", keywords="Sour Seven Questionnaire", keywords="prompt engineering", keywords="clinical vignettes", keywords="medical education", keywords="caregiver education", abstract="Background: Generative artificial intelligence (AI) and large language models, such as OpenAI's ChatGPT, have shown promising potential in supporting medical education and clinical decision-making, given their vast knowledge base and natural language processing capabilities. As a general-purpose AI system, ChatGPT can complete a wide range of tasks, including differential diagnosis, without additional training. However, the specific application of ChatGPT in learning and applying a series of specialized, context-specific tasks mimicking the workflow of a human assessor, such as administering a standardized assessment questionnaire, followed by inputting assessment results in a standardized form, and interpreting assessment results strictly following credible, published scoring criteria, has not been thoroughly studied. Objective: This exploratory study aims to evaluate and optimize ChatGPT's capabilities in administering and interpreting the Sour Seven Questionnaire, an informant-based delirium assessment tool. Specifically, the objectives were to train ChatGPT-3.5 and ChatGPT-4 to understand and correctly apply the Sour Seven Questionnaire to clinical vignettes using prompt engineering, assess the performance of these AI models in identifying and scoring delirium symptoms against scores from human experts, and refine and enhance the models' interpretation and reporting accuracy through iterative prompt optimization. Methods: We used prompt engineering to train ChatGPT-3.5 and ChatGPT-4 models on the Sour Seven Questionnaire, a tool for assessing delirium through caregiver input. Prompt engineering is a methodology used to enhance the AI's processing of inputs by meticulously structuring the prompts to improve accuracy and consistency in outputs. In this study, prompt engineering involved creating specific, structured commands that guided the AI models in understanding and applying the assessment tool's criteria accurately to clinical vignettes. This approach also included designing prompts to explicitly instruct the AI on how to format its responses, ensuring they were consistent with clinical documentation standards. Results: Both ChatGPT models demonstrated promising proficiency in applying the Sour Seven Questionnaire to the vignettes, despite initial inconsistencies and errors. Performance notably improved through iterative prompt engineering, enhancing the models' capacity to detect delirium symptoms and assign scores. 
Prompt optimizations included adjusting the scoring methodology to accept only definitive ``Yes'' or ``No'' responses, revising the evaluation prompt to mandate responses in a tabular format, and guiding the models to adhere to the 2 recommended actions specified in the Sour Seven Questionnaire. Conclusions: Our findings provide preliminary evidence supporting the potential utility of AI models such as ChatGPT in administering standardized clinical assessment tools. The results highlight the significance of context-specific training and prompt engineering in harnessing the full potential of these AI models for health care applications. Despite the encouraging results, broader generalizability and further validation in real-world settings warrant additional research. ", doi="10.2196/51383", url="https://formative.jmir.org/2024/1/e51383" } @Article{info:doi/10.2196/52346, author="Claman, Daniel and Sezgin, Emre", title="Artificial Intelligence in Dental Education: Opportunities and Challenges of Large Language Models and Multimodal Foundation Models", journal="JMIR Med Educ", year="2024", month="Sep", day="27", volume="10", pages="e52346", keywords="artificial intelligence", keywords="large language models", keywords="dental education", keywords="GPT", keywords="ChatGPT", keywords="periodontal health", keywords="AI", keywords="LLM", keywords="LLMs", keywords="chatbot", keywords="natural language", keywords="generative pretrained transformer", keywords="innovation", keywords="technology", keywords="large language model", doi="10.2196/52346", url="https://mededu.jmir.org/2024/1/e52346" } @Article{info:doi/10.2196/58753, author="Yamamoto, Akira and Koda, Masahide and Ogawa, Hiroko and Miyoshi, Tomoko and Maeda, Yoshinobu and Otsuka, Fumio and Ino, Hideo", title="Enhancing Medical Interview Skills Through AI-Simulated Patient Interactions: Nonrandomized Controlled Trial", journal="JMIR Med Educ", year="2024", month="Sep", day="23", volume="10", pages="e58753", keywords="medical interview", keywords="generative pretrained transformer", keywords="large language model", keywords="simulation-based learning", keywords="OSCE", keywords="artificial intelligence", keywords="medical education", keywords="simulated patients", keywords="nonrandomized controlled trial", abstract="Background: Medical interviewing is a critical skill in clinical practice, yet opportunities for practical training are limited in Japanese medical schools, necessitating urgent measures. Given advancements in artificial intelligence (AI) technology, its application in the medical field is expanding. However, reports on its application in medical interviews in medical education are scarce. Objective: This study aimed to investigate whether medical students' interview skills could be improved by engaging with AI-simulated patients using large language models, including the provision of feedback. Methods: This nonrandomized controlled trial was conducted with fourth-year medical students in Japan. A simulation program using large language models was provided to 35 students in the intervention group in 2023, while 110 students from 2022 who did not participate in the intervention were selected as the control group. The primary outcome was the score on the Pre-Clinical Clerkship Objective Structured Clinical Examination (pre-CC OSCE), a national standardized clinical skills examination, in medical interviewing. 
Secondary outcomes included surveys such as the Simulation-Based Training Quality Assurance Tool (SBT-QA10), administered at the start and end of the study. Results: The AI intervention group showed significantly higher scores on medical interviews than the control group (AI group vs control group: mean 28.1, SD 1.6 vs 27.1, SD 2.2; P=.01). There was a trend of inverse correlation between the SBT-QA10 and pre-CC OSCE scores (regression coefficient --2.0 to --2.1). No significant safety concerns were observed. Conclusions: Education through medical interviews using AI-simulated patients has demonstrated safety and a certain level of educational effectiveness. However, at present, the educational effects of this platform on nonverbal communication skills are limited, suggesting that it should be used as a supplementary tool to traditional simulation education. ", doi="10.2196/58753", url="https://mededu.jmir.org/2024/1/e58753", url="http://www.ncbi.nlm.nih.gov/pubmed/39312284" } @Article{info:doi/10.2196/56859, author="Yoon, Soo-Hyuk and Oh, Kyeong Seok and Lim, Gun Byung and Lee, Ho-Jin", title="Performance of ChatGPT in the In-Training Examination for Anesthesiology and Pain Medicine Residents in South Korea: Observational Study", journal="JMIR Med Educ", year="2024", month="Sep", day="16", volume="10", pages="e56859", keywords="AI tools", keywords="problem solving", keywords="anesthesiology", keywords="artificial intelligence", keywords="pain medicine", keywords="ChatGPT", keywords="health care", keywords="medical education", keywords="South Korea", abstract="Background: ChatGPT has been tested in health care, including the US Medical Licensing Examination and specialty exams, showing near-passing results. Its performance in the field of anesthesiology has been assessed using English board examination questions; however, its effectiveness in Korea remains unexplored. Objective: This study investigated the problem-solving performance of ChatGPT in the fields of anesthesiology and pain medicine in the Korean language context, highlighted advancements in artificial intelligence (AI), and explored its potential applications in medical education. Methods: We investigated the performance (number of correct answers/number of questions) of GPT-4, GPT-3.5, and CLOVA X in the fields of anesthesiology and pain medicine, using in-training examinations that have been administered to Korean anesthesiology residents over the past 5 years, with an annual composition of 100 questions. Questions containing images, diagrams, or photographs were excluded from the analysis. Furthermore, to assess the performance differences of the GPT across different languages, we conducted a comparative analysis of the GPT-4's problem-solving proficiency using both the original Korean texts and their English translations. Results: A total of 398 questions were analyzed. GPT-4 (67.8\%) demonstrated a significantly better overall performance than GPT-3.5 (37.2\%) and CLOVA-X (36.7\%). However, GPT-3.5 and CLOVA X did not show significant differences in their overall performance. Additionally, the GPT-4 showed superior performance on questions translated into English, indicating a language processing discrepancy (English: 75.4\% vs Korean: 67.8\%; difference 7.5\%; 95\% CI 3.1\%-11.9\%; P=.001). Conclusions: This study underscores the potential of AI tools, such as ChatGPT, in medical education and practice but emphasizes the need for cautious application and further refinement, especially in non-English medical contexts. 
The findings suggest that although AI advancements are promising, they require careful evaluation and development to ensure acceptable performance across diverse linguistic and professional settings. ", doi="10.2196/56859", url="https://mededu.jmir.org/2024/1/e56859" } @Article{info:doi/10.2196/59213, author="Holderried, Friederike and Stegemann-Philipps, Christian and Herrmann-Werner, Anne and Festl-Wietek, Teresa and Holderried, Martin and Eickhoff, Carsten and Mahling, Moritz", title="A Language Model--Powered Simulated Patient With Automated Feedback for History Taking: Prospective Study", journal="JMIR Med Educ", year="2024", month="Aug", day="16", volume="10", pages="e59213", keywords="virtual patients communication", keywords="communication skills", keywords="technology enhanced education", keywords="TEL", keywords="medical education", keywords="ChatGPT", keywords="GPT: LLM", keywords="LLMs", keywords="NLP", keywords="natural language processing", keywords="machine learning", keywords="artificial intelligence", keywords="language model", keywords="language models", keywords="communication", keywords="relationship", keywords="relationships", keywords="chatbot", keywords="chatbots", keywords="conversational agent", keywords="conversational agents", keywords="history", keywords="histories", keywords="simulated", keywords="student", keywords="students", keywords="interaction", keywords="interactions", abstract="Background: Although history taking is fundamental for diagnosing medical conditions, teaching and providing feedback on the skill can be challenging due to resource constraints. Virtual simulated patients and web-based chatbots have thus emerged as educational tools, with recent advancements in artificial intelligence (AI) such as large language models (LLMs) enhancing their realism and potential to provide feedback. Objective: In our study, we aimed to evaluate the effectiveness of a Generative Pretrained Transformer (GPT) 4 model to provide structured feedback on medical students' performance in history taking with a simulated patient. Methods: We conducted a prospective study involving medical students performing history taking with a GPT-powered chatbot. To that end, we designed a chatbot to simulate patients' responses and provide immediate feedback on the comprehensiveness of the students' history taking. Students' interactions with the chatbot were analyzed, and feedback from the chatbot was compared with feedback from a human rater. We measured interrater reliability and performed a descriptive analysis to assess the quality of feedback. Results: Most of the study's participants were in their third year of medical school. A total of 1894 question-answer pairs from 106 conversations were included in our analysis. GPT-4's role-play and responses were medically plausible in more than 99\% of cases. Interrater reliability between GPT-4 and the human rater showed ``almost perfect'' agreement (Cohen $\kappa$=0.832). Lower agreement ($\kappa$<0.6), detected for 8 of 45 feedback categories, highlighted topics for which the model's assessments were overly specific or diverged from human judgment. Conclusions: The GPT model was effective in providing structured feedback on history-taking dialogs provided by medical students. Although we identified some limitations regarding the specificity of feedback for certain feedback categories, the overall high agreement with human raters suggests that LLMs can be a valuable tool for medical education. 
Our findings, thus, advocate the careful integration of AI-driven feedback mechanisms in medical training and highlight important aspects when LLMs are used in that context. ", doi="10.2196/59213", url="https://mededu.jmir.org/2024/1/e59213" } @Article{info:doi/10.2196/52784, author="Ming, Shuai and Guo, Qingge and Cheng, Wenjun and Lei, Bo", title="Influence of Model Evolution and System Roles on ChatGPT's Performance in Chinese Medical Licensing Exams: Comparative Study", journal="JMIR Med Educ", year="2024", month="Aug", day="13", volume="10", pages="e52784", keywords="ChatGPT", keywords="Chinese National Medical Licensing Examination", keywords="large language models", keywords="medical education", keywords="system role", keywords="LLM", keywords="LLMs", keywords="language model", keywords="language models", keywords="artificial intelligence", keywords="chatbot", keywords="chatbots", keywords="conversational agent", keywords="conversational agents", keywords="exam", keywords="exams", keywords="examination", keywords="examinations", keywords="OpenAI", keywords="answer", keywords="answers", keywords="response", keywords="responses", keywords="accuracy", keywords="performance", keywords="China", keywords="Chinese", abstract="Background: With the increasing application of large language models like ChatGPT in various industries, their potential in the medical domain, especially in standardized examinations, has become a focal point of research. Objective: The aim of this study is to assess the clinical performance of ChatGPT, focusing on its accuracy and reliability in the Chinese National Medical Licensing Examination (CNMLE). Methods: The CNMLE 2022 question set, consisting of 500 single-answer multiple-choice questions, was reclassified into 15 medical subspecialties. Each question was tested 8 to 12 times in Chinese on the OpenAI platform from April 24 to May 15, 2023. Three key factors were considered: the version of GPT-3.5 and 4.0, the prompt's designation of system roles tailored to medical subspecialties, and repetition for coherence. A passing accuracy threshold was established as 60\%. The $\chi$2 tests and $\kappa$ values were employed to evaluate the model's accuracy and consistency. Results: GPT-4.0 achieved a passing accuracy of 72.7\%, which was significantly higher than that of GPT-3.5 (54\%; P<.001). The variability rate of repeated responses from GPT-4.0 was lower than that of GPT-3.5 (9\% vs 19.5\%; P<.001). However, both models showed relatively good response coherence, with $\kappa$ values of 0.778 and 0.610, respectively. System roles numerically increased accuracy for both GPT-4.0 (0.3\%-3.7\%) and GPT-3.5 (1.3\%-4.5\%), and reduced variability by 1.7\% and 1.8\%, respectively (P>.05). In subgroup analysis, ChatGPT achieved comparable accuracy among different question types (P>.05). GPT-4.0 surpassed the accuracy threshold in 14 of 15 subspecialties, while GPT-3.5 did so in 7 of 15 on the first response. Conclusions: GPT-4.0 passed the CNMLE and outperformed GPT-3.5 in key areas such as accuracy, consistency, and medical subspecialty expertise. Adding a system role enhanced the model's reliability and answer coherence, although not significantly. GPT-4.0 showed promising potential in medical education and clinical practice, meriting further study. ", doi="10.2196/52784", url="https://mededu.jmir.org/2024/1/e52784" } @Article{info:doi/10.2196/51757, author="Cherrez-Ojeda, Ivan and Gallardo-Bastidas, C. Juan and Robles-Velasco, Karla and Osorio, F. 
Mar{\'i}a and Velez Leon, Maria Eleonor and Leon Velastegui, Manuel and Pauletto, Patr{\'i}cia and Aguilar-D{\'i}az, C. F. and Squassi, Aldo and Gonz{\'a}lez Eras, Patricia Susana and Cordero Carrasco, Erita and Chavez Gonzalez, Leonor Karol and Calderon, C. Juan and Bousquet, Jean and Bedbrook, Anna and Faytong-Haro, Marco", title="Understanding Health Care Students' Perceptions, Beliefs, and Attitudes Toward AI-Powered Language Models: Cross-Sectional Study", journal="JMIR Med Educ", year="2024", month="Aug", day="13", volume="10", pages="e51757", keywords="artificial intelligence", keywords="ChatGPT", keywords="education", keywords="health care", keywords="students", abstract="Background: ChatGPT was not intended for use in health care, but it has potential benefits that depend on end-user understanding and acceptability, which is where health care students become crucial. There is still a limited amount of research in this area. Objective: The primary aim of our study was to assess the frequency of ChatGPT use, the perceived level of knowledge, the perceived risks associated with its use, and the ethical issues, as well as attitudes toward the use of ChatGPT in the context of education in the field of health. In addition, we aimed to examine whether there were differences across groups based on demographic variables. The second part of the study aimed to assess the association between the frequency of use, the level of perceived knowledge, the level of risk perception, and the level of perception of ethics as predictive factors for participants' attitudes toward the use of ChatGPT. Methods: A cross-sectional survey was conducted from May to June 2023 encompassing students of medicine, nursing, dentistry, nutrition, and laboratory science across the Americas. The study used descriptive analysis, chi-square tests, and ANOVA to assess statistical significance across different categories. The study used several ordinal logistic regression models to analyze the impact of predictive factors (frequency of use, perception of knowledge, perception of risk, and ethics perception scores) on attitude as the dependent variable. The models were adjusted for gender, institution type, major, and country. Stata was used to conduct all the analyses. Results: Of 2661 health care students, 42.99\% (n=1144) were unaware of ChatGPT. The median score of knowledge was ``minimal'' (median 2.00, IQR 1.00-3.00). Most respondents (median 2.61, IQR 2.11-3.11) regarded ChatGPT as neither ethical nor unethical. Most participants (median 3.89, IQR 3.44-4.34) ``somewhat agreed'' that ChatGPT (1) benefits health care settings, (2) provides trustworthy data, (3) is a helpful tool for clinical and educational medical information access, and (4) makes the work easier. In total, 70\% (7/10) of people used it for homework. As the perceived knowledge of ChatGPT increased, there was a stronger tendency with regard to having a favorable attitude toward ChatGPT. Higher ethical consideration perception ratings increased the likelihood of considering ChatGPT as a source of trustworthy health care information (odds ratio [OR] 1.620, 95\% CI 1.498-1.752), beneficial in medical issues (OR 1.495, 95\% CI 1.452-1.539), and useful for medical literature (OR 1.494, 95\% CI 1.426-1.564; P<.001 for all results). Conclusions: Over 40\% of American health care students (1144/2661, 42.99\%) were unaware of ChatGPT despite its extensive use in the health field. 
Our data revealed the positive attitudes toward ChatGPT and the desire to learn more about it. Medical educators must explore how chatbots may be included in undergraduate health care education programs. ", doi="10.2196/51757", url="https://mededu.jmir.org/2024/1/e51757", url="http://www.ncbi.nlm.nih.gov/pubmed/39137029" } @Article{info:doi/10.2196/59133, author="Takahashi, Hiromizu and Shikino, Kiyoshi and Kondo, Takeshi and Komori, Akira and Yamada, Yuji and Saita, Mizue and Naito, Toshio", title="Educational Utility of Clinical Vignettes Generated in Japanese by ChatGPT-4: Mixed Methods Study", journal="JMIR Med Educ", year="2024", month="Aug", day="13", volume="10", pages="e59133", keywords="generative AI", keywords="ChatGPT-4", keywords="medical case generation", keywords="medical education", keywords="clinical vignettes", keywords="AI", keywords="artificial intelligence", keywords="Japanese", keywords="Japan", abstract="Background: Evaluating the accuracy and educational utility of artificial intelligence--generated medical cases, especially those produced by large language models such as ChatGPT-4 (developed by OpenAI), is crucial yet underexplored. Objective: This study aimed to assess the educational utility of ChatGPT-4--generated clinical vignettes and their applicability in educational settings. Methods: Using a convergent mixed methods design, a web-based survey was conducted from January 8 to 28, 2024, to evaluate 18 medical cases generated by ChatGPT-4 in Japanese. In the survey, 6 main question items were used to evaluate the quality of the generated clinical vignettes and their educational utility, which are information quality, information accuracy, educational usefulness, clinical match, terminology accuracy (TA), and diagnosis difficulty. Feedback was solicited from physicians specializing in general internal medicine or general medicine and experienced in medical education. Chi-square and Mann-Whitney U tests were performed to identify differences among cases, and linear regression was used to examine trends associated with physicians' experience. Thematic analysis of qualitative feedback was performed to identify areas for improvement and confirm the educational utility of the cases. Results: Of the 73 invited participants, 71 (97\%) responded. The respondents, primarily male (64/71, 90\%), spanned a broad range of practice years (from 1976 to 2017) and represented diverse hospital sizes throughout Japan. The majority deemed the information quality (mean 0.77, 95\% CI 0.75-0.79) and information accuracy (mean 0.68, 95\% CI 0.65-0.71) to be satisfactory, with these responses being based on binary data. The average scores assigned were 3.55 (95\% CI 3.49-3.60) for educational usefulness, 3.70 (95\% CI 3.65-3.75) for clinical match, 3.49 (95\% CI 3.44-3.55) for TA, and 2.34 (95\% CI 2.28-2.40) for diagnosis difficulty, based on a 5-point Likert scale. Statistical analysis showed significant variability in content quality and relevance across the cases (P<.001 after Bonferroni correction). Participants suggested improvements in generating physical findings, using natural language, and enhancing medical TA. The thematic analysis highlighted the need for clearer documentation, clinical information consistency, content relevance, and patient-centered case presentations. Conclusions: ChatGPT-4--generated medical cases written in Japanese possess considerable potential as resources in medical education, with recognized adequacy in quality and accuracy. 
Nevertheless, there is a notable need for enhancements in the precision and realism of case details. This study emphasizes ChatGPT-4's value as an adjunctive educational tool in the medical field, requiring expert oversight for optimal application. ", doi="10.2196/59133", url="https://mededu.jmir.org/2024/1/e59133", url="http://www.ncbi.nlm.nih.gov/pubmed/39137031" } @Article{info:doi/10.2196/51157, author="McBee, C. Joseph and Han, Y. Daniel and Liu, Li and Ma, Leah and Adjeroh, A. Donald and Xu, Dong and Hu, Gangqing", title="Assessing ChatGPT's Competency in Addressing Interdisciplinary Inquiries on Chatbot Uses in Sports Rehabilitation: Simulation Study", journal="JMIR Med Educ", year="2024", month="Aug", day="7", volume="10", pages="e51157", keywords="ChatGPT", keywords="chatbots", keywords="multirole-playing", keywords="interdisciplinary inquiry", keywords="medical education", keywords="sports medicine", abstract="Background: ChatGPT showcases exceptional conversational capabilities and extensive cross-disciplinary knowledge. In addition, it can perform multiple roles in a single chat session. This unique multirole-playing feature positions ChatGPT as a promising tool for exploring interdisciplinary subjects. Objective: The aim of this study was to evaluate ChatGPT's competency in addressing interdisciplinary inquiries based on a case study exploring the opportunities and challenges of chatbot uses in sports rehabilitation. Methods: We developed a model termed PanelGPT to assess ChatGPT's competency in addressing interdisciplinary topics through simulated panel discussions. Taking chatbot uses in sports rehabilitation as an example of an interdisciplinary topic, we prompted ChatGPT through PanelGPT to role-play a physiotherapist, psychologist, nutritionist, artificial intelligence expert, and athlete in a simulated panel discussion. During the simulation, we posed questions to the panel while ChatGPT acted as both the panelists for responses and the moderator for steering the discussion. We performed the simulation using ChatGPT-4 and evaluated the responses by referring to the literature and our human expertise. Results: By tackling questions related to chatbot uses in sports rehabilitation with respect to patient education, physiotherapy, physiology, nutrition, and ethical considerations, responses from the ChatGPT-simulated panel discussion reasonably pointed to various benefits such as 24/7 support, personalized advice, automated tracking, and reminders. ChatGPT also correctly emphasized the importance of patient education, and identified challenges such as limited interaction modes, inaccuracies in emotion-related advice, assurance of data privacy and security, transparency in data handling, and fairness in model training. It also stressed that chatbots are to assist as a copilot, not to replace human health care professionals in the rehabilitation process. Conclusions: ChatGPT exhibits strong competency in addressing interdisciplinary inquiry by simulating multiple experts from complementary backgrounds, with significant implications in assisting medical education. ", doi="10.2196/51157", url="https://mededu.jmir.org/2024/1/e51157", url="http://www.ncbi.nlm.nih.gov/pubmed/39042885" } @Article{info:doi/10.2196/54345, author="Aljamaan, Fadi and Temsah, Mohamad-Hani and Altamimi, Ibraheem and Al-Eyadhy, Ayman and Jamal, Amr and Alhasan, Khalid and Mesallam, A. Tamer and Farahat, Mohamed and Malki, H. 
Khalid", title="Reference Hallucination Score for Medical Artificial Intelligence Chatbots: Development and Usability Study", journal="JMIR Med Inform", year="2024", month="Jul", day="31", volume="12", pages="e54345", keywords="artificial intelligence (AI) chatbots", keywords="reference hallucination", keywords="bibliographic verification", keywords="ChatGPT", keywords="Perplexity", keywords="SciSpace", keywords="Elicit", keywords="Bing", abstract="Background: Artificial intelligence (AI) chatbots have recently gained use in medical practice by health care practitioners. Interestingly, the output of these AI chatbots was found to have varying degrees of hallucination in content and references. Such hallucinations generate doubts about their output and their implementation. Objective: The aim of our study was to propose a reference hallucination score (RHS) to evaluate the authenticity of AI chatbots' citations. Methods: Six AI chatbots were challenged with the same 10 medical prompts, requesting 10 references per prompt. The RHS is composed of 6 bibliographic items and the reference's relevance to prompts' keywords. RHS was calculated for each reference, prompt, and type of prompt (basic vs complex). The average RHS was calculated for each AI chatbot and compared across the different types of prompts and AI chatbots. Results: Bard failed to generate any references. ChatGPT 3.5 and Bing generated the highest RHS (score=11), while Elicit and SciSpace generated the lowest RHS (score=1), and Perplexity generated a middle RHS (score=7). The highest degree of hallucination was observed for reference relevancy to the prompt keywords (308/500, 61.6\%), while the lowest was for reference titles (169/500, 33.8\%). ChatGPT and Bing had comparable RHS ($\beta$ coefficient=--0.069; P=.32), while Perplexity had significantly lower RHS than ChatGPT ($\beta$ coefficient=--0.345; P<.001). AI chatbots generally had significantly higher RHS when prompted with scenarios or complex format prompts ($\beta$ coefficient=0.486; P<.001). Conclusions: The variation in RHS underscores the necessity for a robust reference evaluation tool to improve the authenticity of AI chatbots. Further, the variations highlight the importance of verifying their output and citations. Elicit and SciSpace had negligible hallucination, while ChatGPT and Bing had critical hallucination levels. The proposed AI chatbots' RHS could contribute to ongoing efforts to enhance AI's general reliability in medical research. ", doi="10.2196/54345", url="https://medinform.jmir.org/2024/1/e54345" } @Article{info:doi/10.2196/55933, author="Zhui, Li and Yhap, Nina and Liping, Liu and Zhengjie, Wang and Zhonghao, Xiong and Xiaoshu, Yuan and Hong, Cui and Xuexiu, Liu and Wei, Ren", title="Impact of Large Language Models on Medical Education and Teaching Adaptations", journal="JMIR Med Inform", year="2024", month="Jul", day="25", volume="12", pages="e55933", keywords="large language models", keywords="medical education", keywords="opportunities", keywords="challenges", keywords="critical thinking", keywords="educator", doi="10.2196/55933", url="https://medinform.jmir.org/2024/1/e55933" } @Article{info:doi/10.2196/56342, author="Burke, B. Harry and Hoang, Albert and Lopreiato, O. 
Joseph and King, Heidi and Hemmer, Paul and Montgomery, Michael and Gagarin, Viktoria", title="Assessing the Ability of a Large Language Model to Score Free-Text Medical Student Clinical Notes: Quantitative Study", journal="JMIR Med Educ", year="2024", month="Jul", day="25", volume="10", pages="e56342", keywords="medical education", keywords="generative artificial intelligence", keywords="natural language processing", keywords="ChatGPT", keywords="generative pretrained transformer", keywords="standardized patients", keywords="clinical notes", keywords="free-text notes", keywords="history and physical examination", keywords="large language model", keywords="LLM", keywords="medical student", keywords="medical students", keywords="clinical information", keywords="artificial intelligence", keywords="AI", keywords="patients", keywords="patient", keywords="medicine", abstract="Background: Teaching medical students the skills required to acquire, interpret, apply, and communicate clinical information is an integral part of medical education. A crucial aspect of this process involves providing students with feedback regarding the quality of their free-text clinical notes. Objective: The goal of this study was to assess the ability of ChatGPT 3.5, a large language model, to score medical students' free-text history and physical notes. Methods: This is a single-institution, retrospective study. Standardized patients learned a prespecified clinical case and, acting as the patient, interacted with medical students. Each student wrote a free-text history and physical note of their interaction. The students' notes were scored independently by the standardized patients and ChatGPT using a prespecified scoring rubric that consisted of 85 case elements. The measure of accuracy was percent correct. Results: The study population consisted of 168 first-year medical students. There was a total of 14,280 scores. The ChatGPT incorrect scoring rate was 1.0\%, and the standardized patient incorrect scoring rate was 7.2\%. The ChatGPT error rate was 86\% lower than the standardized patient error rate. The ChatGPT mean incorrect scoring rate of 12 (SD 11) was significantly lower than the standardized patient mean incorrect scoring rate of 85 (SD 74; P=.002). Conclusions: ChatGPT demonstrated a significantly lower error rate compared to standardized patients. This is the first study to assess the ability of a generative pretrained transformer (GPT) program to score medical students' standardized patient-based free-text clinical notes. It is expected that, in the near future, large language models will provide real-time feedback to practicing physicians regarding their free-text notes. GPT artificial intelligence programs represent an important advance in medical education and medical practice. 
", doi="10.2196/56342", url="https://mededu.jmir.org/2024/1/e56342" } @Article{info:doi/10.2196/52878, author="Noroozi, Mohammad and St John, Ace and Masino, Caterina and Laplante, Simon and Hunter, Jaryd and Brudno, Michael and Madani, Amin and Kersten-Oertel, Marta", title="Education in Laparoscopic Cholecystectomy: Design and Feasibility Study of the LapBot Safe Chole Mobile Game", journal="JMIR Form Res", year="2024", month="Jul", day="25", volume="8", pages="e52878", keywords="gamification", keywords="serious games", keywords="surgery", keywords="education", keywords="laparoscopic cholecystectomy", keywords="artificial intelligence", keywords="AI", keywords="laparoscope", keywords="gallbladder", keywords="cholecystectomy", keywords="mobile game", keywords="gamify", keywords="educational game", keywords="interactive", keywords="decision-making", keywords="mobile phone", abstract="Background: Major bile duct injuries during laparoscopic cholecystectomy (LC), often stemming from errors in surgical judgment and visual misperception of critical anatomy, significantly impact morbidity, mortality, disability, and health care costs. Objective: To enhance safe LC learning, we developed an educational mobile game, LapBot Safe Chole, which uses an artificial intelligence (AI) model to provide real-time coaching and feedback, improving intraoperative decision-making. Methods: LapBot Safe Chole offers a free, accessible simulated learning experience with real-time AI feedback. Players engage with intraoperative LC scenarios (short video clips) and identify ideal dissection zones. After the response, users receive an accuracy score from a validated AI algorithm. The game consists of 5 levels of increasing difficulty based on the Parkland grading scale for cholecystitis. Results: Beta testing (n=29) showed score improvements with each round, with attendings and senior trainees achieving top scores faster than junior residents. Learning curves and progression distinguished candidates, with a significant association between user level and scores (P=.003). Players found LapBot enjoyable and educational. Conclusions: LapBot Safe Chole effectively integrates safe LC principles into a fun, accessible, and educational game using AI-generated feedback. Initial beta testing supports the validity of the assessment scores and suggests high adoption and engagement potential among surgical trainees. 
", doi="10.2196/52878", url="https://formative.jmir.org/2024/1/e52878" } @Article{info:doi/10.2196/52818, author="Cherif, Hela and Moussa, Chirine and Missaoui, Mouhaymen Abdel and Salouage, Issam and Mokaddem, Salma and Dhahri, Besma", title="Appraisal of ChatGPT's Aptitude for Medical Education: Comparative Analysis With Third-Year Medical Students in a Pulmonology Examination", journal="JMIR Med Educ", year="2024", month="Jul", day="23", volume="10", pages="e52818", keywords="medical education", keywords="ChatGPT", keywords="GPT", keywords="artificial intelligence", keywords="natural language processing", keywords="NLP", keywords="pulmonary medicine", keywords="pulmonary", keywords="lung", keywords="lungs", keywords="respiratory", keywords="respiration", keywords="pneumology", keywords="comparative analysis", keywords="large language models", keywords="LLMs", keywords="LLM", keywords="language model", keywords="generative AI", keywords="generative artificial intelligence", keywords="generative", keywords="exams", keywords="exam", keywords="examinations", keywords="examination", abstract="Background: The rapid evolution of ChatGPT has generated substantial interest and led to extensive discussions in both public and academic domains, particularly in the context of medical education. Objective: This study aimed to evaluate ChatGPT's performance in a pulmonology examination through a comparative analysis with that of third-year medical students. Methods: In this cross-sectional study, we conducted a comparative analysis with 2 distinct groups. The first group comprised 244 third-year medical students who had previously taken our institution's 2020 pulmonology examination, which was conducted in French. The second group involved ChatGPT-3.5 in 2 separate sets of conversations: without contextualization (V1) and with contextualization (V2). In both V1 and V2, ChatGPT received the same set of questions administered to the students. Results: V1 demonstrated exceptional proficiency in radiology, microbiology, and thoracic surgery, surpassing the majority of medical students in these domains. However, it faced challenges in pathology, pharmacology, and clinical pneumology. In contrast, V2 consistently delivered more accurate responses across various question categories, regardless of the specialization. ChatGPT exhibited suboptimal performance in multiple-choice questions compared to medical students. V2 excelled in responding to structured open-ended questions. Both ChatGPT conversations, particularly V2, outperformed students in addressing questions of low and intermediate difficulty. Interestingly, students showcased enhanced proficiency when confronted with highly challenging questions. V1 fell short of passing the examination. Conversely, V2 passed the examination, outperforming 139 (62.1\%) of the medical students. Conclusions: While ChatGPT has access to a comprehensive web-based data set, its performance closely mirrors that of an average medical student. Outcomes are influenced by question format, item complexity, and contextual nuances. The model faces challenges in medical contexts requiring information synthesis, advanced analytical aptitude, and clinical judgment, as well as in non-English language assessments and when confronted with data outside mainstream internet sources. 
", doi="10.2196/52818", url="https://mededu.jmir.org/2024/1/e52818" } @Article{info:doi/10.2196/56930, author="Laymouna, Moustafa and Ma, Yuanchao and Lessard, David and Schuster, Tibor and Engler, Kim and Lebouch{\'e}, Bertrand", title="Roles, Users, Benefits, and Limitations of Chatbots in Health Care: Rapid Review", journal="J Med Internet Res", year="2024", month="Jul", day="23", volume="26", pages="e56930", keywords="chatbot", keywords="conversational agent", keywords="conversational assistant", keywords="user-computer interface", keywords="digital health", keywords="mobile health", keywords="electronic health", keywords="telehealth", keywords="artificial intelligence", keywords="AI", keywords="health information technology", abstract="Background: Chatbots, or conversational agents, have emerged as significant tools in health care, driven by advancements in artificial intelligence and digital technology. These programs are designed to simulate human conversations, addressing various health care needs. However, no comprehensive synthesis of health care chatbots' roles, users, benefits, and limitations is available to inform future research and application in the field. Objective: This review aims to describe health care chatbots' characteristics, focusing on their diverse roles in the health care pathway, user groups, benefits, and limitations. Methods: A rapid review of published literature from 2017 to 2023 was performed with a search strategy developed in collaboration with a health sciences librarian and implemented in the MEDLINE and Embase databases. Primary research studies reporting on chatbot roles or benefits in health care were included. Two reviewers dual-screened the search results. Extracted data on chatbot roles, users, benefits, and limitations were subjected to content analysis. Results: The review categorized chatbot roles into 2 themes: delivery of remote health services, including patient support, care management, education, skills building, and health behavior promotion, and provision of administrative assistance to health care providers. User groups spanned across patients with chronic conditions as well as patients with cancer; individuals focused on lifestyle improvements; and various demographic groups such as women, families, and older adults. Professionals and students in health care also emerged as significant users, alongside groups seeking mental health support, behavioral change, and educational enhancement. The benefits of health care chatbots were also classified into 2 themes: improvement of health care quality and efficiency and cost-effectiveness in health care delivery. The identified limitations encompassed ethical challenges, medicolegal and safety concerns, technical difficulties, user experience issues, and societal and economic impacts. Conclusions: Health care chatbots offer a wide spectrum of applications, potentially impacting various aspects of health care. While they are promising tools for improving health care efficiency and quality, their integration into the health care system must be approached with consideration of their limitations to ensure optimal, safe, and equitable use. 
", doi="10.2196/56930", url="https://www.jmir.org/2024/1/e56930" } @Article{info:doi/10.2196/54793, author="Tolentino, Raymond and Baradaran, Ashkan and Gore, Genevieve and Pluye, Pierre and Abbasgholizadeh-Rahimi, Samira", title="Curriculum Frameworks and Educational Programs in AI for Medical Students, Residents, and Practicing Physicians: Scoping Review", journal="JMIR Med Educ", year="2024", month="Jul", day="18", volume="10", pages="e54793", keywords="artificial intelligence", keywords="machine learning", keywords="curriculum", keywords="framework", keywords="medical education", keywords="review", abstract="Background: The successful integration of artificial intelligence (AI) into clinical practice is contingent upon physicians' comprehension of AI principles and its applications. Therefore, it is essential for medical education curricula to incorporate AI topics and concepts, providing future physicians with the foundational knowledge and skills needed. However, there is a knowledge gap in the current understanding and availability of structured AI curriculum frameworks tailored for medical education, which serve as vital guides for instructing and facilitating the learning process. Objective: The overall aim of this study is to synthesize knowledge from the literature on curriculum frameworks and current educational programs that focus on the teaching and learning of AI for medical students, residents, and practicing physicians. Methods: We followed a validated framework and the Joanna Briggs Institute methodological guidance for scoping reviews. An information specialist performed a comprehensive search from 2000 to May 2023 in the following bibliographic databases: MEDLINE (Ovid), Embase (Ovid), CENTRAL (Cochrane Library), CINAHL (EBSCOhost), and Scopus as well as the gray literature. Papers were limited to English and French languages. This review included papers that describe curriculum frameworks for teaching and learning AI in medicine, irrespective of country. All types of papers and study designs were included, except conference abstracts and protocols. Two reviewers independently screened the titles and abstracts, read the full texts, and extracted data using a validated data extraction form. Disagreements were resolved by consensus, and if this was not possible, the opinion of a third reviewer was sought. We adhered to the PRISMA-ScR (Preferred Reporting Items for Systematic Reviews and Meta-Analyses extension for Scoping Reviews) checklist for reporting the results. Results: Of the 5104 papers screened, 21 papers relevant to our eligibility criteria were identified. In total, 90\% (19/21) of the papers altogether described 30 current or previously offered educational programs, and 10\% (2/21) of the papers described elements of a curriculum framework. One framework describes a general approach to integrating AI curricula throughout the medical learning continuum and another describes a core curriculum for AI in ophthalmology. No papers described a theory, pedagogy, or framework that guided the educational programs. Conclusions: This review synthesizes recent advancements in AI curriculum frameworks and educational programs within the domain of medical education. To build on this foundation, future researchers are encouraged to engage in a multidisciplinary approach to curriculum redesign. 
In addition, it is encouraged to initiate dialogues on the integration of AI into medical curriculum planning and to investigate the development, deployment, and appraisal of these innovative educational programs. International Registered Report Identifier (IRRID): RR2-10.11124/JBIES-22-00374 ", doi="10.2196/54793", url="https://mededu.jmir.org/2024/1/e54793", url="http://www.ncbi.nlm.nih.gov/pubmed/39023999" } @Article{info:doi/10.2196/51282, author="Jo, Eunbeen and Song, Sanghoun and Kim, Jong-Ho and Lim, Subin and Kim, Hyeon Ju and Cha, Jung-Joon and Kim, Young-Min and Joo, Joon Hyung", title="Assessing GPT-4's Performance in Delivering Medical Advice: Comparative Analysis With Human Experts", journal="JMIR Med Educ", year="2024", month="Jul", day="8", volume="10", pages="e51282", keywords="GPT-4", keywords="medical advice", keywords="ChatGPT", keywords="cardiology", keywords="cardiologist", keywords="heart", keywords="advice", keywords="recommendation", keywords="recommendations", keywords="linguistic", keywords="linguistics", keywords="artificial intelligence", keywords="NLP", keywords="natural language processing", keywords="chatbot", keywords="chatbots", keywords="conversational agent", keywords="conversational agents", keywords="response", keywords="responses", abstract="Background: Accurate medical advice is paramount in ensuring optimal patient care, and misinformation can lead to misguided decisions with potentially detrimental health outcomes. The emergence of large language models (LLMs) such as OpenAI's GPT-4 has spurred interest in their potential health care applications, particularly in automated medical consultation. Yet, rigorous investigations comparing their performance to human experts remain sparse. Objective: This study aims to compare the medical accuracy of GPT-4 with human experts in providing medical advice using real-world user-generated queries, with a specific focus on cardiology. It also sought to analyze the performance of GPT-4 and human experts in specific question categories, including drug or medication information and preliminary diagnoses. Methods: We collected 251 pairs of cardiology-specific questions from general users and answers from human experts via an internet portal. GPT-4 was tasked with generating responses to the same questions. Three independent cardiologists (SL, JHK, and JJC) evaluated the answers provided by both human experts and GPT-4. Using a computer interface, each evaluator compared the pairs and determined which answer was superior, and they quantitatively measured the clarity and complexity of the questions as well as the accuracy and appropriateness of the responses, applying a 3-tiered grading scale (low, medium, and high). Furthermore, a linguistic analysis was conducted to compare the length and vocabulary diversity of the responses using word count and type-token ratio. Results: GPT-4 and human experts displayed comparable efficacy in medical accuracy (``GPT-4 is better'' at 132/251, 52.6\% vs ``Human expert is better'' at 119/251, 47.4\%). In accuracy level categorization, humans had more high-accuracy responses than GPT-4 (50/237, 21.1\% vs 30/238, 12.6\%) but also a greater proportion of low-accuracy responses (11/237, 4.6\% vs 1/238, 0.4\%; P=.001). 
GPT-4 responses were generally longer and used a less diverse vocabulary than those of human experts, potentially enhancing their comprehensibility for general users (sentence count: mean 10.9, SD 4.2 vs mean 5.9, SD 3.7; P<.001; type-token ratio: mean 0.69, SD 0.07 vs mean 0.79, SD 0.09; P<.001). Nevertheless, human experts outperformed GPT-4 in specific question categories, notably those related to drug or medication information and preliminary diagnoses. These findings highlight the limitations of GPT-4 in providing advice based on clinical experience. Conclusions: GPT-4 has shown promising potential in automated medical consultation, with comparable medical accuracy to human experts. However, challenges remain particularly in the realm of nuanced clinical judgment. Future improvements in LLMs may require the integration of specific clinical reasoning pathways and regulatory oversight for safe use. Further research is needed to understand the full potential of LLMs across various medical specialties and conditions. ", doi="10.2196/51282", url="https://mededu.jmir.org/2024/1/e51282" } @Article{info:doi/10.2196/53308, author="Hassanipour, Soheil and Nayak, Sandeep and Bozorgi, Ali and Keivanlou, Mohammad-Hossein and Dave, Tirth and Alotaibi, Abdulhadi and Joukar, Farahnaz and Mellatdoust, Parinaz and Bakhshi, Arash and Kuriyakose, Dona and Polisetty, D. Lakshmi and Chimpiri, Mallika and Amini-Salehi, Ehsan", title="The Ability of ChatGPT in Paraphrasing Texts and Reducing Plagiarism: A Descriptive Analysis", journal="JMIR Med Educ", year="2024", month="Jul", day="8", volume="10", pages="e53308", keywords="ChatGPT", keywords="paraphrasing", keywords="text generation", keywords="prompts", keywords="academic journals", keywords="plagiarize", keywords="plagiarism", keywords="paraphrase", keywords="wording", keywords="LLM", keywords="LLMs", keywords="language model", keywords="language models", keywords="prompt", keywords="generative", keywords="artificial intelligence", keywords="NLP", keywords="natural language processing", keywords="rephrase", keywords="plagiarizing", keywords="honesty", keywords="integrity", keywords="texts", keywords="text", keywords="textual", keywords="generation", keywords="large language model", keywords="large language models", abstract="Background: The introduction of ChatGPT by OpenAI has garnered significant attention. Among its capabilities, paraphrasing stands out. Objective: This study aims to investigate the satisfactory levels of plagiarism in the paraphrased text produced by this chatbot. Methods: Three texts of varying lengths were presented to ChatGPT. ChatGPT was then instructed to paraphrase the provided texts using five different prompts. In the subsequent stage of the study, the texts were divided into separate paragraphs, and ChatGPT was requested to paraphrase each paragraph individually. Lastly, in the third stage, ChatGPT was asked to paraphrase the texts it had previously generated. Results: The average plagiarism rate in the texts generated by ChatGPT was 45\% (SD 10\%). ChatGPT exhibited a substantial reduction in plagiarism for the provided texts (mean difference --0.51, 95\% CI --0.54 to --0.48; P<.001). Furthermore, when comparing the second attempt with the initial attempt, a significant decrease in the plagiarism rate was observed (mean difference --0.06, 95\% CI --0.08 to --0.03; P<.001). 
The number of paragraphs in the texts demonstrated a noteworthy association with the percentage of plagiarism, with texts consisting of a single paragraph exhibiting the lowest plagiarism rate (P<.001). Conclusion: Although ChatGPT demonstrates a notable reduction of plagiarism within texts, the existing levels of plagiarism remain relatively high. This underscores a crucial caution for researchers when incorporating this chatbot into their work. ", doi="10.2196/53308", url="https://mededu.jmir.org/2024/1/e53308" } @Article{info:doi/10.2196/58758, author="Shikino, Kiyoshi and Shimizu, Taro and Otsuka, Yuki and Tago, Masaki and Takahashi, Hiromizu and Watari, Takashi and Sasaki, Yosuke and Iizuka, Gemmei and Tamura, Hiroki and Nakashima, Koichi and Kunitomo, Kotaro and Suzuki, Morika and Aoyama, Sayaka and Kosaka, Shintaro and Kawahigashi, Teiko and Matsumoto, Tomohiro and Orihara, Fumina and Morikawa, Toru and Nishizawa, Toshinori and Hoshina, Yoji and Yamamoto, Yu and Matsuo, Yuichiro and Unoki, Yuto and Kimura, Hirofumi and Tokushima, Midori and Watanuki, Satoshi and Saito, Takuma and Otsuka, Fumio and Tokuda, Yasuharu", title="Evaluation of ChatGPT-Generated Differential Diagnosis for Common Diseases With Atypical Presentation: Descriptive Research", journal="JMIR Med Educ", year="2024", month="Jun", day="21", volume="10", pages="e58758", keywords="atypical presentation", keywords="ChatGPT", keywords="common disease", keywords="diagnostic accuracy", keywords="diagnosis", keywords="patient safety", abstract="Background: The persistence of diagnostic errors, despite advances in medical knowledge and diagnostics, highlights the importance of understanding atypical disease presentations and their contribution to mortality and morbidity. Artificial intelligence (AI), particularly generative pre-trained transformers like GPT-4, holds promise for improving diagnostic accuracy, but requires further exploration in handling atypical presentations. Objective: This study aimed to assess the diagnostic accuracy of ChatGPT in generating differential diagnoses for atypical presentations of common diseases, with a focus on the model's reliance on patient history during the diagnostic process. Methods: We used 25 clinical vignettes from the Journal of Generalist Medicine characterizing atypical manifestations of common diseases. Two general medicine physicians categorized the cases based on atypicality. ChatGPT was then used to generate differential diagnoses based on the clinical information provided. The concordance between AI-generated and final diagnoses was measured, with a focus on the top-ranked disease (top 1) and the top 5 differential diagnoses (top 5). Results: ChatGPT's diagnostic accuracy decreased with an increase in atypical presentation. For category 1 (C1) cases, the concordance rates were 17\% (n=1) for the top 1 and 67\% (n=4) for the top 5. Categories 3 (C3) and 4 (C4) showed a 0\% concordance for top 1 and markedly lower rates for the top 5, indicating difficulties in handling highly atypical cases. The $\chi^2$ test revealed no significant difference in the top 1 differential diagnosis accuracy between less atypical (C1+C2) and more atypical (C3+C4) groups ($\chi^2_1$=2.07; n=25; P=.13). However, a significant difference was found in the top 5 analyses, with less atypical cases showing higher accuracy ($\chi^2_1$=4.01; n=25; P=.048). 
Conclusions: ChatGPT-4 demonstrates potential as an auxiliary tool for diagnosing typical and mildly atypical presentations of common diseases. However, its performance declines with greater atypicality. The study findings underscore the need for AI systems to encompass a broader range of linguistic capabilities, cultural understanding, and diverse clinical scenarios to improve diagnostic utility in real-world settings. ", doi="10.2196/58758", url="https://mededu.jmir.org/2024/1/e58758" } @Article{info:doi/10.2196/54987, author="Zhang, Fang and Liu, Xiaoliu and Wu, Wenyan and Zhu, Shiben", title="Evolution of Chatbots in Nursing Education: Narrative Review", journal="JMIR Med Educ", year="2024", month="Jun", day="13", volume="10", pages="e54987", keywords="nursing education", keywords="chatbots", keywords="artificial intelligence", keywords="narrative review", keywords="ChatGPT", abstract="Background: The integration of chatbots in nursing education is a rapidly evolving area with potential transformative impacts. This narrative review aims to synthesize and analyze the existing literature on chatbots in nursing education. Objective: This study aims to comprehensively examine the temporal trends, international distribution, study designs, and implications of chatbots in nursing education. Methods: A comprehensive search was conducted across 3 databases (PubMed, Web of Science, and Embase) following the PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) flow diagram. Results: A total of 40 articles met the eligibility criteria, with a notable increase of publications in 2023 (n=28, 70\%). Temporal analysis revealed a notable surge in publications from 2021 to 2023, emphasizing the growing scholarly interest. Geographically, Taiwan province made substantial contributions (n=8, 20\%), followed by the United States (n=6, 15\%) and South Korea (n=4, 10\%). Study designs varied, with reviews (n=8, 20\%) and editorials (n=7, 18\%) being predominant, showcasing the richness of research in this domain. Conclusions: Integrating chatbots into nursing education presents a promising yet relatively unexplored avenue. This review highlights the urgent need for original research, emphasizing the importance of ethical considerations. 
", doi="10.2196/54987", url="https://mededu.jmir.org/2024/1/e54987" } @Article{info:doi/10.2196/52105, author="Srinivasan, Muthuvenkatachalam and Venugopal, Ambili and Venkatesan, Latha and Kumar, Rajesh", title="Navigating the Pedagogical Landscape: Exploring the Implications of AI and Chatbots in Nursing Education", journal="JMIR Nursing", year="2024", month="Jun", day="13", volume="7", pages="e52105", keywords="AI", keywords="artificial intelligence", keywords="ChatGPT", keywords="chatbots", keywords="nursing education", keywords="education", keywords="chatbot", keywords="nursing", keywords="ethical", keywords="ethics", keywords="ethical consideration", keywords="accessible", keywords="learning", keywords="efficiency", keywords="student", keywords="student engagement", keywords="student learning", doi="10.2196/52105", url="https://nursing.jmir.org/2024/1/e52105", url="http://www.ncbi.nlm.nih.gov/pubmed/38870516" } @Article{info:doi/10.2196/58355, author="Moldt, Julia-Astrid and Festl-Wietek, Teresa and Fuhl, Wolfgang and Zabel, Susanne and Claassen, Manfred and Wagner, Samuel and Nieselt, Kay and Herrmann-Werner, Anne", title="Assessing AI Awareness and Identifying Essential Competencies: Insights From Key Stakeholders in Integrating AI Into Medical Education", journal="JMIR Med Educ", year="2024", month="Jun", day="12", volume="10", pages="e58355", keywords="AI in medicine", keywords="artificial intelligence", keywords="medical education", keywords="medical students", keywords="qualitative approach", keywords="qualitative analysis", keywords="needs assessment", abstract="Background: The increasing importance of artificial intelligence (AI) in health care has generated a growing need for health care professionals to possess a comprehensive understanding of AI technologies, requiring an adaptation in medical education. Objective: This paper explores stakeholder perceptions and expectations regarding AI in medicine and examines their potential impact on the medical curriculum. This study project aims to assess the AI experiences and awareness of different stakeholders and identify essential AI-related topics in medical education to define necessary competencies for students. Methods: The empirical data were collected as part of the T{\"u}KITZMed project between August 2022 and March 2023, using a semistructured qualitative interview. These interviews were administered to a diverse group of stakeholders to explore their experiences and perspectives of AI in medicine. A qualitative content analysis of the collected data was conducted using MAXQDA software. Results: Semistructured interviews were conducted with 38 participants (6 lecturers, 9 clinicians, 10 students, 6 AI experts, and 7 institutional stakeholders). The qualitative content analysis revealed 6 primary categories with a total of 24 subcategories to answer the research questions. The evaluation of the stakeholders' statements revealed several commonalities and differences regarding their understanding of AI. Crucial identified AI themes based on the main categories were as follows: possible curriculum contents, skills, and competencies; programming skills; curriculum scope; and curriculum structure. Conclusions: The analysis emphasizes integrating AI into medical curricula to ensure students' proficiency in clinical applications. Standardized AI comprehension is crucial for defining and teaching relevant content. 
Considering diverse perspectives in implementation is essential to comprehensively define AI in the medical context, addressing gaps and facilitating effective solutions for future AI use in medical studies. The results provide insights into potential curriculum content and structure, including aspects of AI in medicine. ", doi="10.2196/58355", url="https://mededu.jmir.org/2024/1/e58355" } @Article{info:doi/10.2196/54507, author="Arango-Ibanez, Pablo Juan and Posso-Nu{\~n}ez, Alejandro Jose and D{\'i}az-Sol{\'o}rzano, Pablo Juan and Cruz-Su{\'a}rez, Gustavo", title="Evidence-Based Learning Strategies in Medicine Using AI", journal="JMIR Med Educ", year="2024", month="May", day="24", volume="10", pages="e54507", keywords="artificial intelligence", keywords="large language models", keywords="ChatGPT", keywords="active recall", keywords="memory cues", keywords="LLMs", keywords="evidence-based", keywords="learning strategy", keywords="medicine", keywords="AI", keywords="medical education", keywords="knowledge", keywords="relevance", doi="10.2196/54507", url="https://mededu.jmir.org/2024/1/e54507" } @Article{info:doi/10.2196/54283, author="Takagi, Soshi and Koda, Masahide and Watari, Takashi", title="The Performance of ChatGPT-4V in Interpreting Images and Tables in the Japanese Medical Licensing Exam", journal="JMIR Med Educ", year="2024", month="May", day="23", volume="10", pages="e54283", keywords="ChatGPT", keywords="medical licensing examination", keywords="generative artificial intelligence", keywords="medical education", keywords="large language model", keywords="images", keywords="tables", keywords="artificial intelligence", keywords="AI", keywords="Japanese", keywords="reliability", keywords="medical application", keywords="medical applications", keywords="diagnostic", keywords="diagnostics", keywords="online data", keywords="web-based data", doi="10.2196/54283", url="https://mededu.jmir.org/2024/1/e54283" } @Article{info:doi/10.2196/55595, author="Wang, Shangqiguo and Mo, Changgeng and Chen, Yuan and Dai, Xiaolu and Wang, Huiyi and Shen, Xiaoli", title="Exploring the Performance of ChatGPT-4 in the Taiwan Audiologist Qualification Examination: Preliminary Observational Study Highlighting the Potential of AI Chatbots in Hearing Care", journal="JMIR Med Educ", year="2024", month="Apr", day="26", volume="10", pages="e55595", keywords="ChatGPT", keywords="medical education", keywords="artificial intelligence", keywords="AI", keywords="audiology", keywords="hearing care", keywords="natural language processing", keywords="large language model", keywords="Taiwan", keywords="hearing", keywords="hearing specialist", keywords="audiologist", keywords="examination", keywords="information accuracy", keywords="educational technology", keywords="healthcare services", keywords="chatbot", keywords="health care services", abstract="Background: Artificial intelligence (AI) chatbots, such as ChatGPT-4, have shown immense potential for application across various aspects of medicine, including medical education, clinical practice, and research. Objective: This study aimed to evaluate the performance of ChatGPT-4 in the 2023 Taiwan Audiologist Qualification Examination, thereby preliminarily exploring the potential utility of AI chatbots in the fields of audiology and hearing care services. Methods: ChatGPT-4 was tasked to provide answers and reasoning for the 2023 Taiwan Audiologist Qualification Examination. 
The examination encompassed six subjects: (1) basic auditory science, (2) behavioral audiology, (3) electrophysiological audiology, (4) principles and practice of hearing devices, (5) health and rehabilitation of the auditory and balance systems, and (6) auditory and speech communication disorders (including professional ethics). Each subject included 50 multiple-choice questions, with the exception of behavioral audiology, which had 49 questions, amounting to a total of 299 questions. Results: The correct answer rates across the 6 subjects were as follows: 88\% for basic auditory science, 63\% for behavioral audiology, 58\% for electrophysiological audiology, 72\% for principles and practice of hearing devices, 80\% for health and rehabilitation of the auditory and balance systems, and 86\% for auditory and speech communication disorders (including professional ethics). The overall accuracy rate for the 299 questions was 75\%, which surpasses the examination's passing criteria of an average 60\% accuracy rate across all subjects. A comprehensive review of ChatGPT-4's responses indicated that incorrect answers were predominantly due to information errors. Conclusions: ChatGPT-4 demonstrated a robust performance in the Taiwan Audiologist Qualification Examination, showcasing effective logical reasoning skills. Our results suggest that with enhanced information accuracy, ChatGPT-4's performance could be further improved. This study indicates significant potential for the application of AI chatbots in audiology and hearing care services. ", doi="10.2196/55595", url="https://mededu.jmir.org/2024/1/e55595" } @Article{info:doi/10.2196/56764, author="Choudhury, Avishek and Chaudhry, Zaira", title="Large Language Models and User Trust: Consequence of Self-Referential Learning Loop and the Deskilling of Health Care Professionals", journal="J Med Internet Res", year="2024", month="Apr", day="25", volume="26", pages="e56764", keywords="trust", keywords="ChatGPT", keywords="human factors", keywords="healthcare", keywords="LLMs", keywords="large language models", keywords="LLM user trust", keywords="AI accountability", keywords="artificial intelligence", keywords="AI technology", keywords="technologies", keywords="effectiveness", keywords="policy", keywords="medical student", keywords="medical students", keywords="risk factor", keywords="quality of care", keywords="healthcare professional", keywords="healthcare professionals", keywords="human element", doi="10.2196/56764", url="https://www.jmir.org/2024/1/e56764", url="http://www.ncbi.nlm.nih.gov/pubmed/38662419" } @Article{info:doi/10.2196/52483, author="Wu, Yijun and Zheng, Yue and Feng, Baijie and Yang, Yuqi and Kang, Kai and Zhao, Ailin", title="Embracing ChatGPT for Medical Education: Exploring Its Impact on Doctors and Medical Students", journal="JMIR Med Educ", year="2024", month="Apr", day="10", volume="10", pages="e52483", keywords="artificial intelligence", keywords="AI", keywords="ChatGPT", keywords="medical education", keywords="doctors", keywords="medical students", doi="10.2196/52483", url="https://mededu.jmir.org/2024/1/e52483", url="http://www.ncbi.nlm.nih.gov/pubmed/38598263" } @Article{info:doi/10.2196/52674, author="Fukuzawa, Fumitoshi and Yanagita, Yasutaka and Yokokawa, Daiki and Uchida, Shun and Yamashita, Shiho and Li, Yu and Shikino, Kiyoshi and Tsukamoto, Tomoko and Noda, Kazutaka and Uehara, Takanori and Ikusaka, Masatomi", title="Importance of Patient History in Artificial Intelligence--Assisted Medical Diagnosis: Comparison 
Study", journal="JMIR Med Educ", year="2024", month="Apr", day="8", volume="10", pages="e52674", keywords="medical diagnosis", keywords="ChatGPT", keywords="AI in medicine", keywords="diagnostic accuracy", keywords="patient history", keywords="medical history", keywords="artificial intelligence", keywords="AI", keywords="physical examination", keywords="physical examinations", keywords="laboratory investigation", keywords="laboratory investigations", keywords="mHealth", keywords="accuracy", keywords="public health", keywords="United States", keywords="AI diagnosis", keywords="treatment", keywords="male", keywords="female", keywords="child", keywords="children", keywords="youth", keywords="adolescent", keywords="adolescents", keywords="teen", keywords="teens", keywords="teenager", keywords="teenagers", keywords="older adult", keywords="older adults", keywords="elder", keywords="elderly", keywords="older person", keywords="older people", keywords="investigative", keywords="mobile health", keywords="digital health", abstract="Background: Medical history contributes approximately 80\% to a diagnosis, although physical examinations and laboratory investigations increase a physician's confidence in the medical diagnosis. The concept of artificial intelligence (AI) was first proposed more than 70 years ago. Recently, its role in various fields of medicine has grown remarkably. However, no studies have evaluated the importance of patient history in AI-assisted medical diagnosis. Objective: This study explored the contribution of patient history to AI-assisted medical diagnoses and assessed the accuracy of ChatGPT in reaching a clinical diagnosis based on the medical history provided. Methods: Using clinical vignettes of 30 cases identified in The BMJ, we evaluated the accuracy of diagnoses generated by ChatGPT. We compared the diagnoses made by ChatGPT based solely on medical history with the correct diagnoses. We also compared the diagnoses made by ChatGPT after incorporating additional physical examination findings and laboratory data alongside history with the correct diagnoses. Results: ChatGPT accurately diagnosed 76.6\% (23/30) of the cases with only the medical history, consistent with previous research targeting physicians. We also found that this rate was 93.3\% (28/30) when additional information was included. Conclusions: Although adding additional information improves diagnostic accuracy, patient history remains a significant factor in AI-assisted medical diagnosis. Thus, when using AI in medical diagnosis, it is crucial to include pertinent and correct patient histories for an accurate diagnosis. Our findings emphasize the continued significance of patient history in clinical diagnoses in this age and highlight the need for its integration into AI-assisted medical diagnosis systems. 
", doi="10.2196/52674", url="https://mededu.jmir.org/2024/1/e52674" } @Article{info:doi/10.2196/57054, author="Noda, Masao and Ueno, Takayoshi and Koshu, Ryota and Takaso, Yuji and Shimada, Dias Mari and Saito, Chizu and Sugimoto, Hisashi and Fushiki, Hiroaki and Ito, Makoto and Nomura, Akihiro and Yoshizaki, Tomokazu", title="Performance of GPT-4V in Answering the Japanese Otolaryngology Board Certification Examination Questions: Evaluation Study", journal="JMIR Med Educ", year="2024", month="Mar", day="28", volume="10", pages="e57054", keywords="artificial intelligence", keywords="GPT-4v", keywords="large language model", keywords="otolaryngology", keywords="GPT", keywords="ChatGPT", keywords="LLM", keywords="LLMs", keywords="language model", keywords="language models", keywords="head", keywords="respiratory", keywords="ENT: ear", keywords="nose", keywords="throat", keywords="neck", keywords="NLP", keywords="natural language processing", keywords="image", keywords="images", keywords="exam", keywords="exams", keywords="examination", keywords="examinations", keywords="answer", keywords="answers", keywords="answering", keywords="response", keywords="responses", abstract="Background: Artificial intelligence models can learn from medical literature and clinical cases and generate answers that rival human experts. However, challenges remain in the analysis of complex data containing images and diagrams. Objective: This study aims to assess the answering capabilities and accuracy of ChatGPT-4 Vision (GPT-4V) for a set of 100 questions, including image-based questions, from the 2023 otolaryngology board certification examination. Methods: Answers to 100 questions from the 2023 otolaryngology board certification examination, including image-based questions, were generated using GPT-4V. The accuracy rate was evaluated using different prompts, and the presence of images, clinical area of the questions, and variations in the answer content were examined. Results: The accuracy rate for text-only input was, on average, 24.7\% but improved to 47.3\% with the addition of English translation and prompts (P<.001). The average nonresponse rate for text-only input was 46.3\%; this decreased to 2.7\% with the addition of English translation and prompts (P<.001). The accuracy rate was lower for image-based questions than for text-only questions across all types of input, with a relatively high nonresponse rate. General questions and questions from the fields of head and neck allergies and nasal allergies had relatively high accuracy rates, which increased with the addition of translation and prompts. In terms of content, questions related to anatomy had the highest accuracy rate. For all content types, the addition of translation and prompts increased the accuracy rate. As for the performance based on image-based questions, the average of correct answer rate with text-only input was 30.4\%, and that with text-plus-image input was 41.3\% (P=.02). Conclusions: Examination of artificial intelligence's answering capabilities for the otolaryngology board certification examination improves our understanding of its potential and limitations in this field. Although the improvement was noted with the addition of translation and prompts, the accuracy rate for image-based questions was lower than that for text-based questions, suggesting room for improvement in GPT-4V at this stage. Furthermore, text-plus-image input answers a higher rate in image-based questions. 
Our findings imply the usefulness and potential of GPT-4V in medicine; however, future consideration of safe use methods is needed. ", doi="10.2196/57054", url="https://mededu.jmir.org/2024/1/e57054", url="http://www.ncbi.nlm.nih.gov/pubmed/38546736" } @Article{info:doi/10.2196/49964, author="Gandhi, P. Aravind and Joesph, Karen Felista and Rajagopal, Vineeth and Aparnavi, P. and Katkuri, Sushma and Dayama, Sonal and Satapathy, Prakasini and Khatib, Nazli Mahalaqua and Gaidhane, Shilpa and Zahiruddin, Syed Quazi and Behera, Ashish", title="Performance of ChatGPT on the India Undergraduate Community Medicine Examination: Cross-Sectional Study", journal="JMIR Form Res", year="2024", month="Mar", day="25", volume="8", pages="e49964", keywords="artificial intelligence", keywords="ChatGPT", keywords="community medicine", keywords="India", keywords="large language model", keywords="medical education", keywords="digitalization", abstract="Background: Medical students may increasingly use large language models (LLMs) in their learning. ChatGPT is an LLM at the forefront of this new development in medical education with the capacity to respond to multidisciplinary questions. Objective: The aim of this study was to evaluate the ability of ChatGPT 3.5 to complete the Indian undergraduate medical examination in the subject of community medicine. We further compared ChatGPT scores with the scores obtained by the students. Methods: The study was conducted at a publicly funded medical college in Hyderabad, India. The study was based on the internal assessment examination conducted in January 2023 for students in the Bachelor of Medicine and Bachelor of Surgery Final Year--Part I program; the examination of focus included 40 questions (divided between two papers) from the community medicine subject syllabus. Each paper had three sections with different weightage of marks for each section: section one had two long essay--type questions worth 15 marks each, section two had 8 short essay--type questions worth 5 marks each, and section three had 10 short-answer questions worth 3 marks each. The same questions were administered as prompts to ChatGPT 3.5 and the responses were recorded. Apart from scoring ChatGPT responses, two independent evaluators explored the responses to each question to further analyze their quality with regard to three subdomains: relevancy, coherence, and completeness. Each question was scored in these subdomains on a Likert scale of 1-5. The average of the two evaluators was taken as the subdomain score of the question. The proportion of questions with a score of at least 50\% of the maximum score (5) in each subdomain was calculated. Results: ChatGPT 3.5 scored 72.3\% on paper 1 and 61\% on paper 2. The mean score of the 94 students was 43\% on paper 1 and 45\% on paper 2. The responses of ChatGPT 3.5 were also rated to be satisfactorily relevant, coherent, and complete for most of the questions (>80\%). Conclusions: ChatGPT 3.5 appears to have substantial and sufficient knowledge to understand and answer the Indian medical undergraduate examination in the subject of community medicine. ChatGPT may be introduced to students to enable the self-directed learning of community medicine in pilot mode. However, faculty oversight will be required as ChatGPT is still in the initial stages of development, and thus its potential and reliability of medical content from the Indian context need to be further explored comprehensively. 
", doi="10.2196/49964", url="https://formative.jmir.org/2024/1/e49964", url="http://www.ncbi.nlm.nih.gov/pubmed/38526538" } @Article{info:doi/10.2196/51151, author="Magalh{\~a}es Araujo, Sabrina and Cruz-Correia, Ricardo", title="Incorporating ChatGPT in Medical Informatics Education: Mixed Methods Study on Student Perceptions and Experiential Integration Proposals", journal="JMIR Med Educ", year="2024", month="Mar", day="20", volume="10", pages="e51151", keywords="education", keywords="medical informatics", keywords="artificial intelligence", keywords="AI", keywords="generative language model", keywords="ChatGPT", abstract="Background: The integration of artificial intelligence (AI) technologies, such as ChatGPT, in the educational landscape has the potential to enhance the learning experience of medical informatics students and prepare them for using AI in professional settings. The incorporation of AI in classes aims to develop critical thinking by encouraging students to interact with ChatGPT and critically analyze the responses generated by the chatbot. This approach also helps students develop important skills in the field of biomedical and health informatics to enhance their interaction with AI tools. Objective: The aim of the study is to explore the perceptions of students regarding the use of ChatGPT as a learning tool in their educational context and provide professors with examples of prompts for incorporating ChatGPT into their teaching and learning activities, thereby enhancing the educational experience for students in medical informatics courses. Methods: This study used a mixed methods approach to gain insights from students regarding the use of ChatGPT in education. To accomplish this, a structured questionnaire was applied to evaluate students' familiarity with ChatGPT, gauge their perceptions of its use, and understand their attitudes toward its use in academic and learning tasks. Learning outcomes of 2 courses were analyzed to propose ChatGPT's incorporation in master's programs in medicine and medical informatics. Results: The majority of students expressed satisfaction with the use of ChatGPT in education, finding it beneficial for various purposes, including generating academic content, brainstorming ideas, and rewriting text. While some participants raised concerns about potential biases and the need for informed use, the overall perception was positive. Additionally, the study proposed integrating ChatGPT into 2 specific courses in the master's programs in medicine and medical informatics. The incorporation of ChatGPT was envisioned to enhance student learning experiences and assist in project planning, programming code generation, examination preparation, workflow exploration, and technical interview preparation, thus advancing medical informatics education. In medical teaching, it will be used as an assistant for simplifying the explanation of concepts and solving complex problems, as well as for generating clinical narratives and patient simulators. Conclusions: The study's valuable insights into medical faculty students' perspectives and integration proposals for ChatGPT serve as an informative guide for professors aiming to enhance medical informatics education. The research delves into the potential of ChatGPT, emphasizes the necessity of collaboration in academic environments, identifies subject areas with discernible benefits, and underscores its transformative role in fostering innovative and engaging learning experiences. 
The envisaged proposals hold promise in empowering future health care professionals to work in the rapidly evolving era of digital health care. ", doi="10.2196/51151", url="https://mededu.jmir.org/2024/1/e51151", url="http://www.ncbi.nlm.nih.gov/pubmed/38506920" } @Article{info:doi/10.2196/54393, author="Nakao, Takahiro and Miki, Soichiro and Nakamura, Yuta and Kikuchi, Tomohiro and Nomura, Yukihiro and Hanaoka, Shouhei and Yoshikawa, Takeharu and Abe, Osamu", title="Capability of GPT-4V(ision) in the Japanese National Medical Licensing Examination: Evaluation Study", journal="JMIR Med Educ", year="2024", month="Mar", day="12", volume="10", pages="e54393", keywords="AI", keywords="artificial intelligence", keywords="LLM", keywords="large language model", keywords="language model", keywords="language models", keywords="ChatGPT", keywords="GPT-4", keywords="GPT-4V", keywords="generative pretrained transformer", keywords="image", keywords="images", keywords="imaging", keywords="response", keywords="responses", keywords="exam", keywords="examination", keywords="exams", keywords="examinations", keywords="answer", keywords="answers", keywords="NLP", keywords="natural language processing", keywords="chatbot", keywords="chatbots", keywords="conversational agent", keywords="conversational agents", keywords="medical education", abstract="Background: Previous research applying large language models (LLMs) to medicine was focused on text-based information. Recently, multimodal variants of LLMs acquired the capability of recognizing images. Objective: We aim to evaluate the image recognition capability of generative pretrained transformer (GPT)-4V, a recent multimodal LLM developed by OpenAI, in the medical field by testing how visual information affects its performance to answer questions in the 117th Japanese National Medical Licensing Examination. Methods: We focused on 108 questions that had 1 or more images as part of a question and presented GPT-4V with the same questions under two conditions: (1) with both the question text and associated images and (2) with the question text only. We then compared the difference in accuracy between the 2 conditions using the exact McNemar test. Results: Among the 108 questions with images, GPT-4V's accuracy was 68\% (73/108) when presented with images and 72\% (78/108) when presented without images (P=.36). For the 2 question categories, clinical and general, the accuracies with and those without images were 71\% (70/98) versus 78\% (76/98; P=.21) and 30\% (3/10) versus 20\% (2/10; P>.99), respectively. Conclusions: The additional information from the images did not significantly improve the performance of GPT-4V in the Japanese National Medical Licensing Examination. 
", doi="10.2196/54393", url="https://mededu.jmir.org/2024/1/e54393", url="http://www.ncbi.nlm.nih.gov/pubmed/38470459" } @Article{info:doi/10.2196/51426, author="Willms, Amanda and Liu, Sam", title="Exploring the Feasibility of Using ChatGPT to Create Just-in-Time Adaptive Physical Activity mHealth Intervention Content: Case Study", journal="JMIR Med Educ", year="2024", month="Feb", day="29", volume="10", pages="e51426", keywords="ChatGPT", keywords="digital health", keywords="mobile health", keywords="mHealth", keywords="physical activity", keywords="application", keywords="mobile app", keywords="mobile apps", keywords="content creation", keywords="behavior change", keywords="app design", abstract="Background: Achieving physical activity (PA) guidelines' recommendation of 150 minutes of moderate-to-vigorous PA per week has been shown to reduce the risk of many chronic conditions. Despite the overwhelming evidence in this field, PA levels remain low globally. By creating engaging mobile health (mHealth) interventions through strategies such as just-in-time adaptive interventions (JITAIs) that are tailored to an individual's dynamic state, there is potential to increase PA levels. However, generating personalized content can take a long time due to various versions of content required for the personalization algorithms. ChatGPT presents an incredible opportunity to rapidly produce tailored content; however, there is a lack of studies exploring its feasibility. Objective: This study aimed to (1) explore the feasibility of using ChatGPT to create content for a PA JITAI mobile app and (2) describe lessons learned and future recommendations for using ChatGPT in the development of mHealth JITAI content. Methods: During phase 1, we used Pathverse, a no-code app builder, and ChatGPT to develop a JITAI app to help parents support their child's PA levels. The intervention was developed based on the Multi-Process Action Control (M-PAC) framework, and the necessary behavior change techniques targeting the M-PAC constructs were implemented in the app design to help parents support their child's PA. The acceptability of using ChatGPT for this purpose was discussed to determine its feasibility. In phase 2, we summarized the lessons we learned during the JITAI content development process using ChatGPT and generated recommendations to inform future similar use cases. Results: In phase 1, by using specific prompts, we efficiently generated content for 13 lessons relating to increasing parental support for their child's PA following the M-PAC framework. It was determined that using ChatGPT for this case study to develop PA content for a JITAI was acceptable. In phase 2, we summarized our recommendations into the following six steps when using ChatGPT to create content for mHealth behavior interventions: (1) determine target behavior, (2) ground the intervention in behavior change theory, (3) design the intervention structure, (4) input intervention structure and behavior change constructs into ChatGPT, (5) revise the ChatGPT response, and (6) customize the response to be used in the intervention. Conclusions: ChatGPT offers a remarkable opportunity for rapid content creation in the context of an mHealth JITAI. Although our case study demonstrated that ChatGPT was acceptable, it is essential to approach its use, along with other language models, with caution. Before delivering content to population groups, expert review is crucial to ensure accuracy and relevancy. 
Future research and application of these guidelines are imperative as we deepen our understanding of ChatGPT and its interactions with human input. ", doi="10.2196/51426", url="https://mededu.jmir.org/2024/1/e51426", url="http://www.ncbi.nlm.nih.gov/pubmed/38421689" } @Article{info:doi/10.2196/51523, author="Farhat, Faiza and Chaudhry, Moalla Beenish and Nadeem, Mohammad and Sohail, Saquib Shahab and Madsen, {\O}ivind Dag", title="Evaluating Large Language Models for the National Premedical Exam in India: Comparative Analysis of GPT-3.5, GPT-4, and Bard", journal="JMIR Med Educ", year="2024", month="Feb", day="21", volume="10", pages="e51523", keywords="accuracy", keywords="AI model", keywords="artificial intelligence", keywords="Bard", keywords="ChatGPT", keywords="educational task", keywords="GPT-4", keywords="Generative Pre-trained Transformers", keywords="large language models", keywords="medical education, medical exam", keywords="natural language processing", keywords="performance", keywords="premedical exams", keywords="suitability", abstract="Background: Large language models (LLMs) have revolutionized natural language processing with their ability to generate human-like text through extensive training on large data sets. These models, including Generative Pre-trained Transformers (GPT)-3.5 (OpenAI), GPT-4 (OpenAI), and Bard (Google LLC), find applications beyond natural language processing, attracting interest from academia and industry. Students are actively leveraging LLMs to enhance learning experiences and prepare for high-stakes exams, such as the National Eligibility cum Entrance Test (NEET) in India. Objective: This comparative analysis aims to evaluate the performance of GPT-3.5, GPT-4, and Bard in answering NEET-2023 questions. Methods: In this paper, we evaluated the performance of the 3 mainstream LLMs, namely GPT-3.5, GPT-4, and Google Bard, in answering questions related to the NEET-2023 exam. The questions of the NEET were provided to these artificial intelligence models, and the responses were recorded and compared against the correct answers from the official answer key. Consensus was used to evaluate the performance of all 3 models. Results: It was evident that GPT-4 passed the entrance test with flying colors (300/700, 42.9\%), showcasing exceptional performance. On the other hand, GPT-3.5 managed to meet the qualifying criteria, but with a substantially lower score (145/700, 20.7\%). However, Bard (115/700, 16.4\%) failed to meet the qualifying criteria and did not pass the test. GPT-4 demonstrated consistent superiority over Bard and GPT-3.5 in all 3 subjects. Specifically, GPT-4 achieved accuracy rates of 73\% (29/40) in physics, 44\% (16/36) in chemistry, and 51\% (50/99) in biology. Conversely, GPT-3.5 attained an accuracy rate of 45\% (18/40) in physics, 33\% (13/26) in chemistry, and 34\% (34/99) in biology. The accuracy consensus metric showed that the matching responses between GPT-4 and Bard, as well as GPT-4 and GPT-3.5, had higher incidences of being correct, at 0.56 and 0.57, respectively, compared to the matching responses between Bard and GPT-3.5, which stood at 0.42. When all 3 models were considered together, their matching responses reached the highest accuracy consensus of 0.59. Conclusions: The study's findings provide valuable insights into the performance of GPT-3.5, GPT-4, and Bard in answering NEET-2023 questions. GPT-4 emerged as the most accurate model, highlighting its potential for educational applications. 
Cross-checking responses across models may result in confusion as the compared models (as duos or a trio) tend to agree on only a little over half of the correct responses. Using GPT-4 as one of the compared models will result in higher accuracy consensus. The results underscore the suitability of LLMs for high-stakes exams and their positive impact on education. Additionally, the study establishes a benchmark for evaluating and enhancing LLMs' performance in educational tasks, promoting responsible and informed use of these models in diverse learning environments. ", doi="10.2196/51523", url="https://mededu.jmir.org/2024/1/e51523", url="http://www.ncbi.nlm.nih.gov/pubmed/38381486" } @Article{info:doi/10.2196/46500, author="Abid, Areeba and Murugan, Avinash and Banerjee, Imon and Purkayastha, Saptarshi and Trivedi, Hari and Gichoya, Judy", title="AI Education for Fourth-Year Medical Students: Two-Year Experience of a Web-Based, Self-Guided Curriculum and Mixed Methods Study", journal="JMIR Med Educ", year="2024", month="Feb", day="20", volume="10", pages="e46500", keywords="medical education", keywords="machine learning", keywords="artificial intelligence", keywords="elective curriculum", keywords="medical student", keywords="student", keywords="students", keywords="elective", keywords="electives", keywords="curricula", keywords="curriculum", keywords="lesson plan", keywords="lesson plans", keywords="educators", keywords="educator", keywords="teacher", keywords="teachers", keywords="teaching", keywords="computer programming", keywords="programming", keywords="coding", keywords="programmer", keywords="programmers", keywords="self guided", keywords="self directed", abstract="Background: Artificial intelligence (AI) and machine learning (ML) are poised to have a substantial impact in the health care space. While a plethora of web-based resources exist to teach programming skills and ML model development, there are few introductory curricula specifically tailored to medical students without a background in data science or programming. Programs that do exist are often restricted to a specific specialty. Objective: We hypothesized that a 1-month elective for fourth-year medical students, composed of high-quality existing web-based resources and a project-based structure, would empower students to learn about the impact of AI and ML in their chosen specialty and begin contributing to innovation in their field of interest. This study aims to evaluate the success of this elective in improving self-reported confidence scores in AI and ML. The authors also share our curriculum with other educators who may be interested in its adoption. Methods: This elective was offered in 2 tracks: technical (for students who were already competent programmers) and nontechnical (with no technical prerequisites, focusing on building a conceptual understanding of AI and ML). Students established a conceptual foundation of knowledge using curated web-based resources and relevant research papers, and were then tasked with completing 3 projects in their chosen specialty: a data set analysis, a literature review, and an AI project proposal. The project-based nature of the elective was designed to be self-guided and flexible to each student's interest area and career goals. Students' success was measured by self-reported confidence in AI and ML skills in pre and postsurveys. Qualitative feedback on students' experiences was also collected. 
Results: This web-based, self-directed elective was offered on a pass-or-fail basis each month to fourth-year students at Emory University School of Medicine beginning in May 2021. As of June 2022, a total of 19 students had successfully completed the elective, representing a wide range of chosen specialties: diagnostic radiology (n=3), general surgery (n=1), internal medicine (n=5), neurology (n=2), obstetrics and gynecology (n=1), ophthalmology (n=1), orthopedic surgery (n=1), otolaryngology (n=2), pathology (n=2), and pediatrics (n=1). Students' self-reported confidence scores for AI and ML rose by 66\% after this 1-month elective. In qualitative surveys, students overwhelmingly reported enthusiasm and satisfaction with the course and commented that the self-direction and flexibility and the project-based design of the course were essential. Conclusions: Course participants were successful in diving deep into applications of AI in their widely-ranging specialties, produced substantial project deliverables, and generally reported satisfaction with their elective experience. The authors are hopeful that a brief, 1-month investment in AI and ML education during medical school will empower this next generation of physicians to pave the way for AI and ML innovation in health care. ", doi="10.2196/46500", url="https://mededu.jmir.org/2024/1/e46500", url="http://www.ncbi.nlm.nih.gov/pubmed/38376896" } @Article{info:doi/10.2196/55368, author="Weidener, Lukas and Fischer, Michael", title="Proposing a Principle-Based Approach for Teaching AI Ethics in Medical Education", journal="JMIR Med Educ", year="2024", month="Feb", day="9", volume="10", pages="e55368", keywords="artificial intelligence", keywords="AI", keywords="ethics", keywords="artificial intelligence ethics", keywords="AI ethics", keywords="medical education", keywords="medicine", keywords="medical artificial intelligence ethics", keywords="medical AI ethics", keywords="medical ethics", keywords="public health ethics", doi="10.2196/55368", url="https://mededu.jmir.org/2024/1/e55368", url="http://www.ncbi.nlm.nih.gov/pubmed/38285931" } @Article{info:doi/10.2196/50705, author="Gray, Megan and Baird, Austin and Sawyer, Taylor and James, Jasmine and DeBroux, Thea and Bartlett, Michelle and Krick, Jeanne and Umoren, Rachel", title="Increasing Realism and Variety of Virtual Patient Dialogues for Prenatal Counseling Education Through a Novel Application of ChatGPT: Exploratory Observational Study", journal="JMIR Med Educ", year="2024", month="Feb", day="1", volume="10", pages="e50705", keywords="prenatal counseling", keywords="virtual health", keywords="virtual patient", keywords="simulation", keywords="neonatology", keywords="ChatGPT", keywords="AI", keywords="artificial intelligence", abstract="Background: Using virtual patients, facilitated by natural language processing, provides a valuable educational experience for learners. Generating a large, varied sample of realistic and appropriate responses for virtual patients is challenging. Artificial intelligence (AI) programs can be a viable source for these responses, but their utility for this purpose has not been explored. Objective: In this study, we explored the effectiveness of generative AI (ChatGPT) in developing realistic virtual standardized patient dialogues to teach prenatal counseling skills. Methods: ChatGPT was prompted to generate a list of common areas of concern and questions that families expecting preterm delivery at 24 weeks gestation might ask during prenatal counseling. 
ChatGPT was then prompted to generate 2 role-plays with dialogues between a parent expecting a potential preterm delivery at 24 weeks and their counseling physician using each of the example questions. The prompt was repeated for 2 unique role-plays: one parent was characterized as anxious and the other as having low trust in the medical system. Role-play scripts were exported verbatim and independently reviewed by 2 neonatologists with experience in prenatal counseling, using a scale of 1-5 on realism, appropriateness, and utility for virtual standardized patient responses. Results: ChatGPT generated 7 areas of concern, with 35 example questions used to generate role-plays. The 35 role-play transcripts generated 176 unique parent responses (median 5, IQR 4-6, per role-play) with 268 unique sentences. Expert review identified 117 (65\%) of the 176 responses as indicating an emotion, either directly or indirectly. Approximately half (98/176, 56\%) of the responses had 2 or more sentences, and half (88/176, 50\%) included at least 1 question. More than half (104/176, 58\%) of the responses from role-played parent characters described a feeling, such as being scared, worried, or concerned. The role-plays of parents with low trust in the medical system generated many unique sentences (n=50). Most of the sentences in the responses were found to be reasonably realistic (214/268, 80\%), appropriate for variable prenatal counseling conversation paths (233/268, 87\%), and usable without more than a minimal modification in a virtual patient program (169/268, 63\%). Conclusions: Generative AI programs, such as ChatGPT, may provide a viable source of training materials to expand virtual patient programs, with careful attention to the concerns and questions of patients and families. Given the potential for unrealistic or inappropriate statements and questions, an expert should review AI chat outputs before deploying them in an educational program. ", doi="10.2196/50705", url="https://mededu.jmir.org/2024/1/e50705", url="http://www.ncbi.nlm.nih.gov/pubmed/38300696" } @Article{info:doi/10.2196/50842, author="Haddad, Firas and Saade, S. Joanna", title="Performance of ChatGPT on Ophthalmology-Related Questions Across Various Examination Levels: Observational Study", journal="JMIR Med Educ", year="2024", month="Jan", day="18", volume="10", pages="e50842", keywords="ChatGPT", keywords="artificial intelligence", keywords="AI", keywords="board examinations", keywords="ophthalmology", keywords="testing", abstract="Background: ChatGPT and large language models have gained attention recently for their ability to answer questions on various examinations across various disciplines. The question of whether ChatGPT could be used to aid in medical education is yet to be answered, particularly in the field of ophthalmology. Objective: The aim of this study is to assess the ability of ChatGPT-3.5 (GPT-3.5) and ChatGPT-4.0 (GPT-4.0) to answer ophthalmology-related questions across different levels of ophthalmology training. Methods: Questions from the United States Medical Licensing Examination (USMLE) steps 1 (n=44), 2 (n=60), and 3 (n=28) were extracted from AMBOSS, and 248 questions (64 easy, 122 medium, and 62 difficult questions) were extracted from the book, Ophthalmology Board Review Q\&A, for the Ophthalmic Knowledge Assessment Program and the Board of Ophthalmology (OB) Written Qualifying Examination (WQE). Questions were prompted identically and inputted to GPT-3.5 and GPT-4.0. 
Results: GPT-3.5 achieved a total of 55\% (n=210) of correct answers, while GPT-4.0 achieved a total of 70\% (n=270) of correct answers. GPT-3.5 answered 75\% (n=33) of questions correctly in USMLE step 1, 73.33\% (n=44) in USMLE step 2, 60.71\% (n=17) in USMLE step 3, and 46.77\% (n=116) in the OB-WQE. GPT-4.0 answered 70.45\% (n=31) of questions correctly in USMLE step 1, 90.32\% (n=56) in USMLE step 2, 96.43\% (n=27) in USMLE step 3, and 62.90\% (n=156) in the OB-WQE. GPT-3.5 performed poorer as examination levels advanced (P<.001), while GPT-4.0 performed better on USMLE steps 2 and 3 and worse on USMLE step 1 and the OB-WQE (P<.001). The coefficient of correlation (r) between ChatGPT answering correctly and human users answering correctly was 0.21 (P=.01) for GPT-3.5 as compared to --0.31 (P<.001) for GPT-4.0. GPT-3.5 performed similarly across difficulty levels, while GPT-4.0 performed more poorly with an increase in the difficulty level. Both GPT models performed significantly better on certain topics than on others. Conclusions: ChatGPT is far from being considered a part of mainstream medical education. Future models with higher accuracy are needed for the platform to be effective in medical education. ", doi="10.2196/50842", url="https://mededu.jmir.org/2024/1/e50842", url="http://www.ncbi.nlm.nih.gov/pubmed/38236632" } @Article{info:doi/10.2196/51388, author="Kuo, I-Hsien Nicholas and Perez-Concha, Oscar and Hanly, Mark and Mnatzaganian, Emmanuel and Hao, Brandon and Di Sipio, Marcus and Yu, Guolin and Vanjara, Jash and Valerie, Cerelia Ivy and de Oliveira Costa, Juliana and Churches, Timothy and Lujic, Sanja and Hegarty, Jo and Jorm, Louisa and Barbieri, Sebastiano", title="Enriching Data Science and Health Care Education: Application and Impact of Synthetic Data Sets Through the Health Gym Project", journal="JMIR Med Educ", year="2024", month="Jan", day="16", volume="10", pages="e51388", keywords="medical education", keywords="generative model", keywords="generative adversarial networks", keywords="privacy", keywords="antiretroviral therapy (ART)", keywords="human immunodeficiency virus (HIV)", keywords="data science", keywords="educational purposes", keywords="accessibility", keywords="data privacy", keywords="data sets", keywords="sepsis", keywords="hypotension", keywords="HIV", keywords="science education", keywords="health care AI", doi="10.2196/51388", url="https://mededu.jmir.org/2024/1/e51388", url="http://www.ncbi.nlm.nih.gov/pubmed/38227356" } @Article{info:doi/10.2196/51148, author="Knoedler, Leonard and Alfertshofer, Michael and Knoedler, Samuel and Hoch, C. Cosima and Funk, F. Paul and Cotofana, Sebastian and Maheta, Bhagvat and Frank, Konstantin and Br{\'e}bant, Vanessa and Prantl, Lukas and Lamby, Philipp", title="Pure Wisdom or Potemkin Villages? A Comparison of ChatGPT 3.5 and ChatGPT 4 on USMLE Step 3 Style Questions: Quantitative Analysis", journal="JMIR Med Educ", year="2024", month="Jan", day="5", volume="10", pages="e51148", keywords="ChatGPT", keywords="United States Medical Licensing Examination", keywords="artificial intelligence", keywords="USMLE", keywords="USMLE Step 1", keywords="OpenAI", keywords="medical education", keywords="clinical decision-making", abstract="Background: The United States Medical Licensing Examination (USMLE) has been critical in medical education since 1992, testing various aspects of a medical student's knowledge and skills through different steps, based on their training level. 
Artificial intelligence (AI) tools, including chatbots like ChatGPT, are emerging technologies with potential applications in medicine. However, comprehensive studies analyzing ChatGPT's performance on USMLE Step 3 in large-scale scenarios and comparing different versions of ChatGPT are limited. Objective: This paper aimed to analyze ChatGPT's performance on USMLE Step 3 practice test questions to better elucidate the strengths and weaknesses of AI use in medical education and deduce evidence-based strategies to counteract AI cheating. Methods: A total of 2069 USMLE Step 3 practice questions were extracted from the AMBOSS study platform. After excluding 229 image-based questions, a total of 1840 text-based questions were further categorized and entered into ChatGPT 3.5, while a subset of 229 questions was entered into ChatGPT 4. Responses were recorded, and the accuracy of ChatGPT answers as well as its performance in different test question categories and for different difficulty levels were compared between both versions. Results: Overall, ChatGPT 4 demonstrated a statistically significant superior performance compared to ChatGPT 3.5, achieving an accuracy of 84.7\% (194/229) and 56.9\% (1047/1840), respectively. A noteworthy correlation was observed between the length of test questions and the performance of ChatGPT 3.5 ($\rho$=--0.069; P=.003), which was absent in ChatGPT 4 (P=.87). Additionally, the difficulty of test questions, as categorized by AMBOSS hammer ratings, showed a statistically significant correlation with performance for both ChatGPT versions, with $\rho$=--0.289 for ChatGPT 3.5 and $\rho$=--0.344 for ChatGPT 4. ChatGPT 4 surpassed ChatGPT 3.5 in all levels of test question difficulty, except for the 2 highest difficulty tiers (4 and 5 hammers), where statistical significance was not reached. Conclusions: In this study, ChatGPT 4 demonstrated remarkable proficiency in taking the USMLE Step 3, with an accuracy rate of 84.7\% (194/229), outshining ChatGPT 3.5 with an accuracy rate of 56.9\% (1047/1840). Although ChatGPT 4 performed exceptionally, it encountered difficulties in questions requiring the application of theoretical concepts, particularly in cardiology and neurology. These insights are pivotal for the development of examination strategies that are resilient to AI and underline the promising role of AI in the realm of medical education and diagnostics. 
", doi="10.2196/51148", url="https://mededu.jmir.org/2024/1/e51148", url="http://www.ncbi.nlm.nih.gov/pubmed/38180782" } @Article{info:doi/10.2196/52202, author="Watari, Takashi and Takagi, Soshi and Sakaguchi, Kota and Nishizaki, Yuji and Shimizu, Taro and Yamamoto, Yu and Tokuda, Yasuharu", title="Performance Comparison of ChatGPT-4 and Japanese Medical Residents in the General Medicine In-Training Examination: Comparison Study", journal="JMIR Med Educ", year="2023", month="Dec", day="6", volume="9", pages="e52202", keywords="ChatGPT", keywords="artificial intelligence", keywords="medical education", keywords="clinical training", keywords="non-English language", keywords="ChatGPT-4", keywords="Japan", keywords="Japanese", keywords="Asia", keywords="Asian", keywords="exam", keywords="examination", keywords="exams", keywords="examinations", keywords="NLP", keywords="natural language processing", keywords="LLM", keywords="language model", keywords="language models", keywords="performance", keywords="response", keywords="responses", keywords="answer", keywords="answers", keywords="chatbot", keywords="chatbots", keywords="conversational agent", keywords="conversational agents", keywords="reasoning", keywords="clinical", keywords="GM-ITE", keywords="self-assessment", keywords="residency programs", abstract="Background: The reliability of GPT-4, a state-of-the-art expansive language model specializing in clinical reasoning and medical knowledge, remains largely unverified across non-English languages. Objective: This study aims to compare fundamental clinical competencies between Japanese residents and GPT-4 by using the General Medicine In-Training Examination (GM-ITE). Methods: We used the GPT-4 model provided by OpenAI and the GM-ITE examination questions for the years 2020, 2021, and 2022 to conduct a comparative analysis. This analysis focused on evaluating the performance of individuals who were concluding their second year of residency in comparison to that of GPT-4. Given the current abilities of GPT-4, our study included only single-choice exam questions, excluding those involving audio, video, or image data. The assessment included 4 categories: general theory (professionalism and medical interviewing), symptomatology and clinical reasoning, physical examinations and clinical procedures, and specific diseases. Additionally, we categorized the questions into 7 specialty fields and 3 levels of difficulty, which were determined based on residents' correct response rates. Results: Upon examination of 137 GM-ITE questions in Japanese, GPT-4 scores were significantly higher than the mean scores of residents (residents: 55.8\%, GPT-4: 70.1\%; P<.001). In terms of specific disciplines, GPT-4 scored 23.5 points higher in the ``specific diseases,'' 30.9 points higher in ``obstetrics and gynecology,'' and 26.1 points higher in ``internal medicine.'' In contrast, GPT-4 scores in ``medical interviewing and professionalism,'' ``general practice,'' and ``psychiatry'' were lower than those of the residents, although this discrepancy was not statistically significant. Upon analyzing scores based on question difficulty, GPT-4 scores were 17.2 points lower for easy problems (P=.007) but were 25.4 and 24.4 points higher for normal and difficult problems, respectively (P<.001). In year-on-year comparisons, GPT-4 scores were 21.7 and 21.5 points higher in the 2020 (P=.01) and 2022 (P=.003) examinations, respectively, but only 3.5 points higher in the 2021 examinations (no significant difference). 
Conclusions: In the Japanese language, GPT-4 also outperformed the average medical resident on the GM-ITE test, which was originally designed for residents. Specifically, GPT-4 demonstrated a tendency to score higher on difficult questions with low resident correct response rates and those demanding a more comprehensive understanding of diseases. However, GPT-4 scored comparatively lower on questions that residents could readily answer, such as those testing attitudes toward patients and professionalism, as well as those necessitating an understanding of context and communication. These findings highlight the strengths and limitations of artificial intelligence applications in medical education and practice. ", doi="10.2196/52202", url="https://mededu.jmir.org/2023/1/e52202", url="http://www.ncbi.nlm.nih.gov/pubmed/38055323" } @Article{info:doi/10.2196/53466, author="Shimizu, Ikuo and Kasai, Hajime and Shikino, Kiyoshi and Araki, Nobuyuki and Takahashi, Zaiya and Onodera, Misaki and Kimura, Yasuhiko and Tsukamoto, Tomoko and Yamauchi, Kazuyo and Asahina, Mayumi and Ito, Shoichi and Kawakami, Eiryo", title="Developing Medical Education Curriculum Reform Strategies to Address the Impact of Generative AI: Qualitative Study", journal="JMIR Med Educ", year="2023", month="Nov", day="30", volume="9", pages="e53466", keywords="artificial intelligence", keywords="curriculum reform", keywords="generative artificial intelligence", keywords="large language models", keywords="medical education", keywords="qualitative analysis", keywords="strengths-weaknesses-opportunities-threats (SWOT) framework", abstract="Background: Generative artificial intelligence (GAI), represented by large language models, has the potential to transform health care and medical education. In particular, GAI's impact on higher education has the potential to change students' learning experience as well as faculty's teaching. However, concerns have been raised about ethical considerations and decreased reliability of existing examinations. Furthermore, in medical education, curriculum reform is required to adapt to the revolutionary changes brought about by the integration of GAI into medical practice and research. Objective: This study analyzes the impact of GAI on medical education curricula and explores strategies for adaptation. Methods: The study was conducted in the context of faculty development at a medical school in Japan. A workshop involving faculty and students was organized, and participants were divided into groups to address two research questions: (1) How does GAI affect undergraduate medical education curricula? and (2) How should medical school curricula be reformed to address the impact of GAI? The strength, weakness, opportunity, and threat (SWOT) framework was used, and cross-SWOT matrix analysis was used to devise strategies. Further, 4 researchers conducted content analysis on the data generated during the workshop discussions. Results: The data were collected from 8 groups comprising 55 participants. Further, 5 themes about the impact of GAI on medical education curricula emerged: improvement of teaching and learning, improved access to information, inhibition of existing learning processes, problems in GAI, and changes in physicians' professionality. Positive impacts included enhanced teaching and learning efficiency and improved access to information, whereas negative impacts included concerns about reduced independent thinking and the adaptability of existing assessment methods. 
Further, GAI was perceived to change the nature of physicians' expertise. Three themes emerged from the cross-SWOT analysis for curriculum reform: (1) learning about GAI, (2) learning with GAI, and (3) learning aside from GAI. Participants recommended incorporating GAI literacy, ethical considerations, and compliance into the curriculum. Learning with GAI involved improving learning efficiency, supporting information gathering and dissemination, and facilitating patient involvement. Learning aside from GAI emphasized maintaining GAI-free learning processes, fostering higher cognitive domains of learning, and introducing more communication exercises. Conclusions: This study highlights the profound impact of GAI on medical education curricula and provides insights into curriculum reform strategies. Participants recognized the need for GAI literacy, ethical education, and adaptive learning. Further, GAI was recognized as a tool that can enhance efficiency and involve patients in education. The study also suggests that medical education should focus on competencies that GAI hardly replaces, such as clinical experience and communication. Notably, involving both faculty and students in curriculum reform discussions fosters a sense of ownership and ensures broader perspectives are encompassed. ", doi="10.2196/53466", url="https://mededu.jmir.org/2023/1/e53466", url="http://www.ncbi.nlm.nih.gov/pubmed/38032695" } @Article{info:doi/10.2196/47191, author="Surapaneni, Mohan Krishna", title="Assessing the Performance of ChatGPT in Medical Biochemistry Using Clinical Case Vignettes: Observational Study", journal="JMIR Med Educ", year="2023", month="Nov", day="7", volume="9", pages="e47191", keywords="ChatGPT", keywords="artificial intelligence", keywords="medical education", keywords="medical Biochemistry", keywords="biochemistry", keywords="chatbot", keywords="case study", keywords="case scenario", keywords="medical exam", keywords="medical examination", keywords="computer generated", abstract="Background: ChatGPT has gained global attention recently owing to its high performance in generating a wide range of information and retrieving any kind of data instantaneously. ChatGPT has also been tested for the United States Medical Licensing Examination (USMLE) and has successfully cleared it. Thus, its usability in medical education is now one of the key discussions worldwide. Objective: The objective of this study is to evaluate the performance of ChatGPT in medical biochemistry using clinical case vignettes. Methods: The performance of ChatGPT was evaluated in medical biochemistry using 10 clinical case vignettes. Clinical case vignettes were randomly selected and inputted in ChatGPT along with the response options. We tested the responses for each clinical case twice. The answers generated by ChatGPT were saved and checked using our reference material. Results: ChatGPT generated correct answers for 4 questions on the first attempt. For the other cases, there were differences in responses generated by ChatGPT in the first and second attempts. In the second attempt, ChatGPT provided correct answers for 6 questions and incorrect answers for 4 questions out of the 10 cases that were used. But, to our surprise, for case 3, different answers were obtained with multiple attempts. We believe this to have happened owing to the complexity of the case, which involved addressing various critical medical aspects related to amino acid metabolism in a balanced approach. 
Conclusions: According to the findings of our study, ChatGPT may not be considered an accurate information provider for application in medical education to improve learning and assessment. However, our study was limited by a small sample size (10 clinical case vignettes) and the use of the publicly available version of ChatGPT (version 3.5). Although artificial intelligence (AI) has the capability to transform medical education, we emphasize that data produced by such AI systems must be validated for correctness and dependability before being implemented in practice. ", doi="10.2196/47191", url="https://mededu.jmir.org/2023/1/e47191", url="http://www.ncbi.nlm.nih.gov/pubmed/37934568" } @Article{info:doi/10.2196/47532, author="Ito, Naoki and Kadomatsu, Sakina and Fujisawa, Mineto and Fukaguchi, Kiyomitsu and Ishizawa, Ryo and Kanda, Naoki and Kasugai, Daisuke and Nakajima, Mikio and Goto, Tadahiro and Tsugawa, Yusuke", title="The Accuracy and Potential Racial and Ethnic Biases of GPT-4 in the Diagnosis and Triage of Health Conditions: Evaluation Study", journal="JMIR Med Educ", year="2023", month="Nov", day="2", volume="9", pages="e47532", keywords="GPT-4", keywords="racial and ethnic bias", keywords="typical clinical vignettes", keywords="diagnosis", keywords="triage", keywords="artificial intelligence", keywords="AI", keywords="race", keywords="clinical vignettes", keywords="physician", keywords="efficiency", keywords="decision-making", keywords="bias", keywords="GPT", abstract="Background: Whether GPT-4, the conversational artificial intelligence, can accurately diagnose and triage health conditions and whether it presents racial and ethnic biases in its decisions remain unclear. Objective: We aim to assess the accuracy of GPT-4 in the diagnosis and triage of health conditions and whether its performance varies by patient race and ethnicity. Methods: We compared the performance of GPT-4 and physicians, using 45 typical clinical vignettes, each with a correct diagnosis and triage level, in February and March 2023. For each of the 45 clinical vignettes, GPT-4 and 3 board-certified physicians provided the most likely primary diagnosis and triage level (emergency, nonemergency, or self-care). Independent reviewers evaluated the diagnoses as ``correct'' or ``incorrect.'' Physician diagnosis was defined as the consensus of the 3 physicians. We evaluated whether the performance of GPT-4 varies by patient race and ethnicity, by adding the information on patient race and ethnicity to the clinical vignettes. Results: The accuracy of diagnosis was comparable between GPT-4 and physicians (the percentage of correct diagnosis was 97.8\% (44/45; 95\% CI 88.2\%-99.9\%) for GPT-4 and 91.1\% (41/45; 95\% CI 78.8\%-97.5\%) for physicians; P=.38). GPT-4 provided appropriate reasoning for 97.8\% (44/45) of the vignettes. The appropriateness of triage was comparable between GPT-4 and physicians (GPT-4: 30/45, 66.7\%; 95\% CI 51.0\%-80.0\%; physicians: 30/45, 66.7\%; 95\% CI 51.0\%-80.0\%; P=.99). The performance of GPT-4 in diagnosing health conditions did not vary among different races and ethnicities (Black, White, Asian, and Hispanic), with an accuracy of 100\% (95\% CI 78.2\%-100\%). P values, compared to the GPT-4 output without incorporating race and ethnicity information, were all .99. The accuracy of triage was not significantly different even if patients' race and ethnicity information was added. 
The accuracy of triage was 62.2\% (95\% CI 46.5\%-76.2\%; P=.50) for Black patients; 66.7\% (95\% CI 51.0\%-80.0\%; P=.99) for White patients; 66.7\% (95\% CI 51.0\%-80.0\%; P=.99) for Asian patients, and 62.2\% (95\% CI 46.5\%-76.2\%; P=.69) for Hispanic patients. P values were calculated by comparing the outputs with and without conditioning on race and ethnicity. Conclusions: GPT-4's ability to diagnose and triage typical clinical vignettes was comparable to that of board-certified physicians. The performance of GPT-4 did not vary by patient race and ethnicity. These findings should be informative for health systems looking to introduce conversational artificial intelligence to improve the efficiency of patient diagnosis and triage. ", doi="10.2196/47532", url="https://mededu.jmir.org/2023/1/e47532", url="http://www.ncbi.nlm.nih.gov/pubmed/37917120" } @Article{info:doi/10.2196/51421, author="Baglivo, Francesco and De Angelis, Luigi and Casigliani, Virginia and Arzilli, Guglielmo and Privitera, Pierpaolo Gaetano and Rizzo, Caterina", title="Exploring the Possible Use of AI Chatbots in Public Health Education: Feasibility Study", journal="JMIR Med Educ", year="2023", month="Nov", day="1", volume="9", pages="e51421", keywords="artificial intelligence", keywords="chatbots", keywords="medical education", keywords="vaccination", keywords="public health", keywords="medical students", keywords="large language model", keywords="generative AI", keywords="ChatGPT", keywords="Google Bard", keywords="AI chatbot", keywords="health education", keywords="health care", keywords="medical training", keywords="educational support tool", keywords="chatbot model", abstract="Background: Artificial intelligence (AI) is a rapidly developing field with the potential to transform various aspects of health care and public health, including medical training. During the ``Hygiene and Public Health'' course for fifth-year medical students, a practical training session was conducted on vaccination using AI chatbots as an educational supportive tool. Before receiving specific training on vaccination, the students were given a web-based test extracted from the Italian National Medical Residency Test. After completing the test, a critical correction of each question was performed assisted by AI chatbots. Objective: The main aim of this study was to identify whether AI chatbots can be considered educational support tools for training in public health. The secondary objective was to assess the performance of different AI chatbots on complex multiple-choice medical questions in the Italian language. Methods: A test composed of 15 multiple-choice questions on vaccination was extracted from the Italian National Medical Residency Test using targeted keywords and administered to medical students via Google Forms and to different AI chatbot models (Bing Chat, ChatGPT, Chatsonic, Google Bard, and YouChat). The correction of the test was conducted in the classroom, focusing on the critical evaluation of the explanations provided by the chatbot. A Mann-Whitney U test was conducted to compare the performances of medical students and AI chatbots. Student feedback was collected anonymously at the end of the training experience. Results: In total, 36 medical students and 5 AI chatbot models completed the test. The students achieved an average score of 8.22 (SD 2.65) out of 15, while the AI chatbots scored an average of 12.22 (SD 2.77). 
The results indicated a statistically significant difference in performance between the 2 groups (U=49.5, P<.001), with a large effect size (r=0.69). When divided by question type (direct, scenario-based, and negative), significant differences were observed in direct (P<.001) and scenario-based (P<.001) questions, but not in negative questions (P=.48). The students reported a high level of satisfaction (7.9/10) with the educational experience, expressing a strong desire to repeat the experience (7.6/10). Conclusions: This study demonstrated the efficacy of AI chatbots in answering complex medical questions related to vaccination and providing valuable educational support. Their performance significantly surpassed that of medical students in direct and scenario-based questions. The responsible and critical use of AI chatbots can enhance medical education, making it an essential aspect to integrate into the educational system. ", doi="10.2196/51421", url="https://mededu.jmir.org/2023/1/e51421", url="http://www.ncbi.nlm.nih.gov/pubmed/37910155" } @Article{info:doi/10.2196/48785, author="Preiksaitis, Carl and Rose, Christian", title="Opportunities, Challenges, and Future Directions of Generative Artificial Intelligence in Medical Education: Scoping Review", journal="JMIR Med Educ", year="2023", month="Oct", day="20", volume="9", pages="e48785", keywords="medical education", keywords="artificial intelligence", keywords="ChatGPT", keywords="Bard", keywords="AI", keywords="educator", keywords="scoping", keywords="review", keywords="learner", keywords="generative", abstract="Background: Generative artificial intelligence (AI) technologies are increasingly being utilized across various fields, with considerable interest and concern regarding their potential application in medical education. These technologies, such as Chat GPT and Bard, can generate new content and have a wide range of possible applications. Objective: This study aimed to synthesize the potential opportunities and limitations of generative AI in medical education. It sought to identify prevalent themes within recent literature regarding potential applications and challenges of generative AI in medical education and use these to guide future areas for exploration. Methods: We conducted a scoping review, following the framework by Arksey and O'Malley, of English language articles published from 2022 onward that discussed generative AI in the context of medical education. A literature search was performed using PubMed, Web of Science, and Google Scholar databases. We screened articles for inclusion, extracted data from relevant studies, and completed a quantitative and qualitative synthesis of the data. Results: Thematic analysis revealed diverse potential applications for generative AI in medical education, including self-directed learning, simulation scenarios, and writing assistance. However, the literature also highlighted significant challenges, such as issues with academic integrity, data accuracy, and potential detriments to learning. Based on these themes and the current state of the literature, we propose the following 3 key areas for investigation: developing learners' skills to evaluate AI critically, rethinking assessment methodology, and studying human-AI interactions. Conclusions: The integration of generative AI in medical education presents exciting opportunities, alongside considerable challenges. 
There is a need to develop new skills and competencies related to AI as well as thoughtful, nuanced approaches to examine the growing use of generative AI in medical education. ", doi="10.2196/48785", url="https://mededu.jmir.org/2023/1/e48785/" } @Article{info:doi/10.2196/48249, author="Chen, Yanhua and Wu, Ziye and Wang, Peicheng and Xie, Linbo and Yan, Mengsha and Jiang, Maoqing and Yang, Zhenghan and Zheng, Jianjun and Zhang, Jingfeng and Zhu, Jiming", title="Radiology Residents' Perceptions of Artificial Intelligence: Nationwide Cross-Sectional Survey Study", journal="J Med Internet Res", year="2023", month="Oct", day="19", volume="25", pages="e48249", keywords="artificial intelligence", keywords="technology acceptance", keywords="radiology", keywords="residency", keywords="perceptions", keywords="health care services", keywords="resident", keywords="residents", keywords="perception", keywords="adoption", keywords="readiness", keywords="acceptance", keywords="cross sectional", keywords="survey", abstract="Background: Artificial intelligence (AI) is transforming various fields, with health care, especially diagnostic specialties such as radiology, being a key but controversial battleground. However, there is limited research systematically examining the response of ``human intelligence'' to AI. Objective: This study aims to comprehend radiologists' perceptions regarding AI, including their views on its potential to replace them, its usefulness, and their willingness to accept it. We examine the influence of various factors, encompassing demographic characteristics, working status, psychosocial aspects, personal experience, and contextual factors. Methods: Between December 1, 2020, and April 30, 2021, a cross-sectional survey was completed by 3666 radiology residents in China. We used multivariable logistic regression models to examine factors and associations, reporting odds ratios (ORs) and 95\% CIs. Results: In summary, radiology residents generally hold a positive attitude toward AI, with 29.90\% (1096/3666) agreeing that AI may reduce the demand for radiologists, 72.80\% (2669/3666) believing AI improves disease diagnosis, and 78.18\% (2866/3666) feeling that radiologists should embrace AI. Several associated factors, including age, gender, education, region, eye strain, working hours, time spent on medical images, resilience, burnout, AI experience, and perceptions of residency support and stress, significantly influence AI attitudes. For instance, burnout symptoms were associated with greater concerns about AI replacement (OR 1.89; P<.001), less favorable views on AI usefulness (OR 0.77; P=.005), and reduced willingness to use AI (OR 0.71; P<.001). Moreover, after adjusting for all other factors, perceived AI replacement (OR 0.81; P<.001) and AI usefulness (OR 5.97; P<.001) were shown to significantly impact the intention to use AI. Conclusions: This study profiles radiology residents who are accepting of AI. Our comprehensive findings provide insights for a multidimensional approach to help physicians adapt to AI. Targeted policies, such as digital health care initiatives and medical education, can be developed accordingly. 
", doi="10.2196/48249", url="https://www.jmir.org/2023/1/e48249", url="http://www.ncbi.nlm.nih.gov/pubmed/37856181" } @Article{info:doi/10.2196/49385, author="Hu, Je-Ming and Liu, Feng-Cheng and Chu, Chi-Ming and Chang, Yu-Tien", title="Health Care Trainees' and Professionals' Perceptions of ChatGPT in Improving Medical Knowledge Training: Rapid Survey Study", journal="J Med Internet Res", year="2023", month="Oct", day="18", volume="25", pages="e49385", keywords="ChatGPT", keywords="large language model", keywords="medicine", keywords="perception evaluation", keywords="internet survey", keywords="structural equation modeling", keywords="SEM", abstract="Background: ChatGPT is a powerful pretrained large language model. It has both demonstrated potential and raised concerns related to knowledge translation and knowledge transfer. To apply and improve knowledge transfer in the real world, it is essential to assess the perceptions and acceptance of the users of ChatGPT-assisted training. Objective: We aimed to investigate the perceptions of health care trainees and professionals on ChatGPT-assisted training, using biomedical informatics as an example. Methods: We used purposeful sampling to include all health care undergraduate trainees and graduate professionals (n=195) from January to May 2023 in the School of Public Health at the National Defense Medical Center in Taiwan. Subjects were asked to watch a 2-minute video introducing 5 scenarios about ChatGPT-assisted training in biomedical informatics and then answer a self-designed online (web- and mobile-based) questionnaire according to the Kirkpatrick model. The survey responses were used to develop 4 constructs: ``perceived knowledge acquisition,'' ``perceived training motivation,'' ``perceived training satisfaction,'' and ``perceived training effectiveness.'' The study used structural equation modeling (SEM) to evaluate and test the structural model and hypotheses. Results: The online questionnaire response rate was 152 of 195 (78\%); 88 of 152 participants (58\%) were undergraduate trainees and 90 of 152 participants (59\%) were women. The ages ranged from 18 to 53 years (mean 23.3, SD 6.0 years). There was no statistical difference in perceptions of training evaluation between men and women. Most participants were enthusiastic about the ChatGPT-assisted training, while the graduate professionals were more enthusiastic than undergraduate trainees. Nevertheless, some concerns were raised about potential cheating on training assessment. The average scores for knowledge acquisition, training motivation, training satisfaction, and training effectiveness were 3.84 (SD 0.80), 3.76 (SD 0.93), 3.75 (SD 0.87), and 3.72 (SD 0.91), respectively (Likert scale 1-5: strongly disagree to strongly agree). Knowledge acquisition had the highest score and training effectiveness the lowest. In the SEM results, training effectiveness was influenced predominantly by knowledge acquisition and partially met the hypotheses in the research framework. Knowledge acquisition had a direct effect on training effectiveness, training satisfaction, and training motivation, with $\beta$ coefficients of .80, .87, and .97, respectively (all P<.001). Conclusions: Most health care trainees and professionals perceived ChatGPT-assisted training as an aid in knowledge transfer. However, to improve training effectiveness, it should be combined with empirical experts for proper guidance and dual interaction. 
In a future study, we recommend using a larger sample size for evaluation of internet-connected large language models in medical knowledge transfer. ", doi="10.2196/49385", url="https://www.jmir.org/2023/1/e49385", url="http://www.ncbi.nlm.nih.gov/pubmed/37851495" } @Article{info:doi/10.2196/47049, author="Khlaif, N. Zuheir and Mousa, Allam and Hattab, Kamal Muayad and Itmazi, Jamil and Hassan, A. Amjad and Sanmugam, Mageswaran and Ayyoub, Abedalkarim", title="The Potential and Concerns of Using AI in Scientific Research: ChatGPT Performance Evaluation", journal="JMIR Med Educ", year="2023", month="Sep", day="14", volume="9", pages="e47049", keywords="artificial intelligence", keywords="AI", keywords="ChatGPT", keywords="scientific research", keywords="research ethics", abstract="Background: Artificial intelligence (AI) has many applications in various aspects of our daily life, including health, criminal, education, civil, business, and liability law. One aspect of AI that has gained significant attention is natural language processing (NLP), which refers to the ability of computers to understand and generate human language. Objective: This study aims to examine the potential for, and concerns of, using AI in scientific research. For this purpose, high-impact research articles were generated by analyzing the quality of reports generated by ChatGPT and assessing the application's impact on the research framework, data analysis, and the literature review. The study also explored concerns around ownership and the integrity of research when using AI-generated text. Methods: A total of 4 articles were generated using ChatGPT, and thereafter evaluated by 23 reviewers. The researchers developed an evaluation form to assess the quality of the articles generated. Additionally, 50 abstracts were generated using ChatGPT and their quality was evaluated. The data were subjected to ANOVA and thematic analysis to analyze the qualitative data provided by the reviewers. Results: When using detailed prompts and providing the context of the study, ChatGPT would generate high-quality research that could be published in high-impact journals. However, ChatGPT had a minor impact on developing the research framework and data analysis. The primary area needing improvement was the development of the literature review. Moreover, reviewers expressed concerns around ownership and the integrity of the research when using AI-generated text. Nonetheless, ChatGPT has a strong potential to increase human productivity in research and can be used in academic writing. Conclusions: AI-generated text has the potential to improve the quality of high-impact research articles. The findings of this study suggest that decision makers and researchers should focus more on the methodology part of the research, which includes research design, developing research tools, and analyzing data in depth, to draw strong theoretical and practical implications, thereby establishing a revolution in scientific research in the era of AI. The practical implications of this study can be used in different fields such as medical education to deliver materials to develop the basic competencies for both medicine students and faculty members. ", doi="10.2196/47049", url="https://mededu.jmir.org/2023/1/e47049", url="http://www.ncbi.nlm.nih.gov/pubmed/37707884" } @Article{info:doi/10.2196/48254, author="Sallam, Malik and Salim, A. Nesreen and Barakat, Muna and Al-Mahzoum, Kholoud and Al-Tammemi, B. 
Ala'a and Malaeb, Diana and Hallit, Rabih and Hallit, Souheil", title="Assessing Health Students' Attitudes and Usage of ChatGPT in Jordan: Validation Study", journal="JMIR Med Educ", year="2023", month="Sep", day="5", volume="9", pages="e48254", keywords="artificial intelligence", keywords="machine learning", keywords="education", keywords="technology", keywords="healthcare", keywords="survey", keywords="opinion", keywords="knowledge", keywords="practices", keywords="KAP", abstract="Background: ChatGPT is a conversational large language model that has the potential to revolutionize knowledge acquisition. However, the impact of this technology on the quality of education is still unknown considering the risks and concerns surrounding ChatGPT use. Therefore, it is necessary to assess the usability and acceptability of this promising tool. As an innovative technology, the intention to use ChatGPT can be studied in the context of the technology acceptance model (TAM). Objective: This study aimed to develop and validate a TAM-based survey instrument called TAME-ChatGPT (Technology Acceptance Model Edited to Assess ChatGPT Adoption) that could be employed to examine the successful integration and use of ChatGPT in health care education. Methods: The survey tool was created based on the TAM framework. It comprised 13 items for participants who heard of ChatGPT but did not use it and 23 items for participants who used ChatGPT. Using a convenience sampling approach, the survey link was circulated electronically among university students between February and March 2023. Exploratory factor analysis (EFA) was used to assess the construct validity of the survey instrument. Results: The final sample comprised 458 respondents, the majority of them undergraduate students (n=442, 96.5\%). Only 109 (23.8\%) respondents had heard of ChatGPT prior to participation and only 55 (11.3\%) self-reported ChatGPT use before the study. EFA on the attitude and usage scales showed significant Bartlett tests of sphericity scores (P<.001) and adequate Kaiser-Meyer-Olkin measures (0.823 for the attitude scale and 0.702 for the usage scale), confirming the factorability of the correlation matrices. The EFA showed that 3 constructs explained a cumulative total of 69.3\% variance in the attitude scale, and these subscales represented perceived risks, attitude to technology/social influence, and anxiety. For the ChatGPT usage scale, EFA showed that 4 constructs explained a cumulative total of 72\% variance in the data and comprised the perceived usefulness, perceived risks, perceived ease of use, and behavior/cognitive factors. All the ChatGPT attitude and usage subscales showed good reliability with Cronbach $\alpha$ values >.78 for all the deduced subscales. Conclusions: The TAME-ChatGPT demonstrated good reliability, validity, and usefulness in assessing health care students' attitudes toward ChatGPT. The findings highlighted the importance of considering risk perceptions, usefulness, ease of use, attitudes toward technology, and behavioral factors when adopting ChatGPT as a tool in health care education. This information can aid the stakeholders in creating strategies to support the optimal and ethical use of ChatGPT and to identify the potential challenges hindering its successful implementation. Future research is recommended to guide the effective adoption of ChatGPT in health care education. 
", doi="10.2196/48254", url="https://mededu.jmir.org/2023/1/e48254", url="http://www.ncbi.nlm.nih.gov/pubmed/37578934" } @Article{info:doi/10.2196/46482, author="Roos, Jonas and Kasapovic, Adnan and Jansen, Tom and Kaczmarczyk, Robert", title="Artificial Intelligence in Medical Education: Comparative Analysis of ChatGPT, Bing, and Medical Students in Germany", journal="JMIR Med Educ", year="2023", month="Sep", day="4", volume="9", pages="e46482", keywords="medical education", keywords="state examinations", keywords="exams", keywords="large language models", keywords="artificial intelligence", keywords="ChatGPT", abstract="Background: Large language models (LLMs) have demonstrated significant potential in diverse domains, including medicine. Nonetheless, there is a scarcity of studies examining their performance in medical examinations, especially those conducted in languages other than English, and in direct comparison with medical students. Analyzing the performance of LLMs in state medical examinations can provide insights into their capabilities and limitations and evaluate their potential role in medical education and examination preparation.? Objective: This study aimed to assess and compare the performance of 3 LLMs, GPT-4, Bing, and GPT-3.5-Turbo, in the German Medical State Examinations of 2022 and to evaluate their performance relative to that of medical students.? Methods: The LLMs were assessed on a total of 630 questions from the spring and fall German Medical State Examinations of 2022. The performance was evaluated with and without media-related questions. Statistical analyses included 1-way ANOVA and independent samples t tests for pairwise comparisons. The relative strength of the LLMs in comparison with that of the students was also evaluated.? Results: GPT-4 achieved the highest overall performance, correctly answering 88.1\% of questions, closely followed by Bing (86.0\%) and GPT-3.5-Turbo (65.7\%). The students had an average correct answer rate of 74.6\%. Both GPT-4 and Bing significantly outperformed the students in both examinations. When media questions were excluded, Bing achieved the highest performance of 90.7\%, closely followed by GPT-4 (90.4\%), while GPT-3.5-Turbo lagged (68.2\%). There was a significant decline in the performance of GPT-4 and Bing in the fall 2022 examination, which was attributed to a higher proportion of media-related questions and a potential increase in question difficulty.? Conclusions: LLMs, particularly GPT-4 and Bing, demonstrate potential as valuable tools in medical education and for pretesting examination questions. Their high performance, even relative to that of medical students, indicates promising avenues for further development and integration into the educational and clinical landscape.? ", doi="10.2196/46482", url="https://mededu.jmir.org/2023/1/e46482", url="http://www.ncbi.nlm.nih.gov/pubmed/37665620" } @Article{info:doi/10.2196/51494, author="Leung, I. Tiffany and Sagar, Ankita and Shroff, Swati and Henry, L. 
Tracey", title="Can AI Mitigate Bias in Writing Letters of Recommendation?", journal="JMIR Med Educ", year="2023", month="Aug", day="23", volume="9", pages="e51494", keywords="sponsorship", keywords="implicit bias", keywords="gender bias", keywords="bias", keywords="letters of recommendation", keywords="artificial intelligence", keywords="large language models", keywords="medical education", keywords="career advancement", keywords="tenure and promotion", keywords="promotion", keywords="leadership", doi="10.2196/51494", url="https://mededu.jmir.org/2023/1/e51494", url="http://www.ncbi.nlm.nih.gov/pubmed/37610808" } @Article{info:doi/10.2196/50945, author="Safranek, W. Conrad and Sidamon-Eristoff, Elizabeth Anne and Gilson, Aidan and Chartash, David", title="The Role of Large Language Models in Medical Education: Applications and Implications", journal="JMIR Med Educ", year="2023", month="Aug", day="14", volume="9", pages="e50945", keywords="large language models", keywords="ChatGPT", keywords="medical education", keywords="LLM", keywords="artificial intelligence in health care", keywords="AI", keywords="autoethnography", doi="10.2196/50945", url="https://mededu.jmir.org/2023/1/e50945", url="http://www.ncbi.nlm.nih.gov/pubmed/37578830" } @Article{info:doi/10.2196/50336, author="Gilson, Aidan and Safranek, W. Conrad and Huang, Thomas and Socrates, Vimig and Chi, Ling and Taylor, Andrew Richard and Chartash, David", title="Authors' Reply to: Variability in Large Language Models' Responses to Medical Licensing and Certification Examinations", journal="JMIR Med Educ", year="2023", month="Jul", day="13", volume="9", pages="e50336", keywords="natural language processing", keywords="NLP", keywords="MedQA", keywords="generative pre-trained transformer", keywords="GPT", keywords="medical education", keywords="chatbot", keywords="artificial intelligence", keywords="AI", keywords="education technology", keywords="ChatGPT", keywords="conversational agent", keywords="machine learning", keywords="large language models", keywords="knowledge assessment", doi="10.2196/50336", url="https://mededu.jmir.org/2023/1/e50336", url="http://www.ncbi.nlm.nih.gov/pubmed/37440299" } @Article{info:doi/10.2196/48305, author="Epstein, H. Richard and Dexter, Franklin", title="Variability in Large Language Models' Responses to Medical Licensing and Certification Examinations. Comment on ``How Does ChatGPT Perform on the United States Medical Licensing Examination? 
The Implications of Large Language Models for Medical Education and Knowledge Assessment''", journal="JMIR Med Educ", year="2023", month="Jul", day="13", volume="9", pages="e48305", keywords="natural language processing", keywords="NLP", keywords="MedQA", keywords="generative pre-trained transformer", keywords="GPT", keywords="medical education", keywords="chatbot", keywords="artificial intelligence", keywords="AI", keywords="education technology", keywords="ChatGPT", keywords="Google Bard", keywords="conversational agent", keywords="machine learning", keywords="large language models", keywords="knowledge assessment", doi="10.2196/48305", url="https://mededu.jmir.org/2023/1/e48305", url="http://www.ncbi.nlm.nih.gov/pubmed/37440293" } @Article{info:doi/10.2196/48291, author="Abd-alrazaq, Alaa and AlSaad, Rawan and Alhuwail, Dari and Ahmed, Arfan and Healy, Mark Padraig and Latifi, Syed and Aziz, Sarah and Damseh, Rafat and Alabed Alrazak, Sadam and Sheikh, Javaid", title="Large Language Models in Medical Education: Opportunities, Challenges, and Future Directions", journal="JMIR Med Educ", year="2023", month="Jun", day="1", volume="9", pages="e48291", keywords="large language models", keywords="artificial intelligence", keywords="medical education", keywords="ChatGPT", keywords="GPT-4", keywords="generative AI", keywords="students", keywords="educators", doi="10.2196/48291", url="https://mededu.jmir.org/2023/1/e48291", url="http://www.ncbi.nlm.nih.gov/pubmed/37261894" } @Article{info:doi/10.2196/46599, author="Thirunavukarasu, James Arun and Hassan, Refaat and Mahmood, Shathar and Sanghera, Rohan and Barzangi, Kara and El Mukashfi, Mohanned and Shah, Sachin", title="Trialling a Large Language Model (ChatGPT) in General Practice With the Applied Knowledge Test: Observational Study Demonstrating Opportunities and Limitations in Primary Care", journal="JMIR Med Educ", year="2023", month="Apr", day="21", volume="9", pages="e46599", keywords="ChatGPT", keywords="large language model", keywords="natural language processing", keywords="decision support techniques", keywords="artificial intelligence", keywords="AI", keywords="deep learning", keywords="primary care", keywords="general practice", keywords="family medicine", keywords="chatbot", abstract="Background: Large language models exhibiting human-level performance in specialized tasks are emerging; examples include Generative Pretrained Transformer 3.5, which underlies the processing of ChatGPT. Rigorous trials are required to understand the capabilities of emerging technology, so that innovation can be directed to benefit patients and practitioners. Objective: Here, we evaluated the strengths and weaknesses of ChatGPT in primary care using the Membership of the Royal College of General Practitioners Applied Knowledge Test (AKT) as a medium. Methods: AKT questions were sourced from a web-based question bank and 2 AKT practice papers. In total, 674 unique AKT questions were inputted to ChatGPT, with the model's answers recorded and compared to correct answers provided by the Royal College of General Practitioners. Each question was inputted twice in separate ChatGPT sessions, with answers on repeated trials compared to gauge consistency. Subject difficulty was gauged by referring to examiners' reports from 2018 to 2022. Novel explanations from ChatGPT---defined as information provided that was not inputted within the question or multiple answer choices---were recorded. 
Performance was analyzed with respect to subject, difficulty, question source, and novel model outputs to explore ChatGPT's strengths and weaknesses. Results: Average overall performance of ChatGPT was 60.17\%, which is below the mean passing mark in the last 2 years (70.42\%). Accuracy differed between sources (P=.04 and .06). ChatGPT's performance varied with subject category (P=.02 and .02), but variation did not correlate with difficulty (Spearman $\rho$=--0.241 and --0.238; P=.19 and .20). The proclivity of ChatGPT to provide novel explanations did not affect accuracy (P>.99 and .23). Conclusions: Large language models are approaching human expert--level performance, although further development is required to match the performance of qualified primary care physicians in the AKT. Validated high-performance models may serve as assistants or autonomous clinical tools to ameliorate the general practice workforce crisis. ", doi="10.2196/46599", url="https://mededu.jmir.org/2023/1/e46599", url="http://www.ncbi.nlm.nih.gov/pubmed/37083633" } @Article{info:doi/10.2196/46876, author="Sabry Abdel-Messih, Mary and Kamel Boulos, N. Maged", title="ChatGPT in Clinical Toxicology", journal="JMIR Med Educ", year="2023", month="Mar", day="8", volume="9", pages="e46876", keywords="ChatGPT", keywords="clinical toxicology", keywords="organophosphates", keywords="artificial intelligence", keywords="AI", keywords="medical education", doi="10.2196/46876", url="https://mededu.jmir.org/2023/1/e46876", url="http://www.ncbi.nlm.nih.gov/pubmed/36867743" } @Article{info:doi/10.2196/46885, author="Eysenbach, Gunther", title="The Role of ChatGPT, Generative Language Models, and Artificial Intelligence in Medical Education: A Conversation With ChatGPT and a Call for Papers", journal="JMIR Med Educ", year="2023", month="Mar", day="6", volume="9", pages="e46885", keywords="artificial intelligence", keywords="AI", keywords="ChatGPT", keywords="generative language model", keywords="medical education", keywords="interview", keywords="future of education", doi="10.2196/46885", url="https://mededu.jmir.org/2023/1/e46885", url="http://www.ncbi.nlm.nih.gov/pubmed/36863937" } @Article{info:doi/10.2196/45312, author="Gilson, Aidan and Safranek, W. Conrad and Huang, Thomas and Socrates, Vimig and Chi, Ling and Taylor, Andrew Richard and Chartash, David", title="How Does ChatGPT Perform on the United States Medical Licensing Examination (USMLE)? The Implications of Large Language Models for Medical Education and Knowledge Assessment", journal="JMIR Med Educ", year="2023", month="Feb", day="8", volume="9", pages="e45312", keywords="natural language processing", keywords="NLP", keywords="MedQA", keywords="generative pre-trained transformer", keywords="GPT", keywords="medical education", keywords="chatbot", keywords="artificial intelligence", keywords="education technology", keywords="ChatGPT", keywords="conversational agent", keywords="machine learning", keywords="USMLE", abstract="Background: Chat Generative Pre-trained Transformer (ChatGPT) is a 175-billion-parameter natural language processing model that can generate conversation-style responses to user input. Objective: This study aimed to evaluate the performance of ChatGPT on questions within the scope of the United States Medical Licensing Examination (USMLE) Step 1 and Step 2 exams, as well as to analyze responses for user interpretability. 
Methods: We used 2 sets of multiple-choice questions to evaluate ChatGPT's performance, each with questions pertaining to Step 1 and Step 2. The first set was derived from AMBOSS, a commonly used question bank for medical students, which also provides statistics on question difficulty and the performance on an exam relative to the user base. The second set was the National Board of Medical Examiners (NBME) free 120 questions. ChatGPT's performance was compared to 2 other large language models, GPT-3 and InstructGPT. The text output of each ChatGPT response was evaluated across 3 qualitative metrics: logical justification of the answer selected, presence of information internal to the question, and presence of information external to the question. Results: Of the 4 data sets, AMBOSS-Step1, AMBOSS-Step2, NBME-Free-Step1, and NBME-Free-Step2, ChatGPT achieved accuracies of 44\% (44/100), 42\% (42/100), 64.4\% (56/87), and 57.8\% (59/102), respectively. ChatGPT outperformed InstructGPT by 8.15\% on average across all data sets, and GPT-3 performed similarly to random chance. The model demonstrated a significant decrease in performance as question difficulty increased (P=.01) within the AMBOSS-Step1 data set. We found that logical justification for ChatGPT's answer selection was present in 100\% of outputs of the NBME data sets. Internal information to the question was present in 96.8\% (183/189) of all questions. The presence of information external to the question was 44.5\% and 27\% lower for incorrect answers relative to correct answers on the NBME-Free-Step1 (P<.001) and NBME-Free-Step2 (P=.001) data sets, respectively. Conclusions: ChatGPT marks a significant improvement in natural language processing models on the tasks of medical question answering. By performing at a greater than 60\% threshold on the NBME-Free-Step-1 data set, we show that the model achieves the equivalent of a passing score for a third-year medical student. Additionally, we highlight ChatGPT's capacity to provide logic and informational context across the majority of answers. These facts taken together make a compelling case for the potential applications of ChatGPT as an interactive medical education tool to support learning. ", doi="10.2196/45312", url="https://mededu.jmir.org/2023/1/e45312", url="http://www.ncbi.nlm.nih.gov/pubmed/36753318" } @Article{info:doi/10.2196/35587, author="Grunhut, Joel and Marques, Oge and Wyatt, M. 
Adam T.", title="Needs, Challenges, and Applications of Artificial Intelligence in Medical Education Curriculum", journal="JMIR Med Educ", year="2022", month="Jun", day="7", volume="8", number="2", pages="e35587", keywords="artificial intelligence", keywords="AI", keywords="medical education", keywords="medical student", doi="10.2196/35587", url="https://mededu.jmir.org/2022/2/e35587", url="http://www.ncbi.nlm.nih.gov/pubmed/35671077" } @Article{info:doi/10.2196/35223, author="Gray, Kathleen and Slavotinek, John and Dimaguila, Luis Gerardo and Choo, Dawn", title="Artificial Intelligence Education for the Health Workforce: Expert Survey of Approaches and Needs", journal="JMIR Med Educ", year="2022", month="Apr", day="4", volume="8", number="2", pages="e35223", keywords="artificial intelligence", keywords="curriculum", keywords="ethics", keywords="human-computer interaction", keywords="interprofessional education", keywords="machine learning", keywords="natural language processing", keywords="professional development", keywords="robotics", abstract="Background: The preparation of the current and future health workforce for the possibility of using artificial intelligence (AI) in health care is a growing concern as AI applications emerge in various care settings and specializations. At present, there is no obvious consensus among educators about what needs to be learned or how this learning may be supported or assessed. Objective: Our study aims to explore health care education experts' ideas and plans for preparing the health workforce to work with AI and identify critical gaps in curriculum and educational resources across a national health care system. Methods: A survey canvassed expert views on AI education for the health workforce in terms of educational strategies, subject matter priorities, meaningful learning activities, desired attitudes, and skills. A total of 39 senior people from different health workforce subgroups across Australia provided ratings and free-text responses in late 2020. Results: The responses highlighted the importance of education on ethical implications, suitability of large data sets for use in AI clinical applications, principles of machine learning, and specific diagnosis and treatment applications of AI, as well as alterations to cognitive load during clinical work and the interaction between humans and machines in clinical settings. Respondents also outlined barriers to implementation, such as a lack of governance structures and processes, resource constraints, and cultural adjustment. Conclusions: Further work of the kind reported in this survey, conducted around the world, can assist educators and education authorities who are responsible for preparing the health workforce to minimize the risks and realize the benefits of implementing AI in health care. ", doi="10.2196/35223", url="https://mededu.jmir.org/2022/2/e35223", url="http://www.ncbi.nlm.nih.gov/pubmed/35249885" } @Article{info:doi/10.2196/33390, author="Teng, Minnie and Singla, Rohit and Yau, Olivia and Lamoureux, Daniel and Gupta, Aurinjoy and Hu, Zoe and Hu, Ricky and Aissiou, Amira and Eaton, Shane and Hamm, Camille and Hu, Sophie and Kelly, Dayton and MacMillan, M. Kathleen and Malik, Shamir and Mazzoli, Vienna and Teng, Yu-Wen and Laricheva, Maria and Jarus, Tal and Field, S.
Thalia", title="Health Care Students' Perspectives on Artificial Intelligence: Countrywide Survey in Canada", journal="JMIR Med Educ", year="2022", month="Jan", day="31", volume="8", number="1", pages="e33390", keywords="medical education", keywords="artificial intelligence", keywords="allied health education", keywords="medical students", keywords="health care students", keywords="medical curriculum", keywords="education", abstract="Background: Artificial intelligence (AI) is no longer a futuristic concept; it is increasingly being integrated into health care. As studies on attitudes toward AI have primarily focused on physicians, there is a need to assess the perspectives of students across health care disciplines to inform future curriculum development. Objective: This study aims to explore and identify gaps in the knowledge that Canadian health care students have regarding AI, capture how health care students in different fields differ in their knowledge and perspectives on AI, and present student-identified ways that AI literacy may be incorporated into the health care curriculum. Methods: The survey was developed from a narrative literature review of topics in attitudinal surveys on AI. The final survey comprised 15 items, including multiple-choice questions, pick-group-rank questions, 11-point Likert scale items, slider scale questions, and narrative questions. We used snowball and convenience sampling methods by distributing an email with a description and a link to the web-based survey to representatives from 18 Canadian schools. Results: A total of 2167 students across 10 different health professions from 18 universities across Canada responded to the survey. Overall, 78.77\% (1707/2167) predicted that AI technology would affect their careers within the coming decade, and 74.5\% (1595/2167) reported a positive outlook toward the emerging role of AI in their respective fields. Attitudes toward AI varied by discipline. Students, even those opposed to AI, identified the need to incorporate a basic understanding of AI into their curricula. Conclusions: We performed a nationwide survey of health care students across 10 different health professions in Canada. The findings would inform student-identified topics within AI and their preferred delivery formats, which would advance education across different health care professions. 
", doi="10.2196/33390", url="https://mededu.jmir.org/2022/1/e33390", url="http://www.ncbi.nlm.nih.gov/pubmed/35099397" } @Article{info:doi/10.2196/31043, author="Charow, Rebecca and Jeyakumar, Tharshini and Younus, Sarah and Dolatabadi, Elham and Salhia, Mohammad and Al-Mouaswas, Dalia and Anderson, Melanie and Balakumar, Sarmini and Clare, Megan and Dhalla, Azra and Gillan, Caitlin and Haghzare, Shabnam and Jackson, Ethan and Lalani, Nadim and Mattson, Jane and Peteanu, Wanda and Tripp, Tim and Waldorf, Jacqueline and Williams, Spencer and Tavares, Walter and Wiljer, David", title="Artificial Intelligence Education Programs for Health Care Professionals: Scoping Review", journal="JMIR Med Educ", year="2021", month="Dec", day="13", volume="7", number="4", pages="e31043", keywords="machine learning", keywords="deep learning", keywords="health care providers", keywords="education", keywords="learning", keywords="patient care", abstract="Background: As the adoption of artificial intelligence (AI) in health care increases, it will become increasingly crucial to involve health care professionals (HCPs) in developing, validating, and implementing AI-enabled technologies. However, because of a lack of AI literacy, most HCPs are not adequately prepared for this revolution. This is a significant barrier to adopting and implementing AI that will affect patients. In addition, the limited existing AI education programs face barriers to development and implementation at various levels of medical education. Objective: With a view to informing future AI education programs for HCPs, this scoping review aims to provide an overview of the types of current or past AI education programs as they pertain to the programs' curricular content, modes of delivery, critical implementation factors for education delivery, and outcomes used to assess the programs' effectiveness. Methods: After the creation of a search strategy and keyword searches, a 2-stage screening process was conducted by 2 independent reviewers to determine study eligibility. When consensus was not reached, the conflict was resolved by consulting a third reviewer. This process consisted of a title and abstract scan and a full-text review. The articles were included if they discussed an actual training program or educational intervention (or a potential program and the desired content to be covered), focused on AI, and were designed or intended for HCPs (at any stage of their career). Results: Of the 10,094 unique citations scanned, 41 (0.41\%) studies relevant to our eligibility criteria were identified. Among the 41 included studies, 10 (24\%) described 13 unique programs and 31 (76\%) discussed recommended curricular content. The curricular content of the unique programs ranged from AI use and AI interpretation to cultivating the skills needed to explain results derived from AI algorithms. The curricular topics were categorized into 3 main domains: cognitive, psychomotor, and affective. Conclusions: This review provides an overview of the current landscape of AI in medical education and highlights the skills and competencies required by HCPs to effectively use AI in enhancing the quality of care and optimizing patient outcomes. Future education efforts should focus on the development of regulatory strategies, a multidisciplinary approach to curriculum redesign, a competency-based curriculum, and patient-clinician interaction. 
", doi="10.2196/31043", url="https://mededu.jmir.org/2021/4/e31043", url="http://www.ncbi.nlm.nih.gov/pubmed/34898458" } @Article{info:doi/10.2196/19285, author="Sapci, Hasan A. and Sapci, Aylin H.", title="Artificial Intelligence Education and Tools for Medical and Health Informatics Students: Systematic Review", journal="JMIR Med Educ", year="2020", month="Jun", day="30", volume="6", number="1", pages="e19285", keywords="artificial intelligence", keywords="education", keywords="machine learning", keywords="deep learning", keywords="medical education", keywords="health informatics", keywords="systematic review", abstract="Background: The use of artificial intelligence (AI) in medicine will generate numerous application possibilities to improve patient care, provide real-time data analytics, and enable continuous patient monitoring. Clinicians and health informaticians should become familiar with machine learning and deep learning. Additionally, they should have a strong background in data analytics and data visualization to use, evaluate, and develop AI applications in clinical practice. Objective: The main objective of this study was to evaluate the current state of AI training and the use of AI tools to enhance the learning experience. Methods: A comprehensive systematic review was conducted to analyze the use of AI in medical and health informatics education, and to evaluate existing AI training practices. PRISMA-P (Preferred Reporting Items for Systematic Reviews and Meta-Analysis Protocols) guidelines were followed. The studies that focused on the use of AI tools to enhance medical education and the studies that investigated teaching AI as a new competency were categorized separately to evaluate recent developments. Results: This systematic review revealed that recent publications recommend the integration of AI training into medical and health informatics curricula. Conclusions: To the best of our knowledge, this is the first systematic review exploring the current state of AI education in both medicine and health informatics. Since AI curricula have not been standardized and competencies have not been determined, a framework for specialized AI training in medical and health informatics education is proposed. 
", doi="10.2196/19285", url="http://mededu.jmir.org/2020/1/e19285/", url="http://www.ncbi.nlm.nih.gov/pubmed/32602844" } @Article{info:doi/10.2196/16048, author="Paranjape, Ketan and Schinkel, Michiel and Nannan Panday, Rishi and Car, Josip and Nanayakkara, Prabath", title="Introducing Artificial Intelligence Training in Medical Education", journal="JMIR Med Educ", year="2019", month="Dec", day="3", volume="5", number="2", pages="e16048", keywords="algorithm", keywords="artificial intelligence", keywords="black box", keywords="deep learning", keywords="machine learning", keywords="medical education", keywords="continuing education", keywords="data sciences", keywords="curriculum", doi="10.2196/16048", url="http://mededu.jmir.org/2019/2/e16048/", url="http://www.ncbi.nlm.nih.gov/pubmed/31793895" } @Article{info:doi/10.2196/13930, author="Chan, Siang Kai and Zary, Nabil", title="Applications and Challenges of Implementing Artificial Intelligence in Medical Education: Integrative Review", journal="JMIR Med Educ", year="2019", month="Jun", day="15", volume="5", number="1", pages="e13930", keywords="medical education", keywords="evaluation of AIED systems", keywords="real world applications of AIED systems", keywords="artificial intelligence", abstract="Background: Since the advent of artificial intelligence (AI) in 1955, the applications of AI have increased over the years within a rapidly changing digital landscape where public expectations are on the rise, fed by social media, industry leaders, and medical practitioners. However, there has been little interest in AI in medical education until the last two decades, with only a recent increase in the number of publications and citations in the field. To our knowledge, thus far, a limited number of articles have discussed or reviewed the current use of AI in medical education. Objective: This study aims to review the current applications of AI in medical education as well as the challenges of implementing AI in medical education. Methods: Medline (Ovid), EBSCOhost Education Resources Information Center (ERIC) and Education Source, and Web of Science were searched with explicit inclusion and exclusion criteria. Full text of the selected articles was analyzed using the Extension of Technology Acceptance Model and the Diffusions of Innovations theory. Data were subsequently pooled together and analyzed quantitatively. Results: A total of 37 articles were identified. Three primary uses of AI in medical education were identified: learning support (n=32), assessment of students' learning (n=4), and curriculum review (n=1). The main reasons for use of AI are its ability to provide feedback and a guided learning pathway and to decrease costs. Subgroup analysis revealed that medical undergraduates are the primary target audience for AI use. In addition, 34 articles described the challenges of AI implementation in medical education; two main reasons were identified: difficulty in assessing the effectiveness of AI in medical education and technical challenges while developing AI applications. Conclusions: The primary use of AI in medical education was for learning support mainly due to its ability to provide individualized feedback. Little emphasis was placed on curriculum review and assessment of students' learning due to the lack of digitalization and sensitive nature of examinations, respectively. Big data manipulation also warrants the need to ensure data integrity. 
Methodological improvements are required to increase AI adoption by addressing the technical difficulties of creating AI applications and by using novel methods to assess the effectiveness of AI. To better integrate AI into the medical profession, measures should be taken to introduce AI into the medical school curriculum so that medical professionals can better understand AI algorithms and maximize the use of AI. ", doi="10.2196/13930", url="http://mededu.jmir.org/2019/1/e13930/", url="http://www.ncbi.nlm.nih.gov/pubmed/31199295" }