@Article{info:doi/10.2196/63353, author="Cross, Joseph and Kayalackakom, Tarron and Robinson, E. Raymond and Vaughans, Andrea and Sebastian, Roopa and Hood, Ricardo and Lewis, Courtney and Devaraju, Sumanth and Honnavar, Prasanna and Naik, Sheetal and Joseph, Jillwin and Anand, Nikhilesh and Mohammed, Abdalla and Johnson, Asjah and Cohen, Eliran and Adeniji, Teniola and Nnenna Nnaji, Aisling and George, Elizabeth Julia", title="Assessing ChatGPT's Capability as a New Age Standardized Patient: Qualitative Study", journal="JMIR Med Educ", year="2025", month="May", day="20", volume="11", pages="e63353", keywords="medical education", keywords="standardized patient", keywords="AI", keywords="ChatGPT", keywords="virtual patient", keywords="assessment", keywords="standardized patients", keywords="LLM", keywords="effectiveness", keywords="medical school", keywords="qualitative", keywords="flexibility", keywords="diagnostic", abstract="Background: Standardized patients (SPs) have been crucial in medical education, offering realistic patient interactions to students. Despite their benefits, SP training is resource-intensive and access can be limited. Advances in artificial intelligence (AI), particularly with large language models such as ChatGPT, present new opportunities for virtual SPs, potentially addressing these limitations. Objectives: This study aims to assess medical students' perceptions and experiences of using ChatGPT as an SP and to evaluate ChatGPT's effectiveness in performing as a virtual SP in a medical school setting. Methods: This qualitative study, approved by the American University of Antigua Institutional Review Board, involved 9 students (5 females and 4 males, aged 22-48 years) from the American University of Antigua College of Medicine. Students were observed during a live role-play, interacting with ChatGPT as an SP using a predetermined prompt. A structured 15-question survey was administered before and after the interaction. Thematic analysis was conducted on the transcribed and coded responses, with inductive category formation. Results: Thematic analysis identified key themes preinteraction, including technology limitations (eg, prompt engineering difficulties), learning efficacy (eg, potential for personalized learning and reduced interview stress), verisimilitude (eg, absence of visual cues), and trust (eg, concerns about AI accuracy). Postinteraction, students noted improvements in prompt engineering, some alignment issues (eg, limited responses on sensitive topics), maintained learning efficacy (eg, convenience and repetition), and continued verisimilitude challenges (eg, lack of empathy and nonverbal cues). No significant trust issues were reported postinteraction. Despite some limitations, students found ChatGPT to be a valuable supplement to traditional SPs, enhancing practice flexibility and diagnostic skills. Conclusions: ChatGPT can effectively augment traditional SPs in medical education, offering accessible, flexible practice opportunities. However, it cannot fully replace human SPs due to limitations in verisimilitude and prompt engineering challenges. Integrating prompt engineering into medical curricula and continuous advancements in AI are recommended to enhance the use of virtual SPs.
", doi="10.2196/63353", url="https://mededu.jmir.org/2025/1/e63353" } @Article{info:doi/10.2196/58801, author="Ozkan, Ecem and Tekin, Aysun and Ozkan, Can Mahmut and Cabrera, Daniel and Niven, Alexander and Dong, Yue", title="Global Health care Professionals' Perceptions of Large Language Model Use In Practice: Cross-Sectional Survey Study", journal="JMIR Med Educ", year="2025", month="May", day="12", volume="11", pages="e58801", keywords="ChatGPT", keywords="LLM", keywords="global", keywords="health care professionals", keywords="large language model", keywords="language model", keywords="chatbot", keywords="AI", keywords="diagnostic accuracy", keywords="efficiency", keywords="treatment planning", keywords="patient outcome", keywords="patient care", keywords="survey", keywords="physicians", keywords="nurses", keywords="educators", keywords="patient communication", keywords="clinical", keywords="educational", keywords="utilization", keywords="artificial intelligence", abstract="Background: ChatGPT is a large language model-based chatbot developed by OpenAI. ChatGPT has many potential applications to health care, including enhanced diagnostic accuracy and efficiency, improved treatment planning, and better patient outcomes. However, health care professionals' perceptions of ChatGPT and similar artificial intelligence tools are not well known. Understanding these attitudes is important to inform the best approaches to exploring their use in medicine. Objective: Our aim was to evaluate the health care professionals' awareness and perceptions regarding potential applications of ChatGPT in the medical field, including potential benefits and challenges of adoption. Methods: We designed a 33-question online survey that was distributed among health care professionals via targeted emails and professional Twitter and LinkedIn accounts. The survey included a range of questions to define respondents' demographic characteristics, familiarity with ChatGPT, perceptions of this tool's usefulness and reliability, and opinions on its potential to improve patient care, research, and education efforts. Results: One hundred and fifteen health care professionals from 21 countries responded to the survey, including physicians, nurses, researchers, and educators. Of these, 101 (87.8\%) had heard of ChatGPT, mainly from peers, social media, and news, and 77 (76.2\%) had used ChatGPT at least once. Participants found ChatGPT to be helpful for writing manuscripts (n=31, 45.6\%), emails (n=25, 36.8\%), and grants (n=12, 17.6\%); accessing the latest research and evidence-based guidelines (n=21, 30.9\%); providing suggestions on diagnosis or treatment (n=15, 22.1\%); and improving patient communication (n=12, 17.6\%). Respondents also felt that the ability of ChatGPT to access and summarize research articles (n=22, 46.8\%), provide quick answers to clinical questions (n=15, 31.9\%), and generate patient education materials (n=10, 21.3\%) was helpful. However, there are concerns regarding the use of ChatGPT, for example, the accuracy of responses (n=14, 29.8\%), limited applicability in specific practices (n=18, 38.3\%), and legal and ethical considerations (n=6, 12.8\%), mainly related to plagiarism or copyright violations. Participants stated that safety protocols such as data encryption (n=63, 62.4\%) and access control (n=52, 51.5\%) could assist in ensuring patient privacy and data security. 
Conclusions: Our findings show that ChatGPT use is widespread among health care professionals in daily clinical, research, and educational activities. The majority of our participants found ChatGPT to be useful; however, there are concerns about patient privacy, data security, and its legal and ethical issues as well as the accuracy of its information. Further studies are required to understand the impact of ChatGPT and other large language models on clinical, educational, and research outcomes, and the concerns regarding its use must be addressed systematically and through appropriate methods. ", doi="10.2196/58801", url="https://mededu.jmir.org/2025/1/e58801" } @Article{info:doi/10.2196/68527, author="Yano, Yuichiro and Ohashi, Mizuki and Miyagami, Taiju and Mori, Hirotake and Nishizaki, Yuji and Daida, Hiroyuki and Naito, Toshio", title="The Advanced Reasoning Capabilities of Large Language Models for Detecting Contraindicated Options in Medical Exams", journal="JMIR Med Inform", year="2025", month="May", day="12", volume="13", pages="e68527", keywords="natural language processing", keywords="artificial intelligence", keywords="clinical reasoning", keywords="medical errors", keywords="large language model", doi="10.2196/68527", url="https://medinform.jmir.org/2025/1/e68527" } @Article{info:doi/10.2196/67926, author="Elabd, Noor and Rahman, Muhammad Zafirah and Abu Alinnin, Ibrahim Salma and Jahan, Samiyah and Campos, Aparecida Luciana and Baltatu, Constantin Ovidiu", title="Designing Personalized Multimodal Mnemonics With AI: A Medical Student's Implementation Tutorial", journal="JMIR Med Educ", year="2025", month="May", day="8", volume="11", pages="e67926", keywords="medical education", keywords="personalized learning", keywords="prompt engineering", keywords="multimodal learning", keywords="memory techniques", keywords="dual-coding theory", keywords="student-centered approach", keywords="student-centered", keywords="large language model", keywords="natural language processing", keywords="NLP", keywords="machine learning", keywords="AI", keywords="ChatGPT", keywords="medical student", keywords="digital literacy", keywords="health care professional", abstract="Background: Medical education can be challenging for students as they must manage vast amounts of complex information. Traditional mnemonic resources often follow a standardized approach, which may not accommodate diverse learning styles. Objective: This tutorial presents a student-developed approach to creating personalized multimodal mnemonics (PMMs) using artificial intelligence tools. Methods: This tutorial demonstrates a structured implementation process using ChatGPT (GPT-4 model) for text mnemonic generation and DALL-E 3 for visual mnemonic creation. We detail the prompt engineering framework, including zero-shot, few-shot, and chain-of-thought prompting techniques. The process involves (1) template development, (2) refinement, (3) personalization, (4) mnemonic specification, and (5) quality control. The implementation time typically ranges from 2 to 5 minutes per concept, with 1 to 3 iterations needed for optimal results. Results: Through systematic testing across 6 medical concepts, the implementation process achieved an initial success rate of 85\%, improving to 95\% after refinement.
Key challenges included maintaining medical accuracy (addressed through specific terminology in prompts), ensuring visual clarity (improved through anatomical detail specifications), and achieving integration of text and visuals (resolved through structured review protocols). This tutorial provides practical templates, troubleshooting strategies, and quality control measures to address common implementation challenges. Conclusions: This tutorial offers medical students a practical framework for creating personalized learning tools using artificial intelligence. By following the detailed prompt engineering process and quality control measures, students can efficiently generate customized mnemonics while avoiding common pitfalls. The approach emphasizes human oversight and iterative refinement to ensure medical accuracy and educational value. The elimination of the need for developing separate databases of mnemonics streamlines the learning process. ", doi="10.2196/67926", url="https://mededu.jmir.org/2025/1/e67926" } @Article{info:doi/10.2196/70420, author="Quon, Stephanie and Zhou, Sarah", title="Enhancing AI-Driven Medical Translations: Considerations for Language Concordance", journal="JMIR Med Educ", year="2025", month="Apr", day="11", volume="11", pages="e70420", keywords="letter to the editor", keywords="ChatGPT", keywords="AI", keywords="artificial intelligence", keywords="language", keywords="translation", keywords="health care disparity", keywords="natural language model", keywords="survey", keywords="patient education", keywords="accessibility", keywords="preference", keywords="human language", keywords="communication", keywords="language-concordant care", doi="10.2196/70420", url="https://mededu.jmir.org/2025/1/e70420" } @Article{info:doi/10.2196/71721, author="Teng, Joyce and Novoa, Andres Roberto and Aleshin, Alexandrovna Maria and Lester, Jenna and Seiger, Kira and Dzuali, Fiatsogbe and Daneshjou, Roxana", title="Authors' Reply: Enhancing AI-Driven Medical Translations: Considerations for Language Concordance", journal="JMIR Med Educ", year="2025", month="Apr", day="11", volume="11", pages="e71721", keywords="ChatGPT", keywords="artificial intelligence", keywords="language", keywords="translation", keywords="health care disparity", keywords="natural language model", keywords="survey", keywords="patient education", keywords="accessibility", keywords="preference", keywords="human language", keywords="communication", keywords="language-concordant care", doi="10.2196/71721", url="https://mededu.jmir.org/2025/1/e71721" } @Article{info:doi/10.2196/67244, author="Bolgova, Olena and Shypilova, Inna and Mavrych, Volodymyr", title="Large Language Models in Biochemistry Education: Comparative Evaluation of Performance", journal="JMIR Med Educ", year="2025", month="Apr", day="10", volume="11", pages="e67244", keywords="ChatGPT", keywords="Claude", keywords="Gemini", keywords="Copilot", keywords="biochemistry", keywords="LLM", keywords="medical education", keywords="artificial intelligence", keywords="NLP", keywords="natural language processing", keywords="machine learning", keywords="large language model", keywords="AI", keywords="ML", keywords="comprehensive analysis", keywords="medical students", keywords="GPT-4", keywords="questionnaire", keywords="medical course", keywords="bioenergetics", abstract="Background: Recent advancements in artificial intelligence (AI), particularly in large language models (LLMs), have started a new era of innovation across various fields, with medicine at 
the forefront of this technological revolution. Many studies indicated that at the current level of development, LLMs can pass different board exams. However, the ability to answer specific subject-related questions requires validation. Objective: The objective of this study was to conduct a comprehensive analysis comparing the performance of advanced LLM chatbots---Claude (Anthropic), GPT-4 (OpenAI), Gemini (Google), and Copilot (Microsoft)---against the academic results of medical students in the medical biochemistry course. Methods: We used 200 USMLE (United States Medical Licensing Examination)--style multiple-choice questions (MCQs) selected from the course exam database. They encompassed various complexity levels and were distributed across 23 distinctive topics. The questions with tables and images were not included in the study. The results of 5 successive attempts by Claude 3.5 Sonnet, GPT-4-1106, Gemini 1.5 Flash, and Copilot to answer this questionnaire set were evaluated based on accuracy in August 2024. Statistica 13.5.0.17 (TIBCO Software Inc) was used to analyze the data's basic statistics. Considering the binary nature of the data, the chi-square test was used to compare results among the different chatbots, with a statistical significance level of P<.05. Results: On average, the selected chatbots correctly answered 81.1\% (SD 12.8\%) of the questions, surpassing the students' performance by 8.3\% (P=.02). In this study, Claude showed the best performance in biochemistry MCQs, correctly answering 92.5\% (185/200) of questions, followed by GPT-4 (170/200, 85\%), Gemini (157/200, 78.5\%), and Copilot (128/200, 64\%). The chatbots demonstrated the best results in the following 4 topics: eicosanoids (mean 100\%, SD 0\%), bioenergetics and electron transport chain (mean 96.4\%, SD 7.2\%), hexose monophosphate pathway (mean 91.7\%, SD 16.7\%), and ketone bodies (mean 93.8\%, SD 12.5\%). The Pearson chi-square test indicated a statistically significant association between the answers of all 4 chatbots (P<.001 to P<.04). Conclusions: Our study suggests that different AI models may have unique strengths in specific medical fields, which could be leveraged for targeted support in biochemistry courses. This performance highlights the potential of AI in medical education and assessment. ", doi="10.2196/67244", url="https://mededu.jmir.org/2025/1/e67244" } @Article{info:doi/10.2196/67883, author="Wei, Bin and Yao, Lili and Hu, Xin and Hu, Yuxiang and Rao, Jie and Ji, Yu and Dong, Zhuoer and Duan, Yichong and Wu, Xiaorong", title="Evaluating the Effectiveness of Large Language Models in Providing Patient Education for Chinese Patients With Ocular Myasthenia Gravis: Mixed Methods Study", journal="J Med Internet Res", year="2025", month="Apr", day="10", volume="27", pages="e67883", keywords="LLM", keywords="large language models", keywords="ocular myasthenia gravis", keywords="patient education", keywords="China", keywords="effectiveness", keywords="deep learning", keywords="artificial intelligence", keywords="health care", keywords="accuracy", keywords="applicability", keywords="neuromuscular disorder", keywords="extraocular muscles", keywords="ptosis", keywords="diplopia", keywords="ophthalmology", keywords="ChatGPT", keywords="clinical practice", keywords="digital health", abstract="Background: Ocular myasthenia gravis (OMG) is a neuromuscular disorder primarily affecting the extraocular muscles, leading to ptosis and diplopia.
Effective patient education is crucial for disease management; however, in China, limited health care resources often restrict patients' access to personalized medical guidance. Large language models (LLMs) have emerged as potential tools to bridge this gap by providing instant, AI-driven health information. However, their accuracy and readability in educating patients with OMG remain uncertain. Objective: The purpose of this study was to systematically evaluate the effectiveness of multiple LLMs in the education of Chinese patients with OMG. Specifically, the validity of these models in answering OMG-related patient questions was assessed through accuracy, completeness, readability, usefulness, and safety, and patients' ratings of their usability and readability were analyzed. Methods: The study was conducted in two phases: 130 multiple-choice ophthalmology examination questions were input into 5 different LLMs. Their performance was compared with that of undergraduates, master's students, and ophthalmology residents. In addition, 23 common OMG-related patient questions were posed to 4 LLMs, and their responses were evaluated by ophthalmologists across 5 domains. In the second phase, 20 patients with OMG interacted with the 2 LLMs from the first phase, each asking 3 questions. Patients assessed the responses for satisfaction and readability, while ophthalmologists evaluated the responses again using the 5 domains. Results: ChatGPT o1-preview achieved the highest accuracy rate of 73\% on 130 ophthalmology examination questions, outperforming other LLMs and professional groups like undergraduates and master's students. For the 23 common OMG-related patient questions, ChatGPT o1-preview scored highest in correctness (4.44), completeness (4.44), helpfulness (4.47), and safety (4.6). GEMINI (Google DeepMind) provided the easiest-to-understand responses in readability assessments, while GPT-4o had the most complex responses, suitable for readers with higher education levels. In the second phase with 20 patients with OMG, ChatGPT o1-preview received higher satisfaction scores than Ernie 3.5 (Baidu; 4.40 vs 3.89, P=.002), although Ernie 3.5's responses were slightly more readable (4.31 vs 4.03, P=.01). Conclusions: LLMs such as ChatGPT o1-preview may have the potential to enhance patient education. Addressing challenges such as misinformation risk, readability issues, and ethical considerations is crucial for their effective and safe integration into clinical practice. ", doi="10.2196/67883", url="https://www.jmir.org/2025/1/e67883" } @Article{info:doi/10.2196/72998, author="Zhang, Manlin and Zhao, Tianyu", title="Citation Accuracy Challenges Posed by Large Language Models", journal="JMIR Med Educ", year="2025", month="Apr", day="2", volume="11", pages="e72998", keywords="chatGPT", keywords="medical education", keywords="Saudi Arabia", keywords="perceptions", keywords="knowledge", keywords="medical students", keywords="faculty", keywords="chatbot", keywords="qualitative study", keywords="artificial intelligence", keywords="AI", keywords="AI-based tools", keywords="universities", keywords="thematic analysis", keywords="learning", keywords="satisfaction", keywords="LLM", keywords="large language model", doi="10.2196/72998", url="https://mededu.jmir.org/2025/1/e72998" } @Article{info:doi/10.2196/73698, author="Temsah, Mohamad-Hani and Al-Eyadhy, Ayman and Jamal, Amr and Alhasan, Khalid and Malki, H.
Khalid", title="Authors' Reply: Citation Accuracy Challenges Posed by Large Language Models", journal="JMIR Med Educ", year="2025", month="Apr", day="2", volume="11", pages="e73698", keywords="ChatGPT", keywords="Gemini", keywords="DeepSeek", keywords="medical education", keywords="AI", keywords="artificial intelligence", keywords="Saudi Arabia", keywords="perceptions", keywords="medical students", keywords="faculty", keywords="LLM", keywords="chatbot", keywords="qualitative study", keywords="thematic analysis", keywords="satisfaction", keywords="RAG retrieval-augmented generation", doi="10.2196/73698", url="https://mededu.jmir.org/2025/1/e73698" } @Article{info:doi/10.2196/55709, author="Montagna, Marco and Chiabrando, Filippo and De Lorenzo, Rebecca and Rovere Querini, Patrizia and ", title="Impact of Clinical Decision Support Systems on Medical Students' Case-Solving Performance: Comparison Study with a Focus Group", journal="JMIR Med Educ", year="2025", month="Mar", day="18", volume="11", pages="e55709", keywords="chatGPT", keywords="chatbot", keywords="machine learning", keywords="ML", keywords="artificial intelligence", keywords="AI", keywords="algorithm", keywords="predictive model", keywords="predictive analytics", keywords="predictive system", keywords="practical model", keywords="deep learning", keywords="large language models", keywords="LLMs", keywords="medical education", keywords="medical teaching", keywords="teaching environment", keywords="clinical decision support systems", keywords="CDSS", keywords="decision support", keywords="decision support tool", keywords="clinical decision-making", keywords="innovative teaching", abstract="Background: Health care practitioners use clinical decision support systems (CDSS) as an aid in the crucial task of clinical reasoning and decision-making. Traditional CDSS are online repositories (ORs) and clinical practice guidelines (CPG). Recently, large language models (LLMs) such as ChatGPT have emerged as potential alternatives. They have proven to be powerful, innovative tools, yet they are not devoid of worrisome risks. Objective: This study aims to explore how medical students perform in an evaluated clinical case through the use of different CDSS tools. Methods: The authors randomly divided medical students into 3 groups, CPG, n=6 (38\%); OR, n=5 (31\%); and ChatGPT, n=5 (31\%); and assigned each group a different type of CDSS for guidance in answering prespecified questions, assessing how students' speed and ability at resolving the same clinical case varied accordingly. External reviewers evaluated all answers based on accuracy and completeness metrics (score: 1?5). The authors analyzed and categorized group scores according to the skill investigated: differential diagnosis, diagnostic workup, and clinical decision-making. Results: Answering time showed a trend for the ChatGPT group to be the fastest. The mean scores for completeness were as follows: CPG 4.0, OR 3.7, and ChatGPT 3.8 (P=.49). The mean scores for accuracy were as follows: CPG 4.0, OR 3.3, and ChatGPT 3.7 (P=.02). Aggregating scores according to the 3 students' skill domains, trends in differences among the groups emerge more clearly, with the CPG group that performed best in nearly all domains and maintained almost perfect alignment between its completeness and accuracy. Conclusions: This hands-on session provided valuable insights into the potential perks and associated pitfalls of LLMs in medical education and practice. 
It suggested a critical need to include teaching in medical degree courses on how to use LLMs properly, as the potential for misuse is evident and real. ", doi="10.2196/55709", url="https://mededu.jmir.org/2025/1/e55709" } @Article{info:doi/10.2196/59210, author="Monzon, Noahlana and Hays, Alan Franklin", title="Leveraging Generative Artificial Intelligence to Improve Motivation and Retrieval in Higher Education Learners", journal="JMIR Med Educ", year="2025", month="Mar", day="11", volume="11", pages="e59210", keywords="educational technology", keywords="retrieval practice", keywords="flipped classroom", keywords="cognitive engagement", keywords="personalized learning", keywords="generative artificial intelligence", keywords="higher education", keywords="university education", keywords="learners", keywords="instructors", keywords="curriculum structure", keywords="learning", keywords="technologies", keywords="innovation", keywords="academic misconduct", keywords="gamification", keywords="self-directed", keywords="socio-economic disparities", keywords="interactive approach", keywords="medical education", keywords="chatGPT", keywords="machine learning", keywords="AI", keywords="large language models", doi="10.2196/59210", url="https://mededu.jmir.org/2025/1/e59210" } @Article{info:doi/10.2196/62779, author="Doru, Berin and Maier, Christoph and Busse, Sophie Johanna and L{\"u}cke, Thomas and Sch{\"o}nhoff, Judith and Enax-Krumova, Elena and Hessler, Steffen and Berger, Maria and Tokic, Marianne", title="Detecting Artificial Intelligence--Generated Versus Human-Written Medical Student Essays: Semirandomized Controlled Study", journal="JMIR Med Educ", year="2025", month="Mar", day="3", volume="11", pages="e62779", keywords="artificial intelligence", keywords="ChatGPT", keywords="large language models", keywords="textual analysis", keywords="writing style", keywords="AI", keywords="chatbot", keywords="LLMs", keywords="detection", keywords="authorship", keywords="medical student", keywords="linguistic quality", keywords="decision-making", keywords="logical coherence", abstract="Background: Large language models, exemplified by ChatGPT, have reached a level of sophistication that makes distinguishing between human- and artificial intelligence (AI)--generated texts increasingly challenging. This has raised concerns in academia, particularly in medicine, where the accuracy and authenticity of written work are paramount. Objective: This semirandomized controlled study aims to examine the ability of 2 blinded expert groups with different levels of content familiarity---medical professionals and humanities scholars with expertise in textual analysis---to distinguish between longer scientific texts in German written by medical students and those generated by ChatGPT. Additionally, the study sought to analyze the reasoning behind their identification choices, particularly the role of content familiarity and linguistic features. Methods: Between May and August 2023, a total of 35 experts (medical: n=22; humanities: n=13) were each presented with 2 pairs of texts on different medical topics. Each pair had similar content and structure: 1 text was written by a medical student, and the other was generated by ChatGPT (version 3.5, March 2023). Experts were asked to identify the AI-generated text and justify their choice. These justifications were analyzed through a multistage, interdisciplinary qualitative analysis to identify relevant textual features.
Before unblinding, experts rated each text on 6 characteristics: linguistic fluency and spelling/grammatical accuracy, scientific quality, logical coherence, expression of knowledge limitations, formulation of future research questions, and citation quality. Univariate tests and multivariate logistic regression analyses were used to examine associations between participants' characteristics, their stated reasons for author identification, and the likelihood of correctly determining a text's authorship. Results: Overall, in 48 out of 69 (70\%) decision rounds, participants accurately identified the AI-generated texts, with minimal difference between groups (medical: 31/43, 72\%; humanities: 17/26, 65\%; odds ratio [OR] 1.37, 95\% CI 0.5-3.9). While content errors had little impact on identification accuracy, stylistic features---particularly redundancy (OR 6.90, 95\% CI 1.01-47.1), repetition (OR 8.05, 95\% CI 1.25-51.7), and thread/coherence (OR 6.62, 95\% CI 1.25-35.2)---played a crucial role in participants' decisions to identify a text as AI-generated. Conclusions: The findings suggest that both medical and humanities experts were able to identify ChatGPT-generated texts in medical contexts, with their decisions largely based on linguistic attributes. The accuracy of identification appears to be independent of experts' familiarity with the text content. As the decision-making process primarily relies on linguistic attributes---such as stylistic features and text coherence---further quasi-experimental studies using texts from other academic disciplines should be conducted to determine whether instructions based on these features can enhance lecturers' ability to distinguish between student-authored and AI-generated work. ", doi="10.2196/62779", url="https://mededu.jmir.org/2025/1/e62779", url="http://www.ncbi.nlm.nih.gov/pubmed/40053752" } @Article{info:doi/10.2196/63400, author="Abouammoh, Noura and Alhasan, Khalid and Aljamaan, Fadi and Raina, Rupesh and Malki, H. Khalid and Altamimi, Ibraheem and Muaygil, Ruaim and Wahabi, Hayfaa and Jamal, Amr and Alhaboob, Ali and Assiri, Assad Rasha and Al-Tawfiq, A. Jaffar and Al-Eyadhy, Ayman and Soliman, Mona and Temsah, Mohamad-Hani", title="Perceptions and Earliest Experiences of Medical Students and Faculty With ChatGPT in Medical Education: Qualitative Study", journal="JMIR Med Educ", year="2025", month="Feb", day="20", volume="11", pages="e63400", keywords="ChatGPT", keywords="medical education", keywords="Saudi Arabia", keywords="perceptions", keywords="knowledge", keywords="medical students", keywords="faculty", keywords="chatbot", keywords="qualitative study", keywords="artificial intelligence", keywords="AI", keywords="AI-based tools", keywords="universities", keywords="thematic analysis", keywords="learning", keywords="satisfaction", abstract="Background: With the rapid development of artificial intelligence technologies, there is a growing interest in the potential use of artificial intelligence--based tools like ChatGPT in medical education. However, there is limited research on the initial perceptions and experiences of faculty and students with ChatGPT, particularly in Saudi Arabia. Objective: This study aimed to explore the earliest knowledge, perceived benefits, concerns, and limitations of using ChatGPT in medical education among faculty and students at a leading Saudi Arabian university. 
Methods: A qualitative exploratory study was conducted in April 2023, involving focused meetings with medical faculty and students with varying levels of ChatGPT experience. A thematic analysis was used to identify key themes and subthemes emerging from the discussions. Results: Participants demonstrated good knowledge of ChatGPT and its functions. The main themes were perceptions of ChatGPT use, potential benefits, and concerns about ChatGPT in research and medical education. The perceived benefits included collecting and summarizing information and saving time and effort. However, concerns and limitations centered around the potential lack of critical thinking in the information provided, the ambiguity of references, limitations of access, trust in the output of ChatGPT, and ethical concerns. Conclusions: This study provides valuable insights into the perceptions and experiences of medical faculty and students regarding the use of newly introduced large language models like ChatGPT in medical education. While the benefits of ChatGPT were recognized, participants also expressed concerns and limitations that require further study before effective integration into medical education, including exploration of the impact of ChatGPT on learning outcomes, student and faculty satisfaction, and the development of critical thinking skills. ", doi="10.2196/63400", url="https://mededu.jmir.org/2025/1/e63400", url="http://www.ncbi.nlm.nih.gov/pubmed/39977012" } @Article{info:doi/10.2196/58766, author="Ichikawa, Tsunagu and Olsen, Elizabeth and Vinod, Arathi and Glenn, Noah and Hanna, Karim and Lund, C. Gregg and Pierce-Talsma, Stacey", title="Generative Artificial Intelligence in Medical Education---Policies and Training at US Osteopathic Medical Schools: Descriptive Cross-Sectional Survey", journal="JMIR Med Educ", year="2025", month="Feb", day="11", volume="11", pages="e58766", keywords="artificial intelligence", keywords="medical education", keywords="faculty development", keywords="policy", keywords="AI", keywords="training", keywords="United States", keywords="school", keywords="university", keywords="college", keywords="institution", keywords="osteopathic", keywords="osteopathy", keywords="curriculum", keywords="student", keywords="faculty", keywords="administrator", keywords="survey", keywords="cross-sectional", abstract="Background: Interest has recently increased in generative artificial intelligence (GenAI), a subset of artificial intelligence that can create new content. Although the publicly available GenAI tools are not specifically trained in the medical domain, they have demonstrated proficiency in a wide range of medical assessments. The future integration of GenAI in medicine remains unknown. However, the rapid availability of GenAI with a chat interface and the potential risks and benefits are the focus of great interest. As with any significant medical advancement or change, medical schools must adapt their curricula to equip students with the skills necessary to become successful physicians. Furthermore, medical schools must ensure that faculty members have the skills to harness these new opportunities to increase their effectiveness as educators. How medical schools currently fulfill their responsibilities is unclear. Colleges of Osteopathic Medicine (COMs) in the United States currently train a significant proportion of the total number of medical students. These COMs are in academic settings ranging from large public research universities to small private institutions.
Therefore, studying COMs will offer a representative sample of the current GenAI integration in medical education. Objective: This study aims to describe the policies and training regarding the specific aspect of GenAI in US COMs, targeting students, faculty, and administrators. Methods: Web-based surveys were sent to deans and Student Government Association (SGA) presidents of the main campuses of fully accredited US COMs. The dean survey included questions regarding current and planned policies and training related to GenAI for students, faculty, and administrators. The SGA president survey included only those questions related to current student policies and training. Results: Responses were received from 81\% (26/32) of COMs surveyed. This included 47\% (15/32) of the deans and 50\% (16/32) of the SGA presidents (with 5 COMs represented by both the deans and the SGA presidents). Most COMs did not have a policy on the student use of GenAI, as reported by the dean (14/15, 93\%) and the SGA president (14/16, 88\%). Of the COMs with no policy, 79\% (11/14) had no formal plans for policy development. Only 1 COM had training for students, which focused entirely on the ethics of using GenAI. Most COMs had no formal plans to provide mandatory (11/14, 79\%) or elective (11/15, 73\%) training. No COM had GenAI policies for faculty or administrators. Eighty percent had no formal plans for policy development. Furthermore, 33.3\% (5/15) of COMs had faculty or administrator GenAI training. Except for examination question development, there was no training to increase faculty or administrator capabilities and efficiency or to decrease their workload. Conclusions: The survey revealed that most COMs lack GenAI policies and training for students, faculty, and administrators. The few institutions with policies or training were extremely limited in scope. Most institutions without current training or policies had no formal plans for development. The lack of current policies and training initiatives suggests inadequate preparedness for integrating GenAI into the medical school environment, thereby relegating the responsibility for ethical guidance and training to individual COM members. ", doi="10.2196/58766", url="https://mededu.jmir.org/2025/1/e58766" } @Article{info:doi/10.2196/63065, author="Elhassan, Elwaleed Safia and Sajid, Raihan Muhammad and Syed, Mariam Amina and Fathima, Afreen Sidrah and Khan, Shehroz Bushra and Tamim, Hala", title="Assessing Familiarity, Usage Patterns, and Attitudes of Medical Students Toward ChatGPT and Other Chat-Based AI Apps in Medical Education: Cross-Sectional Questionnaire Study", journal="JMIR Med Educ", year="2025", month="Jan", day="30", volume="11", pages="e63065", keywords="ChatGPT", keywords="artificial intelligence", keywords="large language model", keywords="medical students", keywords="ethics", keywords="chat-based", keywords="AI apps", keywords="medical education", keywords="social media", keywords="attitude", keywords="AI", abstract="Background: There has been a rise in the popularity of ChatGPT and other chat-based artificial intelligence (AI) apps in medical education. Despite data being available from other parts of the world, there is a significant lack of information on this topic in medical education and research, particularly in Saudi Arabia.
Objective: The primary objective of the study was to examine the familiarity, usage patterns, and attitudes of Alfaisal University medical students toward ChatGPT and other chat-based AI apps in medical education. Methods: This was a cross-sectional study conducted from October 8, 2023, through November 22, 2023. A questionnaire was distributed through social media channels to medical students at Alfaisal University who were 18 years or older. Current Alfaisal University medical students in years 1 through 6, of both genders, were exclusively targeted by the questionnaire. The study was approved by Alfaisal University Institutional Review Board. A $\chi^2$ test was conducted to assess the relationships between gender, year of study, familiarity, and reasons for usage. Results: A total of 293 responses were received, of which 95 (32.4\%) were from men and 198 (67.6\%) were from women. There were 236 (80.5\%) responses from preclinical students and 57 (19.5\%) from clinical students. Overall, males (n=93, 97.9\%) showed more familiarity with ChatGPT compared to females (n=180, 90.9\%; P=.03). Additionally, males also used Google Bard and Microsoft Bing ChatGPT more than females (P<.001). Clinical-year students used ChatGPT significantly more for general writing purposes compared to preclinical students (P=.005). Additionally, 136 (46.4\%) students believed that using ChatGPT and other chat-based AI apps for coursework was ethical, 86 (29.4\%) were neutral, and 71 (24.2\%) considered it unethical (all Ps>.05). Conclusions: Familiarity with and usage of ChatGPT and other chat-based AI apps were common among the students of Alfaisal University. The usage patterns of these apps differ between males and females and between preclinical and clinical-year students. ", doi="10.2196/63065", url="https://mededu.jmir.org/2025/1/e63065" } @Article{info:doi/10.2196/58898, author="Kaewboonlert, Naritsaret and Poontananggul, Jiraphon and Pongsuwan, Natthipong and Bhakdisongkhram, Gun", title="Factors Associated With the Accuracy of Large Language Models in Basic Medical Science Examinations: Cross-Sectional Study", journal="JMIR Med Educ", year="2025", month="Jan", day="13", volume="11", pages="e58898", keywords="accuracy", keywords="performance", keywords="artificial intelligence", keywords="AI", keywords="ChatGPT", keywords="large language model", keywords="LLM", keywords="difficulty index", keywords="basic medical science examination", keywords="cross-sectional study", keywords="medical education", keywords="datasets", keywords="assessment", keywords="medical science", keywords="tool", keywords="Google", abstract="Background: Artificial intelligence (AI) has become widely applied across many fields, including medical education. Content validation and its answers are based on training datasets and the optimization of each model. The accuracy of large language models (LLMs) in basic medical examinations and factors related to their accuracy have also been explored. Objective: We evaluated factors associated with the accuracy of LLMs (GPT-3.5, GPT-4, Google Bard, and Microsoft Bing) in answering multiple-choice questions from basic medical science examinations. Methods: We used questions that were closely aligned with the content and topic distribution of Thailand's Step 1 National Medical Licensing Examination. Variables such as the difficulty index, discrimination index, and question characteristics were collected.
These questions were then simultaneously input into ChatGPT (with GPT-3.5 and GPT-4), Microsoft Bing, and Google Bard, and their responses were recorded. The accuracy of these LLMs and the associated factors were analyzed using multivariable logistic regression. This analysis aimed to assess the effect of various factors on model accuracy, with results reported as odds ratios (ORs). Results: The study revealed that GPT-4 was the top-performing model, with an overall accuracy of 89.07\% (95\% CI 84.76\%-92.41\%), significantly outperforming the others (P<.001). Microsoft Bing followed with an accuracy of 83.69\% (95\% CI 78.85\%-87.80\%), GPT-3.5 at 67.02\% (95\% CI 61.20\%-72.48\%), and Google Bard at 63.83\% (95\% CI 57.92\%-69.44\%). The multivariable logistic regression analysis showed a correlation between question difficulty and model performance, with GPT-4 demonstrating the strongest association. Interestingly, no significant correlation was found between model accuracy and question length, negative wording, clinical scenarios, or the discrimination index for most models, except for Google Bard, which showed varying correlations. Conclusions: The GPT-4 and Microsoft Bing models demonstrated equal and superior accuracy compared to GPT-3.5 and Google Bard in the domain of basic medical science. The accuracy of these models was significantly influenced by the item's difficulty index, indicating that the LLMs are more accurate when answering easier questions. This suggests that the more accurate models, such as GPT-4 and Bing, can be valuable tools for understanding and learning basic medical science concepts. ", doi="10.2196/58898", url="https://mededu.jmir.org/2025/1/e58898" } @Article{info:doi/10.2196/51435, author="Dzuali, Fiatsogbe and Seiger, Kira and Novoa, Roberto and Aleshin, Maria and Teng, Joyce and Lester, Jenna and Daneshjou, Roxana", title="ChatGPT May Improve Access to Language-Concordant Care for Patients With Non--English Language Preferences", journal="JMIR Med Educ", year="2024", month="Dec", day="10", volume="10", pages="e51435", keywords="ChatGPT", keywords="artificial intelligence", keywords="language", keywords="translation", keywords="health care disparity", keywords="natural language model", keywords="survey", keywords="patient education", keywords="preference", keywords="human language", keywords="language-concordant care", doi="10.2196/51435", url="https://mededu.jmir.org/2024/1/e51435" } @Article{info:doi/10.2196/59902, author="Huang, Ting-Yun and Hsieh, Hsing Pei and Chang, Yung-Chun", title="Performance Comparison of Junior Residents and ChatGPT in the Objective Structured Clinical Examination (OSCE) for Medical History Taking and Documentation of Medical Records: Development and Usability Study", journal="JMIR Med Educ", year="2024", month="Nov", day="21", volume="10", pages="e59902", keywords="large language model", keywords="medical history taking", keywords="clinical documentation", keywords="simulation-based evaluation", keywords="OSCE standards", keywords="LLM", abstract="Background: This study explores the cutting-edge abilities of large language models (LLMs) such as ChatGPT in medical history taking and medical record documentation, with a focus on their practical effectiveness in clinical settings---an area vital for the progress of medical artificial intelligence. Objective: Our aim was to assess the capability of ChatGPT versions 3.5 and 4.0 in performing medical history taking and medical record documentation in simulated clinical environments.
The study compared the performance of nonmedical individuals using ChatGPT with that of junior medical residents. Methods: A simulation involving standardized patients was designed to mimic authentic medical history--taking interactions. Five nonmedical participants used ChatGPT versions 3.5 and 4.0 to conduct medical histories and document medical records, mirroring the tasks performed by 5 junior residents in identical scenarios. A total of 10 diverse scenarios were examined. Results: Evaluation of the medical documentation created by laypersons with ChatGPT assistance and those created by junior residents was conducted by 2 senior emergency physicians using audio recordings and the final medical records. The assessment used the Objective Structured Clinical Examination benchmarks in Taiwan as a reference. ChatGPT-4.0 exhibited substantial enhancements over its predecessor and met or exceeded the performance of human counterparts in terms of both checklist and global assessment scores. Although the overall quality of human consultations remained higher, ChatGPT-4.0's proficiency in medical documentation was notably promising. Conclusions: The performance of ChatGPT 4.0 was on par with that of human participants in Objective Structured Clinical Examination evaluations, signifying its potential in medical history and medical record documentation. Despite this, the superiority of human consultations in terms of quality was evident. The study underscores both the promise and the current limitations of LLMs in the realm of clinical practice. ", doi="10.2196/59902", url="https://mededu.jmir.org/2024/1/e59902" } @Article{info:doi/10.2196/51433, author="Ehrett, Carl and Hegde, Sudeep and Andre, Kwame and Liu, Dixizi and Wilson, Timothy", title="Leveraging Open-Source Large Language Models for Data Augmentation in Hospital Staff Surveys: Mixed Methods Study", journal="JMIR Med Educ", year="2024", month="Nov", day="19", volume="10", pages="e51433", keywords="data augmentation", keywords="large language models", keywords="medical education", keywords="natural language processing", keywords="data security", keywords="ethics", keywords="AI", keywords="artificial intelligence", keywords="data privacy", keywords="medical staff", abstract="Background: Generative large language models (LLMs) have the potential to revolutionize medical education by generating tailored learning materials, enhancing teaching efficiency, and improving learner engagement. However, the application of LLMs in health care settings, particularly for augmenting small datasets in text classification tasks, remains underexplored, particularly for cost- and privacy-conscious applications that do not permit the use of third-party services such as OpenAI's ChatGPT. Objective: This study aims to explore the use of open-source LLMs, such as Large Language Model Meta AI (LLaMA) and Alpaca models, for data augmentation in a specific text classification task related to hospital staff surveys. Methods: The surveys were designed to elicit narratives of everyday adaptation by frontline radiology staff during the initial phase of the COVID-19 pandemic. A 2-step process of data augmentation and text classification was conducted. The study generated synthetic data similar to the survey reports using 4 generative LLMs for data augmentation. A different set of 3 classifier LLMs was then used to classify the augmented text for thematic categories. The study evaluated performance on the classification task. 
Results: The overall best-performing combination of LLM, temperature, classifier, and number of synthetic data cases was augmentation with LLaMA 7B at temperature 0.7 with 100 augments, using Robustly Optimized BERT Pretraining Approach (RoBERTa) for the classification task, which achieved an average area under the receiver operating characteristic curve (AUC) of 0.87 (SD 0.02). The results demonstrate that open-source LLMs can enhance text classifiers' performance for small datasets in health care contexts, providing promising pathways for improving medical education processes and patient care practices. Conclusions: The study demonstrates the value of data augmentation with open-source LLMs, highlights the importance of privacy and ethical considerations when using LLMs, and suggests future directions for research in this field. ", doi="10.2196/51433", url="https://mededu.jmir.org/2024/1/e51433" } @Article{info:doi/10.2196/54297, author="Zhou, You and Li, Si-Jia and Tang, Xing-Yi and He, Yi-Chen and Ma, Hao-Ming and Wang, Ao-Qi and Pei, Run-Yuan and Piao, Mei-Hua", title="Using ChatGPT in Nursing: Scoping Review of Current Opinions", journal="JMIR Med Educ", year="2024", month="Nov", day="19", volume="10", pages="e54297", keywords="ChatGPT", keywords="large language model", keywords="nursing", keywords="artificial intelligence", keywords="scoping review", keywords="generative AI", keywords="nursing education", abstract="Background: Since the release of ChatGPT in November 2022, this emerging technology has garnered a lot of attention in various fields, and nursing is no exception. However, to date, no study has comprehensively summarized the status and opinions of using ChatGPT across different nursing fields. Objective: We aim to synthesize the status and opinions of using ChatGPT according to different nursing fields, as well as assess ChatGPT's strengths, weaknesses, and the potential impacts it may cause. Methods: This scoping review was conducted following the framework of Arksey and O'Malley and guided by the PRISMA-ScR (Preferred Reporting Items for Systematic Reviews and Meta-Analyses extension for Scoping Reviews). A comprehensive literature search was conducted in 4 web-based databases (PubMed, Embase, Web of Science, and CINAHL) to identify studies reporting the opinions of using ChatGPT in nursing fields from 2022 to September 3, 2023. The references of the included studies were screened manually to further identify relevant studies. Two authors conducted study screening, eligibility assessment, and data extraction independently. Results: A total of 30 studies were included. The United States (7 studies), Canada (5 studies), and China (4 studies) were countries with the most publications. In terms of fields of concern, studies mainly focused on ``ChatGPT and nursing education'' (20 studies), ``ChatGPT and nursing practice'' (10 studies), and ``ChatGPT and nursing research, writing, and examination'' (6 studies). Six studies addressed the use of ChatGPT in multiple nursing fields. Conclusions: As an emerging artificial intelligence technology, ChatGPT has great potential to revolutionize nursing education, nursing practice, and nursing research. However, researchers, institutions, and administrations still need to critically examine its accuracy, safety, and privacy, as well as academic misconduct and potential ethical issues that it may lead to before applying ChatGPT to practice.
", doi="10.2196/54297", url="https://mededu.jmir.org/2024/1/e54297" } @Article{info:doi/10.2196/56762, author="Ros-Arlanz{\'o}n, Pablo and Perez-Sempere, Angel", title="Evaluating AI Competence in Specialized Medicine: Comparative Analysis of ChatGPT and Neurologists in a Neurology Specialist Examination in Spain", journal="JMIR Med Educ", year="2024", month="Nov", day="14", volume="10", pages="e56762", keywords="artificial intelligence", keywords="ChatGPT", keywords="clinical decision-making", keywords="medical education", keywords="medical knowledge assessment", keywords="OpenAI", abstract="Background: With the rapid advancement of artificial intelligence (AI) in various fields, evaluating its application in specialized medical contexts becomes crucial. ChatGPT, a large language model developed by OpenAI, has shown potential in diverse applications, including medicine. Objective: This study aims to compare the performance of ChatGPT with that of attending neurologists in a real neurology specialist examination conducted in the Valencian Community, Spain, assessing the AI's capabilities and limitations in medical knowledge. Methods: We conducted a comparative analysis using the 2022 neurology specialist examination results from 120 neurologists and responses generated by ChatGPT versions 3.5 and 4. The examination consisted of 80 multiple-choice questions, with a focus on clinical neurology and health legislation. Questions were classified according to Bloom's Taxonomy. Statistical analysis of performance, including the $\kappa$ coefficient for response consistency, was performed. Results: Human participants exhibited a median score of 5.91 (IQR: 4.93-6.76), with 32 neurologists failing to pass. ChatGPT-3.5 ranked 116th out of 122, answering 54.5\% of questions correctly (score 3.94). ChatGPT-4 showed marked improvement, ranking 17th with 81.8\% of correct answers (score 7.57), surpassing several human specialists. No significant variations were observed in the performance on lower-order questions versus higher-order questions. Additionally, ChatGPT-4 demonstrated increased interrater reliability, as reflected by a higher $\kappa$ coefficient of 0.73, compared to ChatGPT-3.5's coefficient of 0.69. Conclusions: This study underscores the evolving capabilities of AI in medical knowledge assessment, particularly in specialized fields. ChatGPT-4's performance, outperforming the median score of human participants in a rigorous neurology examination, represents a significant milestone in AI development, suggesting its potential as an effective tool in specialized medical education and assessment. 
", doi="10.2196/56762", url="https://mededu.jmir.org/2024/1/e56762" } @Article{info:doi/10.2196/56128, author="Goodings, James Anthony and Kajitani, Sten and Chhor, Allison and Albakri, Ahmad and Pastrak, Mila and Kodancha, Megha and Ives, Rowan and Lee, Bin Yoo and Kajitani, Kari", title="Assessment of ChatGPT-4 in Family Medicine Board Examinations Using Advanced AI Learning and Analytical Methods: Observational Study", journal="JMIR Med Educ", year="2024", month="Oct", day="8", volume="10", pages="e56128", keywords="ChatGPT-4", keywords="Family Medicine Board Examination", keywords="artificial intelligence in medical education", keywords="AI performance assessment", keywords="prompt engineering", keywords="ChatGPT", keywords="artificial intelligence", keywords="AI", keywords="medical education", keywords="assessment", keywords="observational", keywords="analytical method", keywords="data analysis", keywords="examination", abstract="Background: This research explores the capabilities of ChatGPT-4 in passing the American Board of Family Medicine (ABFM) Certification Examination. Addressing a gap in existing literature, where earlier artificial intelligence (AI) models showed limitations in medical board examinations, this study evaluates the enhanced features and potential of ChatGPT-4, especially in document analysis and information synthesis. Objective: The primary goal is to assess whether ChatGPT-4, when provided with extensive preparation resources and when using sophisticated data analysis, can achieve a score equal to or above the passing threshold for the Family Medicine Board Examinations. Methods: In this study, ChatGPT-4 was embedded in a specialized subenvironment, ``AI Family Medicine Board Exam Taker,'' designed to closely mimic the conditions of the ABFM Certification Examination. This subenvironment enabled the AI to access and analyze a range of relevant study materials, including a primary medical textbook and supplementary web-based resources. The AI was presented with a series of ABFM-type examination questions, reflecting the breadth and complexity typical of the examination. Emphasis was placed on assessing the AI's ability to interpret and respond to these questions accurately, leveraging its advanced data processing and analysis capabilities within this controlled subenvironment. Results: In our study, ChatGPT-4's performance was quantitatively assessed on 300 practice ABFM examination questions. The AI achieved a correct response rate of 88.67\% (95\% CI 85.08\%-92.25\%) for the Custom Robot version and 87.33\% (95\% CI 83.57\%-91.10\%) for the Regular version. Statistical analysis, including the McNemar test (P=.45), indicated no significant difference in accuracy between the 2 versions. In addition, the chi-square test for error-type distribution (P=.32) revealed no significant variation in the pattern of errors across versions. These results highlight ChatGPT-4's capacity for high-level performance and consistency in responding to complex medical examination questions under controlled conditions. Conclusions: The study demonstrates that ChatGPT-4, particularly when equipped with specialized preparation and when operating in a tailored subenvironment, shows promising potential in handling the intricacies of medical board examinations. While its performance is comparable with the expected standards for passing the ABFM Certification Examination, further enhancements in AI technology and tailored training methods could push these capabilities to new heights. 
This exploration opens avenues for integrating AI tools such as ChatGPT-4 in medical education and assessment, emphasizing the importance of continuous advancement and specialized training in medical applications of AI. ", doi="10.2196/56128", url="https://mededu.jmir.org/2024/1/e56128" } @Article{info:doi/10.2196/52746, author="Wu, Zelin and Gan, Wenyi and Xue, Zhaowen and Ni, Zhengxin and Zheng, Xiaofei and Zhang, Yiyi", title="Performance of ChatGPT on Nursing Licensure Examinations in the United States and China: Cross-Sectional Study", journal="JMIR Med Educ", year="2024", month="Oct", day="3", volume="10", pages="e52746", keywords="artificial intelligence", keywords="ChatGPT", keywords="nursing licensure examination", keywords="nursing", keywords="LLMs", keywords="large language models", keywords="nursing education", keywords="AI", keywords="nursing student", keywords="large language model", keywords="licensing", keywords="observation", keywords="observational study", keywords="China", keywords="USA", keywords="United States of America", keywords="auxiliary tool", keywords="accuracy rate", keywords="theoretical", abstract="Background: The creation of large language models (LLMs) such as ChatGPT is an important step in the development of artificial intelligence, which shows great potential in medical education due to its powerful language understanding and generative capabilities. The purpose of this study was to quantitatively evaluate and comprehensively analyze ChatGPT's performance in handling questions for the National Nursing Licensure Examination (NNLE) in China and the United States, including the National Council Licensure Examination for Registered Nurses (NCLEX-RN) and the NNLE. Objective: This study aims to examine how well LLMs respond to the NCLEX-RN and the NNLE multiple-choice questions (MCQs) in various language inputs. To evaluate whether LLMs can be used as multilingual learning assistance for nursing, and to assess whether they possess a repository of professional knowledge applicable to clinical nursing practice. Methods: First, we compiled 150 NCLEX-RN Practical MCQs, 240 NNLE Theoretical MCQs, and 240 NNLE Practical MCQs. Then, the translation function of ChatGPT 3.5 was used to translate NCLEX-RN questions from English to Chinese and NNLE questions from Chinese to English. Finally, the original version and the translated version of the MCQs were inputted into ChatGPT 4.0, ChatGPT 3.5, and Google Bard. Different LLMs were compared according to the accuracy rate, and the differences between different language inputs were compared. Results: The accuracy rates of ChatGPT 4.0 for NCLEX-RN practical questions and Chinese-translated NCLEX-RN practical questions were 88.7\% (133/150) and 79.3\% (119/150), respectively. Despite the statistical significance of the difference (P=.03), the correct rate was generally satisfactory. Around 71.9\% (169/235) of NNLE Theoretical MCQs and 69.1\% (161/233) of NNLE Practical MCQs were correctly answered by ChatGPT 4.0. The accuracy of ChatGPT 4.0 in processing NNLE Theoretical MCQs and NNLE Practical MCQs translated into English was 71.5\% (168/235; P=.92) and 67.8\% (158/233; P=.77), respectively, and there was no statistically significant difference between the results of text input in different languages. 
ChatGPT 3.5 (NCLEX-RN P=.003, NNLE Theoretical P<.001, NNLE Practical P=.12) and Google Bard (NCLEX-RN P<.001, NNLE Theoretical P<.001, NNLE Practical P<.001) had lower accuracy rates for nursing-related MCQs than ChatGPT 4.0 in English input. English accuracy was higher when compared with ChatGPT 3.5's Chinese input, and the difference was statistically significant (NCLEX-RN P=.02, NNLE Practical P=.02). Whether submitted in Chinese or English, the MCQs from the NCLEX-RN and NNLE demonstrated that ChatGPT 4.0 had the highest number of unique correct responses and the lowest number of unique incorrect responses among the 3 LLMs. Conclusions: This study, focusing on 618 nursing MCQs including NCLEX-RN and NNLE exams, found that ChatGPT 4.0 outperformed ChatGPT 3.5 and Google Bard in accuracy. It excelled in processing English and Chinese inputs, underscoring its potential as a valuable tool in nursing education and clinical decision-making. ", doi="10.2196/52746", url="https://mededu.jmir.org/2024/1/e52746" } @Article{info:doi/10.2196/52346, author="Claman, Daniel and Sezgin, Emre", title="Artificial Intelligence in Dental Education: Opportunities and Challenges of Large Language Models and Multimodal Foundation Models", journal="JMIR Med Educ", year="2024", month="Sep", day="27", volume="10", pages="e52346", keywords="artificial intelligence", keywords="large language models", keywords="dental education", keywords="GPT", keywords="ChatGPT", keywords="periodontal health", keywords="AI", keywords="LLM", keywords="LLMs", keywords="chatbot", keywords="natural language", keywords="generative pretrained transformer", keywords="innovation", keywords="technology", keywords="large language model", doi="10.2196/52346", url="https://mededu.jmir.org/2024/1/e52346" } @Article{info:doi/10.2196/56859, author="Yoon, Soo-Hyuk and Oh, Kyeong Seok and Lim, Gun Byung and Lee, Ho-Jin", title="Performance of ChatGPT in the In-Training Examination for Anesthesiology and Pain Medicine Residents in South Korea: Observational Study", journal="JMIR Med Educ", year="2024", month="Sep", day="16", volume="10", pages="e56859", keywords="AI tools", keywords="problem solving", keywords="anesthesiology", keywords="artificial intelligence", keywords="pain medicine", keywords="ChatGPT", keywords="health care", keywords="medical education", keywords="South Korea", abstract="Background: ChatGPT has been tested in health care, including the US Medical Licensing Examination and specialty exams, showing near-passing results. Its performance in the field of anesthesiology has been assessed using English board examination questions; however, its effectiveness in Korea remains unexplored. Objective: This study investigated the problem-solving performance of ChatGPT in the fields of anesthesiology and pain medicine in the Korean language context, highlighted advancements in artificial intelligence (AI), and explored its potential applications in medical education. Methods: We investigated the performance (number of correct answers/number of questions) of GPT-4, GPT-3.5, and CLOVA X in the fields of anesthesiology and pain medicine, using in-training examinations that have been administered to Korean anesthesiology residents over the past 5 years, with an annual composition of 100 questions. Questions containing images, diagrams, or photographs were excluded from the analysis. 
Furthermore, to assess the performance differences of the GPT across different languages, we conducted a comparative analysis of the GPT-4's problem-solving proficiency using both the original Korean texts and their English translations. Results: A total of 398 questions were analyzed. GPT-4 (67.8\%) demonstrated a significantly better overall performance than GPT-3.5 (37.2\%) and CLOVA-X (36.7\%). However, GPT-3.5 and CLOVA X did not show significant differences in their overall performance. Additionally, the GPT-4 showed superior performance on questions translated into English, indicating a language processing discrepancy (English: 75.4\% vs Korean: 67.8\%; difference 7.5\%; 95\% CI 3.1\%-11.9\%; P=.001). Conclusions: This study underscores the potential of AI tools, such as ChatGPT, in medical education and practice but emphasizes the need for cautious application and further refinement, especially in non-English medical contexts. The findings suggest that although AI advancements are promising, they require careful evaluation and development to ensure acceptable performance across diverse linguistic and professional settings. ", doi="10.2196/56859", url="https://mededu.jmir.org/2024/1/e56859" } @Article{info:doi/10.2196/60501, author="Zaghir, Jamil and Naguib, Marco and Bjelogrlic, Mina and N{\'e}v{\'e}ol, Aur{\'e}lie and Tannier, Xavier and Lovis, Christian", title="Prompt Engineering Paradigms for Medical Applications: Scoping Review", journal="J Med Internet Res", year="2024", month="Sep", day="10", volume="26", pages="e60501", keywords="prompt engineering", keywords="prompt design", keywords="prompt learning", keywords="prompt tuning", keywords="large language models", keywords="LLMs", keywords="scoping review", keywords="clinical natural language processing", keywords="natural language processing", keywords="NLP", keywords="medical texts", keywords="medical application", keywords="medical applications", keywords="clinical practice", keywords="privacy", keywords="medicine", keywords="computer science", keywords="medical informatics", abstract="Background: Prompt engineering, focusing on crafting effective prompts to large language models (LLMs), has garnered attention for its capabilities at harnessing the potential of LLMs. This is even more crucial in the medical domain due to its specialized terminology and language technicity. Clinical natural language processing applications must navigate complex language and ensure privacy compliance. Prompt engineering offers a novel approach by designing tailored prompts to guide models in exploiting clinically relevant information from complex medical texts. Despite its promise, the efficacy of prompt engineering in the medical domain remains to be fully explored. Objective: The aim of the study is to review research efforts and technical approaches in prompt engineering for medical applications as well as provide an overview of opportunities and challenges for clinical practice. Methods: Databases indexing the fields of medicine, computer science, and medical informatics were queried in order to identify relevant published papers. Since prompt engineering is an emerging field, preprint databases were also considered. Multiple data were extracted, such as the prompt paradigm, the involved LLMs, the languages of the study, the domain of the topic, the baselines, and several learning, design, and architecture strategies specific to prompt engineering. 
We include studies that apply prompt engineering--based methods to the medical domain, published between 2022 and 2024, and covering multiple prompt paradigms such as prompt learning (PL), prompt tuning (PT), and prompt design (PD). Results: We included 114 recent prompt engineering studies. Among the 3 prompt paradigms, we have observed that PD is the most prevalent (78 papers). In 12 papers, PD, PL, and PT terms were used interchangeably. While ChatGPT is the most commonly used LLM, we have identified 7 studies using this LLM on a sensitive clinical data set. Chain-of-thought, present in 17 studies, emerges as the most frequent PD technique. While PL and PT papers typically provide a baseline for evaluating prompt-based approaches, 61\% (48/78) of the PD studies do not report any nonprompt-related baseline. Finally, we individually examine each of the key prompt engineering--specific information reported across papers and find that many studies neglect to explicitly mention them, posing a challenge for advancing prompt engineering research. Conclusions: In addition to reporting on trends and the scientific landscape of prompt engineering, we provide reporting guidelines for future studies to help advance research in the medical field. We also disclose tables and figures summarizing medical prompt engineering papers available and hope that future contributions will leverage these existing works to better advance the field. ", doi="10.2196/60501", url="https://www.jmir.org/2024/1/e60501", url="http://www.ncbi.nlm.nih.gov/pubmed/39255030" } @Article{info:doi/10.2196/58478, author="Reis, Florian and Lenz, Christian and Gossen, Manfred and Volk, Hans-Dieter and Drzeniek, Michael Norman", title="Practical Applications of Large Language Models for Health Care Professionals and Scientists", journal="JMIR Med Inform", year="2024", month="Sep", day="5", volume="12", pages="e58478", keywords="artificial intelligence", keywords="healthcare", keywords="chatGPT", keywords="large language model", keywords="prompting", keywords="LLM", keywords="applications", keywords="AI", keywords="scientists", keywords="physicians", keywords="health care", doi="10.2196/58478", url="https://medinform.jmir.org/2024/1/e58478" } @Article{info:doi/10.2196/57896, author="Xu, Tianhui and Weng, Huiting and Liu, Fang and Yang, Li and Luo, Yuanyuan and Ding, Ziwei and Wang, Qin", title="Current Status of ChatGPT Use in Medical Education: Potentials, Challenges, and Strategies", journal="J Med Internet Res", year="2024", month="Aug", day="28", volume="26", pages="e57896", keywords="chat generative pretrained transformer", keywords="ChatGPT", keywords="artificial intelligence", keywords="medical education", keywords="natural language processing", keywords="clinical practice", doi="10.2196/57896", url="https://www.jmir.org/2024/1/e57896", url="http://www.ncbi.nlm.nih.gov/pubmed/39196640" } @Article{info:doi/10.2196/50545, author="Thomae, V. Anita and Witt, M. 
Claudia and Barth, J{\"u}rgen", title="Integration of ChatGPT Into a Course for Medical Students: Explorative Study on Teaching Scenarios, Students' Perception, and Applications", journal="JMIR Med Educ", year="2024", month="Aug", day="22", volume="10", pages="e50545", keywords="medical education", keywords="ChatGPT", keywords="artificial intelligence", keywords="information for patients", keywords="critical appraisal", keywords="evaluation", keywords="blended learning", keywords="AI", keywords="digital skills", keywords="teaching", abstract="Background: Text-generating artificial intelligence (AI) such as ChatGPT offers many opportunities and challenges in medical education. Acquiring practical skills necessary for using AI in a clinical context is crucial, especially for medical education. Objective: This explorative study aimed to investigate the feasibility of integrating ChatGPT into teaching units and to evaluate the course and the importance of AI-related competencies for medical students. Since a possible application of ChatGPT in the medical field could be the generation of information for patients, we further investigated how such information is perceived by students in terms of persuasiveness and quality. Methods: ChatGPT was integrated into 3 different teaching units of a blended learning course for medical students. Using a mixed methods approach, quantitative and qualitative data were collected. As baseline data, we assessed students' characteristics, including their openness to digital innovation. The students evaluated the integration of ChatGPT into the course and shared their thoughts regarding the future of text-generating AI in medical education. The course was evaluated based on the Kirkpatrick Model, with satisfaction, learning progress, and applicable knowledge considered as key assessment levels. In ChatGPT-integrating teaching units, students evaluated videos featuring information for patients regarding their persuasiveness on treatment expectations in a self-experience experiment and critically reviewed information for patients written using ChatGPT 3.5 based on different prompts. Results: A total of 52 medical students participated in the study. The comprehensive evaluation of the course revealed elevated levels of satisfaction, learning progress, and applicability specifically in relation to the ChatGPT-integrating teaching units. Furthermore, all evaluation levels demonstrated an association with each other. Higher openness to digital innovation was associated with higher satisfaction and, to a lesser extent, with higher applicability. AI-related competencies in other courses of the medical curriculum were perceived as highly important by medical students. Qualitative analysis highlighted potential use cases of ChatGPT in teaching and learning. In ChatGPT-integrating teaching units, students rated information for patients generated using a basic ChatGPT prompt as ``moderate'' in terms of comprehensibility, patient safety, and the correct application of communication rules taught during the course. The students' ratings were considerably improved using an extended prompt. The same text, however, showed the smallest increase in treatment expectations when compared with information provided by humans (patient, clinician, and expert) via videos. Conclusions: This study offers valuable insights into integrating the development of AI competencies into a blended learning course. Integration of ChatGPT enhanced learning experiences for medical students. 
", doi="10.2196/50545", url="https://mededu.jmir.org/2024/1/e50545" } @Article{info:doi/10.2196/59213, author="Holderried, Friederike and Stegemann-Philipps, Christian and Herrmann-Werner, Anne and Festl-Wietek, Teresa and Holderried, Martin and Eickhoff, Carsten and Mahling, Moritz", title="A Language Model--Powered Simulated Patient With Automated Feedback for History Taking: Prospective Study", journal="JMIR Med Educ", year="2024", month="Aug", day="16", volume="10", pages="e59213", keywords="virtual patients communication", keywords="communication skills", keywords="technology enhanced education", keywords="TEL", keywords="medical education", keywords="ChatGPT", keywords="GPT: LLM", keywords="LLMs", keywords="NLP", keywords="natural language processing", keywords="machine learning", keywords="artificial intelligence", keywords="language model", keywords="language models", keywords="communication", keywords="relationship", keywords="relationships", keywords="chatbot", keywords="chatbots", keywords="conversational agent", keywords="conversational agents", keywords="history", keywords="histories", keywords="simulated", keywords="student", keywords="students", keywords="interaction", keywords="interactions", abstract="Background: Although history taking is fundamental for diagnosing medical conditions, teaching and providing feedback on the skill can be challenging due to resource constraints. Virtual simulated patients and web-based chatbots have thus emerged as educational tools, with recent advancements in artificial intelligence (AI) such as large language models (LLMs) enhancing their realism and potential to provide feedback. Objective: In our study, we aimed to evaluate the effectiveness of a Generative Pretrained Transformer (GPT) 4 model to provide structured feedback on medical students' performance in history taking with a simulated patient. Methods: We conducted a prospective study involving medical students performing history taking with a GPT-powered chatbot. To that end, we designed a chatbot to simulate patients' responses and provide immediate feedback on the comprehensiveness of the students' history taking. Students' interactions with the chatbot were analyzed, and feedback from the chatbot was compared with feedback from a human rater. We measured interrater reliability and performed a descriptive analysis to assess the quality of feedback. Results: Most of the study's participants were in their third year of medical school. A total of 1894 question-answer pairs from 106 conversations were included in our analysis. GPT-4's role-play and responses were medically plausible in more than 99\% of cases. Interrater reliability between GPT-4 and the human rater showed ``almost perfect'' agreement (Cohen $\kappa$=0.832). Less agreement ($\kappa$<0.6) detected for 8 out of 45 feedback categories highlighted topics about which the model's assessments were overly specific or diverged from human judgement. Conclusions: The GPT model was effective in providing structured feedback on history-taking dialogs provided by medical students. Although we unraveled some limitations regarding the specificity of feedback for certain feedback categories, the overall high agreement with human raters suggests that LLMs can be a valuable tool for medical education. Our findings, thus, advocate the careful integration of AI-driven feedback mechanisms in medical training and highlight important aspects when LLMs are used in that context. 
", doi="10.2196/59213", url="https://mededu.jmir.org/2024/1/e59213" } @Article{info:doi/10.2196/52784, author="Ming, Shuai and Guo, Qingge and Cheng, Wenjun and Lei, Bo", title="Influence of Model Evolution and System Roles on ChatGPT's Performance in Chinese Medical Licensing Exams: Comparative Study", journal="JMIR Med Educ", year="2024", month="Aug", day="13", volume="10", pages="e52784", keywords="ChatGPT", keywords="Chinese National Medical Licensing Examination", keywords="large language models", keywords="medical education", keywords="system role", keywords="LLM", keywords="LLMs", keywords="language model", keywords="language models", keywords="artificial intelligence", keywords="chatbot", keywords="chatbots", keywords="conversational agent", keywords="conversational agents", keywords="exam", keywords="exams", keywords="examination", keywords="examinations", keywords="OpenAI", keywords="answer", keywords="answers", keywords="response", keywords="responses", keywords="accuracy", keywords="performance", keywords="China", keywords="Chinese", abstract="Background: With the increasing application of large language models like ChatGPT in various industries, its potential in the medical domain, especially in standardized examinations, has become a focal point of research. Objective: The aim of this study is to assess the clinical performance of ChatGPT, focusing on its accuracy and reliability in the Chinese National Medical Licensing Examination (CNMLE). Methods: The CNMLE 2022 question set, consisting of 500 single-answer multiple choices questions, were reclassified into 15 medical subspecialties. Each question was tested 8 to 12 times in Chinese on the OpenAI platform from April 24 to May 15, 2023. Three key factors were considered: the version of GPT-3.5 and 4.0, the prompt's designation of system roles tailored to medical subspecialties, and repetition for coherence. A passing accuracy threshold was established as 60\%. The $\chi$2 tests and $\kappa$ values were employed to evaluate the model's accuracy and consistency. Results: GPT-4.0 achieved a passing accuracy of 72.7\%, which was significantly higher than that of GPT-3.5 (54\%; P<.001). The variability rate of repeated responses from GPT-4.0 was lower than that of GPT-3.5 (9\% vs 19.5\%; P<.001). However, both models showed relatively good response coherence, with $\kappa$ values of 0.778 and 0.610, respectively. System roles numerically increased accuracy for both GPT-4.0 (0.3\%?3.7\%) and GPT-3.5 (1.3\%?4.5\%), and reduced variability by 1.7\% and 1.8\%, respectively (P>.05). In subgroup analysis, ChatGPT achieved comparable accuracy among different question types (P>.05). GPT-4.0 surpassed the accuracy threshold in 14 of 15 subspecialties, while GPT-3.5 did so in 7 of 15 on the first response. Conclusions: GPT-4.0 passed the CNMLE and outperformed GPT-3.5 in key areas such as accuracy, consistency, and medical subspecialty expertise. Adding a system role insignificantly enhanced the model's reliability and answer coherence. GPT-4.0 showed promising potential in medical education and clinical practice, meriting further study. ", doi="10.2196/52784", url="https://mededu.jmir.org/2024/1/e52784" } @Article{info:doi/10.2196/51757, author="Cherrez-Ojeda, Ivan and Gallardo-Bastidas, C. Juan and Robles-Velasco, Karla and Osorio, F. Mar{\'i}a and Velez Leon, Maria Eleonor and Leon Velastegui, Manuel and Pauletto, Patr{\'i}cia and Aguilar-D{\'i}az, C. F. 
and Squassi, Aldo and Gonz{\'a}lez Eras, Patricia Susana and Cordero Carrasco, Erita and Chavez Gonzalez, Leonor Karol and Calderon, C. Juan and Bousquet, Jean and Bedbrook, Anna and Faytong-Haro, Marco", title="Understanding Health Care Students' Perceptions, Beliefs, and Attitudes Toward AI-Powered Language Models: Cross-Sectional Study", journal="JMIR Med Educ", year="2024", month="Aug", day="13", volume="10", pages="e51757", keywords="artificial intelligence", keywords="ChatGPT", keywords="education", keywords="health care", keywords="students", abstract="Background: ChatGPT was not intended for use in health care, but it has potential benefits that depend on end-user understanding and acceptability, which is where health care students become crucial. There is still a limited amount of research in this area. Objective: The primary aim of our study was to assess the frequency of ChatGPT use, the perceived level of knowledge, the perceived risks associated with its use, and the ethical issues, as well as attitudes toward the use of ChatGPT in the context of education in the field of health. In addition, we aimed to examine whether there were differences across groups based on demographic variables. The second part of the study aimed to assess the association between the frequency of use, the level of perceived knowledge, the level of risk perception, and the level of perception of ethics as predictive factors for participants' attitudes toward the use of ChatGPT. Methods: A cross-sectional survey was conducted from May to June 2023 encompassing students of medicine, nursing, dentistry, nutrition, and laboratory science across the Americas. The study used descriptive analysis, chi-square tests, and ANOVA to assess statistical significance across different categories. The study used several ordinal logistic regression models to analyze the impact of predictive factors (frequency of use, perception of knowledge, perception of risk, and ethics perception scores) on attitude as the dependent variable. The models were adjusted for gender, institution type, major, and country. Stata was used to conduct all the analyses. Results: Of 2661 health care students, 42.99\% (n=1144) were unaware of ChatGPT. The median score of knowledge was ``minimal'' (median 2.00, IQR 1.00-3.00). Most respondents (median 2.61, IQR 2.11-3.11) regarded ChatGPT as neither ethical nor unethical. Most participants (median 3.89, IQR 3.44-4.34) ``somewhat agreed'' that ChatGPT (1) benefits health care settings, (2) provides trustworthy data, (3) is a helpful tool for clinical and educational medical information access, and (4) makes the work easier. In total, 70\% (7/10) of people used it for homework. As the perceived knowledge of ChatGPT increased, there was a stronger tendency with regard to having a favorable attitude toward ChatGPT. Higher ethical consideration perception ratings increased the likelihood of considering ChatGPT as a source of trustworthy health care information (odds ratio [OR] 1.620, 95\% CI 1.498-1.752), beneficial in medical issues (OR 1.495, 95\% CI 1.452-1.539), and useful for medical literature (OR 1.494, 95\% CI 1.426-1.564; P<.001 for all results). Conclusions: Over 40\% of American health care students (1144/2661, 42.99\%) were unaware of ChatGPT despite its extensive use in the health field. Our data revealed the positive attitudes toward ChatGPT and the desire to learn more about it. Medical educators must explore how chatbots may be included in undergraduate health care education programs. 
", doi="10.2196/51757", url="https://mededu.jmir.org/2024/1/e51757", url="http://www.ncbi.nlm.nih.gov/pubmed/39137029" } @Article{info:doi/10.2196/59133, author="Takahashi, Hiromizu and Shikino, Kiyoshi and Kondo, Takeshi and Komori, Akira and Yamada, Yuji and Saita, Mizue and Naito, Toshio", title="Educational Utility of Clinical Vignettes Generated in Japanese by ChatGPT-4: Mixed Methods Study", journal="JMIR Med Educ", year="2024", month="Aug", day="13", volume="10", pages="e59133", keywords="generative AI", keywords="ChatGPT-4", keywords="medical case generation", keywords="medical education", keywords="clinical vignettes", keywords="AI", keywords="artificial intelligence", keywords="Japanese", keywords="Japan", abstract="Background: Evaluating the accuracy and educational utility of artificial intelligence--generated medical cases, especially those produced by large language models such as ChatGPT-4 (developed by OpenAI), is crucial yet underexplored. Objective: This study aimed to assess the educational utility of ChatGPT-4--generated clinical vignettes and their applicability in educational settings. Methods: Using a convergent mixed methods design, a web-based survey was conducted from January 8 to 28, 2024, to evaluate 18 medical cases generated by ChatGPT-4 in Japanese. In the survey, 6 main question items were used to evaluate the quality of the generated clinical vignettes and their educational utility, which are information quality, information accuracy, educational usefulness, clinical match, terminology accuracy (TA), and diagnosis difficulty. Feedback was solicited from physicians specializing in general internal medicine or general medicine and experienced in medical education. Chi-square and Mann-Whitney U tests were performed to identify differences among cases, and linear regression was used to examine trends associated with physicians' experience. Thematic analysis of qualitative feedback was performed to identify areas for improvement and confirm the educational utility of the cases. Results: Of the 73 invited participants, 71 (97\%) responded. The respondents, primarily male (64/71, 90\%), spanned a broad range of practice years (from 1976 to 2017) and represented diverse hospital sizes throughout Japan. The majority deemed the information quality (mean 0.77, 95\% CI 0.75-0.79) and information accuracy (mean 0.68, 95\% CI 0.65-0.71) to be satisfactory, with these responses being based on binary data. The average scores assigned were 3.55 (95\% CI 3.49-3.60) for educational usefulness, 3.70 (95\% CI 3.65-3.75) for clinical match, 3.49 (95\% CI 3.44-3.55) for TA, and 2.34 (95\% CI 2.28-2.40) for diagnosis difficulty, based on a 5-point Likert scale. Statistical analysis showed significant variability in content quality and relevance across the cases (P<.001 after Bonferroni correction). Participants suggested improvements in generating physical findings, using natural language, and enhancing medical TA. The thematic analysis highlighted the need for clearer documentation, clinical information consistency, content relevance, and patient-centered case presentations. Conclusions: ChatGPT-4--generated medical cases written in Japanese possess considerable potential as resources in medical education, with recognized adequacy in quality and accuracy. Nevertheless, there is a notable need for enhancements in the precision and realism of case details. 
This study emphasizes ChatGPT-4's value as an adjunctive educational tool in the medical field, requiring expert oversight for optimal application. ", doi="10.2196/59133", url="https://mededu.jmir.org/2024/1/e59133", url="http://www.ncbi.nlm.nih.gov/pubmed/39137031" } @Article{info:doi/10.2196/60083, author="Zhui, Li and Fenghe, Li and Xuehu, Wang and Qining, Fu and Wei, Ren", title="Ethical Considerations and Fundamental Principles of Large Language Models in Medical Education: Viewpoint", journal="J Med Internet Res", year="2024", month="Aug", day="1", volume="26", pages="e60083", keywords="medical education", keywords="artificial intelligence", keywords="large language models", keywords="medical ethics", keywords="AI", keywords="LLMs", keywords="ethics", keywords="academic integrity", keywords="privacy and data risks", keywords="data security", keywords="data protection", keywords="intellectual property rights", keywords="educational research", doi="10.2196/60083", url="https://www.jmir.org/2024/1/e60083", url="http://www.ncbi.nlm.nih.gov/pubmed/38971715" } @Article{info:doi/10.2196/56342, author="Burke, B. Harry and Hoang, Albert and Lopreiato, O. Joseph and King, Heidi and Hemmer, Paul and Montgomery, Michael and Gagarin, Viktoria", title="Assessing the Ability of a Large Language Model to Score Free-Text Medical Student Clinical Notes: Quantitative Study", journal="JMIR Med Educ", year="2024", month="Jul", day="25", volume="10", pages="e56342", keywords="medical education", keywords="generative artificial intelligence", keywords="natural language processing", keywords="ChatGPT", keywords="generative pretrained transformer", keywords="standardized patients", keywords="clinical notes", keywords="free-text notes", keywords="history and physical examination", keywords="large language model", keywords="LLM", keywords="medical student", keywords="medical students", keywords="clinical information", keywords="artificial intelligence", keywords="AI", keywords="patients", keywords="patient", keywords="medicine", abstract="Background: Teaching medical students the skills required to acquire, interpret, apply, and communicate clinical information is an integral part of medical education. A crucial aspect of this process involves providing students with feedback regarding the quality of their free-text clinical notes. Objective: The goal of this study was to assess the ability of ChatGPT 3.5, a large language model, to score medical students' free-text history and physical notes. Methods: This is a single-institution, retrospective study. Standardized patients learned a prespecified clinical case and, acting as the patient, interacted with medical students. Each student wrote a free-text history and physical note of their interaction. The students' notes were scored independently by the standardized patients and ChatGPT using a prespecified scoring rubric that consisted of 85 case elements. The measure of accuracy was percent correct. Results: The study population consisted of 168 first-year medical students. There was a total of 14,280 scores. The ChatGPT incorrect scoring rate was 1.0\%, and the standardized patient incorrect scoring rate was 7.2\%. The ChatGPT error rate was 86\% lower than the standardized patient error rate. The ChatGPT mean incorrect scoring rate of 12 (SD 11) was significantly lower than the standardized patient mean incorrect scoring rate of 85 (SD 74; P=.002). Conclusions: ChatGPT demonstrated a significantly lower error rate compared to standardized patients. 
This is the first study to assess the ability of a generative pretrained transformer (GPT) program to score medical students' standardized patient-based free-text clinical notes. It is expected that, in the near future, large language models will provide real-time feedback to practicing physicians regarding their free-text notes. GPT artificial intelligence programs represent an important advance in medical education and medical practice. ", doi="10.2196/56342", url="https://mededu.jmir.org/2024/1/e56342" } @Article{info:doi/10.2196/58396, author="Kamel Boulos, N. Maged and Dellavalle, Robert", title="NVIDIA's ``Chat with RTX'' Custom Large Language Model and Personalized AI Chatbot Augments the Value of Electronic Dermatology Reference Material", journal="JMIR Dermatol", year="2024", month="Jul", day="24", volume="7", pages="e58396", keywords="AI chatbots", keywords="artificial intelligence", keywords="AI", keywords="generative AI", keywords="large language models", keywords="dermatology", keywords="education", keywords="self-study", keywords="NVIDIA RTX", keywords="retrieval-augmented generation", keywords="RAG", doi="10.2196/58396", url="https://derma.jmir.org/2024/1/e58396" } @Article{info:doi/10.2196/52818, author="Cherif, Hela and Moussa, Chirine and Missaoui, Mouhaymen Abdel and Salouage, Issam and Mokaddem, Salma and Dhahri, Besma", title="Appraisal of ChatGPT's Aptitude for Medical Education: Comparative Analysis With Third-Year Medical Students in a Pulmonology Examination", journal="JMIR Med Educ", year="2024", month="Jul", day="23", volume="10", pages="e52818", keywords="medical education", keywords="ChatGPT", keywords="GPT", keywords="artificial intelligence", keywords="natural language processing", keywords="NLP", keywords="pulmonary medicine", keywords="pulmonary", keywords="lung", keywords="lungs", keywords="respiratory", keywords="respiration", keywords="pneumology", keywords="comparative analysis", keywords="large language models", keywords="LLMs", keywords="LLM", keywords="language model", keywords="generative AI", keywords="generative artificial intelligence", keywords="generative", keywords="exams", keywords="exam", keywords="examinations", keywords="examination", abstract="Background: The rapid evolution of ChatGPT has generated substantial interest and led to extensive discussions in both public and academic domains, particularly in the context of medical education. Objective: This study aimed to evaluate ChatGPT's performance in a pulmonology examination through a comparative analysis with that of third-year medical students. Methods: In this cross-sectional study, we conducted a comparative analysis with 2 distinct groups. The first group comprised 244 third-year medical students who had previously taken our institution's 2020 pulmonology examination, which was conducted in French. The second group involved ChatGPT-3.5 in 2 separate sets of conversations: without contextualization (V1) and with contextualization (V2). In both V1 and V2, ChatGPT received the same set of questions administered to the students. Results: V1 demonstrated exceptional proficiency in radiology, microbiology, and thoracic surgery, surpassing the majority of medical students in these domains. However, it faced challenges in pathology, pharmacology, and clinical pneumology. In contrast, V2 consistently delivered more accurate responses across various question categories, regardless of the specialization. 
ChatGPT exhibited suboptimal performance in multiple choice questions compared to medical students. V2 excelled in responding to structured open-ended questions. Both ChatGPT conversations, particularly V2, outperformed students in addressing questions of low and intermediate difficulty. Interestingly, students showcased enhanced proficiency when confronted with highly challenging questions. V1 fell short of passing the examination. Conversely, V2 successfully achieved examination success, outperforming 139 (62.1\%) medical students. Conclusions: While ChatGPT has access to a comprehensive web-based data set, its performance closely mirrors that of an average medical student. Outcomes are influenced by question format, item complexity, and contextual nuances. The model faces challenges in medical contexts requiring information synthesis, advanced analytical aptitude, and clinical judgment, as well as in non-English language assessments and when confronted with data outside mainstream internet sources. ", doi="10.2196/52818", url="https://mededu.jmir.org/2024/1/e52818" } @Article{info:doi/10.2196/51346, author="Skryd, Anthony and Lawrence, Katharine", title="ChatGPT as a Tool for Medical Education and Clinical Decision-Making on the Wards: Case Study", journal="JMIR Form Res", year="2024", month="May", day="8", volume="8", pages="e51346", keywords="ChatGPT", keywords="medical education", keywords="large language models", keywords="LLMs", keywords="clinical decision-making", abstract="Background: Large language models (LLMs) are computational artificial intelligence systems with advanced natural language processing capabilities that have recently been popularized among health care students and educators due to their ability to provide real-time access to a vast amount of medical knowledge. The adoption of LLM technology into medical education and training has varied, and little empirical evidence exists to support its use in clinical teaching environments. Objective: The aim of the study is to identify and qualitatively evaluate potential use cases and limitations of LLM technology for real-time ward-based educational contexts. Methods: A brief, single-site exploratory evaluation of the publicly available ChatGPT-3.5 (OpenAI) was conducted by implementing the tool into the daily attending rounds of a general internal medicine inpatient service at a large urban academic medical center. ChatGPT was integrated into rounds via both structured and organic use, using the web-based ``chatbot'' style interface to interact with the LLM through conversational free-text and discrete queries. A qualitative approach using phenomenological inquiry was used to identify key insights related to the use of ChatGPT through analysis of ChatGPT conversation logs and associated shorthand notes from the clinical sessions. Results: Identified use cases for ChatGPT integration included addressing medical knowledge gaps through discrete medical knowledge inquiries, building differential diagnoses and engaging dual-process thinking, challenging medical axioms, using cognitive aids to support acute care decision-making, and improving complex care management by facilitating conversations with subspecialties. 
Potential additional uses included engaging in difficult conversations with patients, exploring ethical challenges and general medical ethics teaching, personal continuing medical education resources, developing ward-based teaching tools, supporting and automating clinical documentation, and supporting productivity and task management. LLM biases, misinformation, ethics, and health equity were identified as areas of concern and potential limitations to clinical and training use. A code of conduct on ethical and appropriate use was also developed to guide team usage on the wards. Conclusions: Overall, ChatGPT offers a novel tool to enhance ward-based learning through rapid information querying, second-order content exploration, and engaged team discussion regarding generated responses. More research is needed to fully understand contexts for educational use, particularly regarding the risks and limitations of the tool in clinical settings and its impacts on trainee development. ", doi="10.2196/51346", url="https://formative.jmir.org/2024/1/e51346", url="http://www.ncbi.nlm.nih.gov/pubmed/38717811" } @Article{info:doi/10.2196/55048, author="Rojas, Marcos and Rojas, Marcelo and Burgess, Valentina and Toro-P{\'e}rez, Javier and Salehi, Shima", title="Exploring the Performance of ChatGPT Versions 3.5, 4, and 4 With Vision in the Chilean Medical Licensing Examination: Observational Study", journal="JMIR Med Educ", year="2024", month="Apr", day="29", volume="10", pages="e55048", keywords="artificial intelligence", keywords="AI", keywords="generative artificial intelligence", keywords="medical education", keywords="ChatGPT", keywords="EUNACOM", keywords="medical licensure", keywords="medical license", keywords="medical licensing exam", abstract="Background: The deployment of OpenAI's ChatGPT-3.5 and its subsequent versions, ChatGPT-4 and ChatGPT-4 With Vision (4V; also known as ``GPT-4 Turbo With Vision''), has notably influenced the medical field. Having demonstrated remarkable performance in medical examinations globally, these models show potential for educational applications. However, their effectiveness in non-English contexts, particularly in Chile's medical licensing examinations---a critical step for medical practitioners in Chile---is less explored. This gap highlights the need to evaluate ChatGPT's adaptability to diverse linguistic and cultural contexts. Objective: This study aims to evaluate the performance of ChatGPT versions 3.5, 4, and 4V in the EUNACOM (Examen {\'U}nico Nacional de Conocimientos de Medicina), a major medical examination in Chile. Methods: Three official practice drills (540 questions) from the University of Chile, mirroring the EUNACOM's structure and difficulty, were used to test ChatGPT versions 3.5, 4, and 4V. The 3 ChatGPT versions were provided 3 attempts for each drill. Responses to questions during each attempt were systematically categorized and analyzed to assess their accuracy rate. Results: All versions of ChatGPT passed the EUNACOM drills. Specifically, versions 4 and 4V outperformed version 3.5, achieving average accuracy rates of 79.32\% and 78.83\%, respectively, compared to 57.53\% for version 3.5 (P<.001). Version 4V, however, did not outperform version 4 (P=.73), despite the additional visual capabilities. We also evaluated ChatGPT's performance in different medical areas of the EUNACOM and found that versions 4 and 4V consistently outperformed version 3.5. 
Across the different medical areas, version 3.5 displayed the highest accuracy in psychiatry (69.84\%), while versions 4 and 4V achieved the highest accuracy in surgery (90.00\% and 86.11\%, respectively). Versions 3.5 and 4 had the lowest performance in internal medicine (52.74\% and 75.62\%, respectively), while version 4V had the lowest performance in public health (74.07\%). Conclusions: This study reveals ChatGPT's ability to pass the EUNACOM, with distinct proficiencies across versions 3.5, 4, and 4V. Notably, advancements in artificial intelligence (AI) have not significantly led to enhancements in performance on image-based questions. The variations in proficiency across medical fields suggest the need for more nuanced AI training. Additionally, the study underscores the importance of exploring innovative approaches to using AI to augment human cognition and enhance the learning process. Such advancements have the potential to significantly influence medical education, fostering not only knowledge acquisition but also the development of critical thinking and problem-solving skills among health care professionals. ", doi="10.2196/55048", url="https://mededu.jmir.org/2024/1/e55048" } @Article{info:doi/10.2196/57054, author="Noda, Masao and Ueno, Takayoshi and Koshu, Ryota and Takaso, Yuji and Shimada, Dias Mari and Saito, Chizu and Sugimoto, Hisashi and Fushiki, Hiroaki and Ito, Makoto and Nomura, Akihiro and Yoshizaki, Tomokazu", title="Performance of GPT-4V in Answering the Japanese Otolaryngology Board Certification Examination Questions: Evaluation Study", journal="JMIR Med Educ", year="2024", month="Mar", day="28", volume="10", pages="e57054", keywords="artificial intelligence", keywords="GPT-4v", keywords="large language model", keywords="otolaryngology", keywords="GPT", keywords="ChatGPT", keywords="LLM", keywords="LLMs", keywords="language model", keywords="language models", keywords="head", keywords="respiratory", keywords="ENT: ear", keywords="nose", keywords="throat", keywords="neck", keywords="NLP", keywords="natural language processing", keywords="image", keywords="images", keywords="exam", keywords="exams", keywords="examination", keywords="examinations", keywords="answer", keywords="answers", keywords="answering", keywords="response", keywords="responses", abstract="Background: Artificial intelligence models can learn from medical literature and clinical cases and generate answers that rival human experts. However, challenges remain in the analysis of complex data containing images and diagrams. Objective: This study aims to assess the answering capabilities and accuracy of ChatGPT-4 Vision (GPT-4V) for a set of 100 questions, including image-based questions, from the 2023 otolaryngology board certification examination. Methods: Answers to 100 questions from the 2023 otolaryngology board certification examination, including image-based questions, were generated using GPT-4V. The accuracy rate was evaluated using different prompts, and the presence of images, clinical area of the questions, and variations in the answer content were examined. Results: The accuracy rate for text-only input was, on average, 24.7\% but improved to 47.3\% with the addition of English translation and prompts (P<.001). The average nonresponse rate for text-only input was 46.3\%; this decreased to 2.7\% with the addition of English translation and prompts (P<.001). 
The accuracy rate was lower for image-based questions than for text-only questions across all types of input, with a relatively high nonresponse rate. General questions and questions from the fields of head and neck allergies and nasal allergies had relatively high accuracy rates, which increased with the addition of translation and prompts. In terms of content, questions related to anatomy had the highest accuracy rate. For all content types, the addition of translation and prompts increased the accuracy rate. For image-based questions, the average correct answer rate with text-only input was 30.4\%, and that with text-plus-image input was 41.3\% (P=.02). Conclusions: Examination of artificial intelligence's answering capabilities for the otolaryngology board certification examination improves our understanding of its potential and limitations in this field. Although improvement was noted with the addition of translation and prompts, the accuracy rate for image-based questions was lower than that for text-based questions, suggesting room for improvement in GPT-4V at this stage. Furthermore, text-plus-image input yielded a higher rate of correct answers on image-based questions. Our findings imply the usefulness and potential of GPT-4V in medicine; however, future consideration of safe use methods is needed. ", doi="10.2196/57054", url="https://mededu.jmir.org/2024/1/e57054", url="http://www.ncbi.nlm.nih.gov/pubmed/38546736" } @Article{info:doi/10.2196/49964, author="Gandhi, P. Aravind and Joesph, Karen Felista and Rajagopal, Vineeth and Aparnavi, P. and Katkuri, Sushma and Dayama, Sonal and Satapathy, Prakasini and Khatib, Nazli Mahalaqua and Gaidhane, Shilpa and Zahiruddin, Syed Quazi and Behera, Ashish", title="Performance of ChatGPT on the India Undergraduate Community Medicine Examination: Cross-Sectional Study", journal="JMIR Form Res", year="2024", month="Mar", day="25", volume="8", pages="e49964", keywords="artificial intelligence", keywords="ChatGPT", keywords="community medicine", keywords="India", keywords="large language model", keywords="medical education", keywords="digitalization", abstract="Background: Medical students may increasingly use large language models (LLMs) in their learning. ChatGPT is an LLM at the forefront of this new development in medical education with the capacity to respond to multidisciplinary questions. Objective: The aim of this study was to evaluate the ability of ChatGPT 3.5 to complete the Indian undergraduate medical examination in the subject of community medicine. We further compared ChatGPT scores with the scores obtained by the students. Methods: The study was conducted at a publicly funded medical college in Hyderabad, India. The study was based on the internal assessment examination conducted in January 2023 for students in the Bachelor of Medicine and Bachelor of Surgery Final Year--Part I program; the examination of focus included 40 questions (divided between two papers) from the community medicine subject syllabus. Each paper had three sections with different weightage of marks for each section: section one had two long essay--type questions worth 15 marks each, section two had 8 short essay--type questions worth 5 marks each, and section three had 10 short-answer questions worth 3 marks each. The same questions were administered as prompts to ChatGPT 3.5 and the responses were recorded. 
Apart from scoring ChatGPT responses, two independent evaluators explored the responses to each question to further analyze their quality with regard to three subdomains: relevancy, coherence, and completeness. Each question was scored in these subdomains on a Likert scale of 1-5. The average of the two evaluators' scores was taken as the subdomain score of the question. The proportion of questions with a score $\geq$50\% of the maximum score (5) in each subdomain was calculated. Results: ChatGPT 3.5 scored 72.3\% on paper 1 and 61\% on paper 2. The mean score of the 94 students was 43\% on paper 1 and 45\% on paper 2. The responses of ChatGPT 3.5 were also rated to be satisfactorily relevant, coherent, and complete for most of the questions (>80\%). Conclusions: ChatGPT 3.5 appears to have substantial and sufficient knowledge to understand and answer the Indian medical undergraduate examination in the subject of community medicine. ChatGPT may be introduced to students to enable the self-directed learning of community medicine in pilot mode. However, faculty oversight will be required as ChatGPT is still in the initial stages of development, and thus its potential and reliability of medical content from the Indian context need to be further explored comprehensively. ", doi="10.2196/49964", url="https://formative.jmir.org/2024/1/e49964", url="http://www.ncbi.nlm.nih.gov/pubmed/38526538" } @Article{info:doi/10.2196/51151, author="Magalh{\~a}es Araujo, Sabrina and Cruz-Correia, Ricardo", title="Incorporating ChatGPT in Medical Informatics Education: Mixed Methods Study on Student Perceptions and Experiential Integration Proposals", journal="JMIR Med Educ", year="2024", month="Mar", day="20", volume="10", pages="e51151", keywords="education", keywords="medical informatics", keywords="artificial intelligence", keywords="AI", keywords="generative language model", keywords="ChatGPT", abstract="Background: The integration of artificial intelligence (AI) technologies, such as ChatGPT, in the educational landscape has the potential to enhance the learning experience of medical informatics students and prepare them for using AI in professional settings. The incorporation of AI in classes aims to develop critical thinking by encouraging students to interact with ChatGPT and critically analyze the responses generated by the chatbot. This approach also helps students develop important skills in the field of biomedical and health informatics to enhance their interaction with AI tools. Objective: The aim of the study is to explore the perceptions of students regarding the use of ChatGPT as a learning tool in their educational context and provide professors with examples of prompts for incorporating ChatGPT into their teaching and learning activities, thereby enhancing the educational experience for students in medical informatics courses. Methods: This study used a mixed methods approach to gain insights from students regarding the use of ChatGPT in education. To accomplish this, a structured questionnaire was applied to evaluate students' familiarity with ChatGPT, gauge their perceptions of its use, and understand their attitudes toward its use in academic and learning tasks. Learning outcomes of 2 courses were analyzed to propose ChatGPT's incorporation in master's programs in medicine and medical informatics. 
Results: The majority of students expressed satisfaction with the use of ChatGPT in education, finding it beneficial for various purposes, including generating academic content, brainstorming ideas, and rewriting text. While some participants raised concerns about potential biases and the need for informed use, the overall perception was positive. Additionally, the study proposed integrating ChatGPT into 2 specific courses in the master's programs in medicine and medical informatics. The incorporation of ChatGPT was envisioned to enhance student learning experiences and assist in project planning, programming code generation, examination preparation, workflow exploration, and technical interview preparation, thus advancing medical informatics education. In medical teaching, it will be used as an assistant for simplifying the explanation of concepts and solving complex problems, as well as for generating clinical narratives and patient simulators. Conclusions: The study's valuable insights into medical faculty students' perspectives and integration proposals for ChatGPT serve as an informative guide for professors aiming to enhance medical informatics education. The research delves into the potential of ChatGPT, emphasizes the necessity of collaboration in academic environments, identifies subject areas with discernible benefits, and underscores its transformative role in fostering innovative and engaging learning experiences. The envisaged proposals hold promise in empowering future health care professionals to work in the rapidly evolving era of digital health care. ", doi="10.2196/51151", url="https://mededu.jmir.org/2024/1/e51151", url="http://www.ncbi.nlm.nih.gov/pubmed/38506920" } @Article{info:doi/10.2196/54393, author="Nakao, Takahiro and Miki, Soichiro and Nakamura, Yuta and Kikuchi, Tomohiro and Nomura, Yukihiro and Hanaoka, Shouhei and Yoshikawa, Takeharu and Abe, Osamu", title="Capability of GPT-4V(ision) in the Japanese National Medical Licensing Examination: Evaluation Study", journal="JMIR Med Educ", year="2024", month="Mar", day="12", volume="10", pages="e54393", keywords="AI", keywords="artificial intelligence", keywords="LLM", keywords="large language model", keywords="language model", keywords="language models", keywords="ChatGPT", keywords="GPT-4", keywords="GPT-4V", keywords="generative pretrained transformer", keywords="image", keywords="images", keywords="imaging", keywords="response", keywords="responses", keywords="exam", keywords="examination", keywords="exams", keywords="examinations", keywords="answer", keywords="answers", keywords="NLP", keywords="natural language processing", keywords="chatbot", keywords="chatbots", keywords="conversational agent", keywords="conversational agents", keywords="medical education", abstract="Background: Previous research applying large language models (LLMs) to medicine was focused on text-based information. Recently, multimodal variants of LLMs acquired the capability of recognizing images. Objective: We aim to evaluate the image recognition capability of generative pretrained transformer (GPT)-4V, a recent multimodal LLM developed by OpenAI, in the medical field by testing how visual information affects its performance to answer questions in the 117th Japanese National Medical Licensing Examination. 
Methods: We focused on 108 questions that had 1 or more images as part of a question and presented GPT-4V with the same questions under two conditions: (1) with both the question text and associated images and (2) with the question text only. We then compared the difference in accuracy between the 2 conditions using the exact McNemar test. Results: Among the 108 questions with images, GPT-4V's accuracy was 68\% (73/108) when presented with images and 72\% (78/108) when presented without images (P=.36). For the 2 question categories, clinical and general, the accuracies with and without images were 71\% (70/98) versus 78\% (76/98; P=.21) and 30\% (3/10) versus 20\% (2/10; P>.99), respectively. Conclusions: The additional information from the images did not significantly improve the performance of GPT-4V in the Japanese National Medical Licensing Examination. ", doi="10.2196/54393", url="https://mededu.jmir.org/2024/1/e54393", url="http://www.ncbi.nlm.nih.gov/pubmed/38470459" } @Article{info:doi/10.2196/51426, author="Willms, Amanda and Liu, Sam", title="Exploring the Feasibility of Using ChatGPT to Create Just-in-Time Adaptive Physical Activity mHealth Intervention Content: Case Study", journal="JMIR Med Educ", year="2024", month="Feb", day="29", volume="10", pages="e51426", keywords="ChatGPT", keywords="digital health", keywords="mobile health", keywords="mHealth", keywords="physical activity", keywords="application", keywords="mobile app", keywords="mobile apps", keywords="content creation", keywords="behavior change", keywords="app design", abstract="Background: Achieving physical activity (PA) guidelines' recommendation of 150 minutes of moderate-to-vigorous PA per week has been shown to reduce the risk of many chronic conditions. Despite the overwhelming evidence in this field, PA levels remain low globally. By creating engaging mobile health (mHealth) interventions through strategies such as just-in-time adaptive interventions (JITAIs) that are tailored to an individual's dynamic state, there is potential to increase PA levels. However, generating personalized content can take a long time due to the various versions of content required for the personalization algorithms. ChatGPT presents an incredible opportunity to rapidly produce tailored content; however, there is a lack of studies exploring its feasibility. Objective: This study aimed to (1) explore the feasibility of using ChatGPT to create content for a PA JITAI mobile app and (2) describe lessons learned and future recommendations for using ChatGPT in the development of mHealth JITAI content. Methods: During phase 1, we used Pathverse, a no-code app builder, and ChatGPT to develop a JITAI app to help parents support their child's PA levels. The intervention was developed based on the Multi-Process Action Control (M-PAC) framework, and the necessary behavior change techniques targeting the M-PAC constructs were implemented in the app design to help parents support their child's PA. The acceptability of using ChatGPT for this purpose was discussed to determine its feasibility. In phase 2, we summarized the lessons we learned during the JITAI content development process using ChatGPT and generated recommendations to inform future similar use cases. Results: In phase 1, by using specific prompts, we efficiently generated content for 13 lessons relating to increasing parental support for their child's PA following the M-PAC framework.
It was determined that using ChatGPT for this case study to develop PA content for a JITAI was acceptable. In phase 2, we summarized our recommendations into the following six steps when using ChatGPT to create content for mHealth behavior interventions: (1) determine target behavior, (2) ground the intervention in behavior change theory, (3) design the intervention structure, (4) input intervention structure and behavior change constructs into ChatGPT, (5) revise the ChatGPT response, and (6) customize the response to be used in the intervention. Conclusions: ChatGPT offers a remarkable opportunity for rapid content creation in the context of an mHealth JITAI. Although our case study demonstrated that ChatGPT was acceptable, it is essential to approach its use, along with other language models, with caution. Before delivering content to population groups, expert review is crucial to ensure accuracy and relevancy. Future research and application of these guidelines are imperative as we deepen our understanding of ChatGPT and its interactions with human input. ", doi="10.2196/51426", url="https://mededu.jmir.org/2024/1/e51426", url="http://www.ncbi.nlm.nih.gov/pubmed/38421689" } @Article{info:doi/10.2196/48989, author="Chen, Chih-Wei and Walter, Paul and Wei, Cheng-Chung James", title="Using ChatGPT-Like Solutions to Bridge the Communication Gap Between Patients With Rheumatoid Arthritis and Health Care Professionals", journal="JMIR Med Educ", year="2024", month="Feb", day="27", volume="10", pages="e48989", keywords="rheumatoid arthritis", keywords="ChatGPT", keywords="artificial intelligence", keywords="communication gap", keywords="privacy", keywords="data management", doi="10.2196/48989", url="https://mededu.jmir.org/2024/1/e48989", url="http://www.ncbi.nlm.nih.gov/pubmed/38412022" } @Article{info:doi/10.2196/51523, author="Farhat, Faiza and Chaudhry, Moalla Beenish and Nadeem, Mohammad and Sohail, Saquib Shahab and Madsen, {\O}ivind Dag", title="Evaluating Large Language Models for the National Premedical Exam in India: Comparative Analysis of GPT-3.5, GPT-4, and Bard", journal="JMIR Med Educ", year="2024", month="Feb", day="21", volume="10", pages="e51523", keywords="accuracy", keywords="AI model", keywords="artificial intelligence", keywords="Bard", keywords="ChatGPT", keywords="educational task", keywords="GPT-4", keywords="Generative Pre-trained Transformers", keywords="large language models", keywords="medical education, medical exam", keywords="natural language processing", keywords="performance", keywords="premedical exams", keywords="suitability", abstract="Background: Large language models (LLMs) have revolutionized natural language processing with their ability to generate human-like text through extensive training on large data sets. These models, including Generative Pre-trained Transformers (GPT)-3.5 (OpenAI), GPT-4 (OpenAI), and Bard (Google LLC), find applications beyond natural language processing, attracting interest from academia and industry. Students are actively leveraging LLMs to enhance learning experiences and prepare for high-stakes exams, such as the National Eligibility cum Entrance Test (NEET) in India. Objective: This comparative analysis aims to evaluate the performance of GPT-3.5, GPT-4, and Bard in answering NEET-2023 questions. Methods: In this paper, we evaluated the performance of the 3 mainstream LLMs, namely GPT-3.5, GPT-4, and Google Bard, in answering questions related to the NEET-2023 exam. 
The questions of the NEET were provided to these artificial intelligence models, and the responses were recorded and compared against the correct answers from the official answer key. Consensus was used to evaluate the performance of all 3 models. Results: It was evident that GPT-4 passed the entrance test with flying colors (300/700, 42.9\%), showcasing exceptional performance. On the other hand, GPT-3.5 managed to meet the qualifying criteria, but with a substantially lower score (145/700, 20.7\%). However, Bard (115/700, 16.4\%) failed to meet the qualifying criteria and did not pass the test. GPT-4 demonstrated consistent superiority over Bard and GPT-3.5 in all 3 subjects. Specifically, GPT-4 achieved accuracy rates of 73\% (29/40) in physics, 44\% (16/36) in chemistry, and 51\% (50/99) in biology. Conversely, GPT-3.5 attained an accuracy rate of 45\% (18/40) in physics, 33\% (13/26) in chemistry, and 34\% (34/99) in biology. The accuracy consensus metric showed that the matching responses between GPT-4 and Bard, as well as GPT-4 and GPT-3.5, had higher incidences of being correct, at 0.56 and 0.57, respectively, compared to the matching responses between Bard and GPT-3.5, which stood at 0.42. When all 3 models were considered together, their matching responses reached the highest accuracy consensus of 0.59. Conclusions: The study's findings provide valuable insights into the performance of GPT-3.5, GPT-4, and Bard in answering NEET-2023 questions. GPT-4 emerged as the most accurate model, highlighting its potential for educational applications. Cross-checking responses across models may result in confusion as the compared models (as duos or a trio) tend to agree on only a little over half of the correct responses. Using GPT-4 as one of the compared models will result in higher accuracy consensus. The results underscore the suitability of LLMs for high-stakes exams and their positive impact on education. Additionally, the study establishes a benchmark for evaluating and enhancing LLMs' performance in educational tasks, promoting responsible and informed use of these models in diverse learning environments. ", doi="10.2196/51523", url="https://mededu.jmir.org/2024/1/e51523", url="http://www.ncbi.nlm.nih.gov/pubmed/38381486" } @Article{info:doi/10.2196/51391, author="Abdullahi, Tassallah and Singh, Ritambhara and Eickhoff, Carsten", title="Learning to Make Rare and Complex Diagnoses With Generative AI Assistance: Qualitative Study of Popular Large Language Models", journal="JMIR Med Educ", year="2024", month="Feb", day="13", volume="10", pages="e51391", keywords="clinical decision support", keywords="rare diseases", keywords="complex diseases", keywords="prompt engineering", keywords="reliability", keywords="consistency", keywords="natural language processing", keywords="language model", keywords="Bard", keywords="ChatGPT 3.5", keywords="GPT-4", keywords="MedAlpaca", keywords="medical education", keywords="complex diagnosis", keywords="artificial intelligence", keywords="AI assistance", keywords="medical training", keywords="prediction model", abstract="Background: Patients with rare and complex diseases often experience delayed diagnoses and misdiagnoses because comprehensive knowledge about these diseases is limited to only a few medical experts. In this context, large language models (LLMs) have emerged as powerful knowledge aggregation tools with applications in clinical decision support and education domains. 
Objective: This study aims to explore the potential of 3 popular LLMs, namely Bard (Google LLC), ChatGPT-3.5 (OpenAI), and GPT-4 (OpenAI), in medical education to enhance the diagnosis of rare and complex diseases while investigating the impact of prompt engineering on their performance. Methods: We conducted experiments on publicly available complex and rare cases to achieve these objectives. We implemented various prompt strategies to evaluate the performance of these models using both open-ended and multiple-choice prompts. In addition, we used a majority voting strategy to leverage diverse reasoning paths within language models, aiming to enhance their reliability. Furthermore, we compared their performance with the performance of human respondents and MedAlpaca, a generative LLM specifically designed for medical tasks. Results: Notably, all LLMs outperformed the average human consensus and MedAlpaca, with a minimum margin of 5\% and 13\%, respectively, across all 30 cases from the diagnostic case challenge collection. On the frequently misdiagnosed cases category, Bard tied with MedAlpaca but surpassed the human average consensus by 14\%, whereas GPT-4 and ChatGPT-3.5 outperformed MedAlpaca and the human respondents on the moderately often misdiagnosed cases category with minimum accuracy scores of 28\% and 11\%, respectively. The majority voting strategy, particularly with GPT-4, demonstrated the highest overall score across all cases from the diagnostic complex case collection, surpassing that of other LLMs. On the Medical Information Mart for Intensive Care-III data sets, Bard and GPT-4 achieved the highest diagnostic accuracy scores, with multiple-choice prompts scoring 93\%, whereas ChatGPT-3.5 and MedAlpaca scored 73\% and 47\%, respectively. Furthermore, our results demonstrate that there is no one-size-fits-all prompting approach for improving the performance of LLMs and that a single strategy does not universally apply to all LLMs. Conclusions: Our findings shed light on the diagnostic capabilities of LLMs and the challenges associated with identifying an optimal prompting strategy that aligns with each language model's characteristics and specific task requirements. The significance of prompt engineering is highlighted, providing valuable insights for researchers and practitioners who use these language models for medical training. Furthermore, this study represents a crucial step toward understanding how LLMs can enhance diagnostic reasoning in rare and complex medical cases, paving the way for developing effective educational tools and accurate diagnostic aids to improve patient care and outcomes. ", doi="10.2196/51391", url="https://mededu.jmir.org/2024/1/e51391", url="http://www.ncbi.nlm.nih.gov/pubmed/38349725" } @Article{info:doi/10.2196/48949, author="Giunti, Guido and Doherty, P. 
Colin", title="Cocreating an Automated mHealth Apps Systematic Review Process With Generative AI: Design Science Research Approach", journal="JMIR Med Educ", year="2024", month="Feb", day="12", volume="10", pages="e48949", keywords="generative artificial intelligence", keywords="mHealth", keywords="ChatGPT", keywords="evidence-base", keywords="apps", keywords="qualitative study", keywords="design science research", keywords="eHealth", keywords="mobile device", keywords="AI", keywords="language model", keywords="mHealth intervention", keywords="generative AI", keywords="AI tool", keywords="software code", keywords="systematic review", abstract="Background: The use of mobile devices for delivering health-related services (mobile health [mHealth]) has rapidly increased, leading to a demand for summarizing the state of the art and practice through systematic reviews. However, the systematic review process is a resource-intensive and time-consuming process. Generative artificial intelligence (AI) has emerged as a potential solution to automate tedious tasks. Objective: This study aimed to explore the feasibility of using generative AI tools to automate time-consuming and resource-intensive tasks in a systematic review process and assess the scope and limitations of using such tools. Methods: We used the design science research methodology. The solution proposed is to use cocreation with a generative AI, such as ChatGPT, to produce software code that automates the process of conducting systematic reviews. Results: A triggering prompt was generated, and assistance from the generative AI was used to guide the steps toward developing, executing, and debugging a Python script. Errors in code were solved through conversational exchange with ChatGPT, and a tentative script was created. The code pulled the mHealth solutions from the Google Play Store and searched their descriptions for keywords that hinted toward evidence base. The results were exported to a CSV file, which was compared to the initial outputs of other similar systematic review processes. Conclusions: This study demonstrates the potential of using generative AI to automate the time-consuming process of conducting systematic reviews of mHealth apps. This approach could be particularly useful for researchers with limited coding skills. However, the study has limitations related to the design science research methodology, subjectivity bias, and the quality of the search results used to train the language model. 
", doi="10.2196/48949", url="https://mededu.jmir.org/2024/1/e48949", url="http://www.ncbi.nlm.nih.gov/pubmed/38345839" } @Article{info:doi/10.2196/48514, author="Yu, Peng and Fang, Changchang and Liu, Xiaolin and Fu, Wanying and Ling, Jitao and Yan, Zhiwei and Jiang, Yuan and Cao, Zhengyu and Wu, Maoxiong and Chen, Zhiteng and Zhu, Wengen and Zhang, Yuling and Abudukeremu, Ayiguli and Wang, Yue and Liu, Xiao and Wang, Jingfeng", title="Performance of ChatGPT on the Chinese Postgraduate Examination for Clinical Medicine: Survey Study", journal="JMIR Med Educ", year="2024", month="Feb", day="9", volume="10", pages="e48514", keywords="ChatGPT", keywords="Chinese Postgraduate Examination for Clinical Medicine", keywords="medical student", keywords="performance", keywords="artificial intelligence", keywords="medical care", keywords="qualitative feedback", keywords="medical education", keywords="clinical decision-making", abstract="Background: ChatGPT, an artificial intelligence (AI) based on large-scale language models, has sparked interest in the field of health care. Nonetheless, the capabilities of AI in text comprehension and generation are constrained by the quality and volume of available training data for a specific language, and the performance of AI across different languages requires further investigation. While AI harbors substantial potential in medicine, it is imperative to tackle challenges such as the formulation of clinical care standards; facilitating cultural transitions in medical education and practice; and managing ethical issues including data privacy, consent, and bias. Objective: The study aimed to evaluate ChatGPT's performance in processing Chinese Postgraduate Examination for Clinical Medicine questions, assess its clinical reasoning ability, investigate potential limitations with the Chinese language, and explore its potential as a valuable tool for medical professionals in the Chinese context. Methods: A data set of Chinese Postgraduate Examination for Clinical Medicine questions was used to assess the effectiveness of ChatGPT's (version 3.5) medical knowledge in the Chinese language, which has a data set of 165 medical questions that were divided into three categories: (1) common questions (n=90) assessing basic medical knowledge, (2) case analysis questions (n=45) focusing on clinical decision-making through patient case evaluations, and (3) multichoice questions (n=30) requiring the selection of multiple correct answers. First of all, we assessed whether ChatGPT could meet the stringent cutoff score defined by the government agency, which requires a performance within the top 20\% of candidates. Additionally, in our evaluation of ChatGPT's performance on both original and encoded medical questions, 3 primary indicators were used: accuracy, concordance (which validates the answer), and the frequency of insights. Results: Our evaluation revealed that ChatGPT scored 153.5 out of 300 for original questions in Chinese, which signifies the minimum score set to ensure that at least 20\% more candidates pass than the enrollment quota. However, ChatGPT had low accuracy in answering open-ended medical questions, with only 31.5\% total accuracy. The accuracy for common questions, multichoice questions, and case analysis questions was 42\%, 37\%, and 17\%, respectively. ChatGPT achieved a 90\% concordance across all questions. Among correct responses, the concordance was 100\%, significantly exceeding that of incorrect responses (n=57, 50\%; P<.001). 
ChatGPT provided innovative insights for 80\% (n=132) of all questions, with an average of 2.95 insights per accurate response. Conclusions: Although ChatGPT surpassed the passing threshold for the Chinese Postgraduate Examination for Clinical Medicine, its performance in answering open-ended medical questions was suboptimal. Nonetheless, ChatGPT exhibited high internal concordance and the ability to generate multiple insights in the Chinese language. Future research should investigate the language-based discrepancies in ChatGPT's performance within the health care context. ", doi="10.2196/48514", url="https://mededu.jmir.org/2024/1/e48514", url="http://www.ncbi.nlm.nih.gov/pubmed/38335017" } @Article{info:doi/10.2196/50965, author="Meyer, Annika and Riese, Janik and Streichert, Thomas", title="Comparison of the Performance of GPT-3.5 and GPT-4 With That of Medical Students on the Written German Medical Licensing Examination: Observational Study", journal="JMIR Med Educ", year="2024", month="Feb", day="8", volume="10", pages="e50965", keywords="ChatGPT", keywords="artificial intelligence", keywords="large language model", keywords="medical exams", keywords="medical examinations", keywords="medical education", keywords="LLM", keywords="public trust", keywords="trust", keywords="medical accuracy", keywords="licensing exam", keywords="licensing examination", keywords="improvement", keywords="patient care", keywords="general population", keywords="licensure examination", abstract="Background: The potential of artificial intelligence (AI)--based large language models, such as ChatGPT, has gained significant attention in the medical field. This enthusiasm is driven not only by recent breakthroughs and improved accessibility, but also by the prospect of democratizing medical knowledge and promoting equitable health care. However, the performance of ChatGPT is substantially influenced by the input language, and given the growing public trust in this AI tool compared to that in traditional sources of information, investigating its medical accuracy across different languages is of particular importance. Objective: This study aimed to compare the performance of GPT-3.5 and GPT-4 with that of medical students on the written German medical licensing examination. Methods: To assess GPT-3.5's and GPT-4's medical proficiency, we used 937 original multiple-choice questions from 3 written German medical licensing examinations in October 2021, April 2022, and October 2022. Results: GPT-4 achieved an average score of 85\% and ranked in the 92.8th, 99.5th, and 92.6th percentiles among medical students who took the same examinations in October 2021, April 2022, and October 2022, respectively. This represents a substantial improvement of 27\% compared to GPT-3.5, which only passed 1 out of the 3 examinations. While GPT-3.5 performed well in psychiatry questions, GPT-4 exhibited strengths in internal medicine and surgery but showed weakness in academic research. Conclusions: The study results highlight ChatGPT's remarkable improvement from moderate (GPT-3.5) to high competency (GPT-4) in answering medical licensing examination questions in German. While GPT-4's predecessor (GPT-3.5) was imprecise and inconsistent, it demonstrates considerable potential to improve medical education and patient care, provided that medically trained users critically evaluate its results. 
As the replacement of search engines by AI tools seems possible in the future, further studies with nonprofessional questions are needed to assess the safety and accuracy of ChatGPT for the general population. ", doi="10.2196/50965", url="https://mededu.jmir.org/2024/1/e50965", url="http://www.ncbi.nlm.nih.gov/pubmed/38329802" } @Article{info:doi/10.2196/50705, author="Gray, Megan and Baird, Austin and Sawyer, Taylor and James, Jasmine and DeBroux, Thea and Bartlett, Michelle and Krick, Jeanne and Umoren, Rachel", title="Increasing Realism and Variety of Virtual Patient Dialogues for Prenatal Counseling Education Through a Novel Application of ChatGPT: Exploratory Observational Study", journal="JMIR Med Educ", year="2024", month="Feb", day="1", volume="10", pages="e50705", keywords="prenatal counseling", keywords="virtual health", keywords="virtual patient", keywords="simulation", keywords="neonatology", keywords="ChatGPT", keywords="AI", keywords="artificial intelligence", abstract="Background: Using virtual patients, facilitated by natural language processing, provides a valuable educational experience for learners. Generating a large, varied sample of realistic and appropriate responses for virtual patients is challenging. Artificial intelligence (AI) programs can be a viable source for these responses, but their utility for this purpose has not been explored. Objective: In this study, we explored the effectiveness of generative AI (ChatGPT) in developing realistic virtual standardized patient dialogues to teach prenatal counseling skills. Methods: ChatGPT was prompted to generate a list of common areas of concern and questions that families expecting preterm delivery at 24 weeks gestation might ask during prenatal counseling. ChatGPT was then prompted to generate 2 role-plays with dialogues between a parent expecting a potential preterm delivery at 24 weeks and their counseling physician using each of the example questions. The prompt was repeated for 2 unique role-plays: one parent was characterized as anxious and the other as having low trust in the medical system. Role-play scripts were exported verbatim and independently reviewed by 2 neonatologists with experience in prenatal counseling, using a scale of 1-5 on realism, appropriateness, and utility for virtual standardized patient responses. Results: ChatGPT generated 7 areas of concern, with 35 example questions used to generate role-plays. The 35 role-play transcripts generated 176 unique parent responses (median 5, IQR 4-6, per role-play) with 268 unique sentences. Expert review identified 117 (65\%) of the 176 responses as indicating an emotion, either directly or indirectly. Approximately half (98/176, 56\%) of the responses had 2 or more sentences, and half (88/176, 50\%) included at least 1 question. More than half (104/176, 58\%) of the responses from role-played parent characters described a feeling, such as being scared, worried, or concerned. The role-plays of parents with low trust in the medical system generated many unique sentences (n=50). Most of the sentences in the responses were found to be reasonably realistic (214/268, 80\%), appropriate for variable prenatal counseling conversation paths (233/268, 87\%), and usable without more than a minimal modification in a virtual patient program (169/268, 63\%). 
Conclusions: Generative AI programs, such as ChatGPT, may provide a viable source of training materials to expand virtual patient programs, with careful attention to the concerns and questions of patients and families. Given the potential for unrealistic or inappropriate statements and questions, an expert should review AI chat outputs before deploying them in an educational program. ", doi="10.2196/50705", url="https://mededu.jmir.org/2024/1/e50705", url="http://www.ncbi.nlm.nih.gov/pubmed/38300696" } @Article{info:doi/10.2196/51344, author="Kavadella, Argyro and Dias da Silva, Antonio Marco and Kaklamanos, G. Eleftherios and Stamatopoulos, Vasileios and Giannakopoulos, Kostis", title="Evaluation of ChatGPT's Real-Life Implementation in Undergraduate Dental Education: Mixed Methods Study", journal="JMIR Med Educ", year="2024", month="Jan", day="31", volume="10", pages="e51344", keywords="ChatGPT", keywords="large language models", keywords="LLM", keywords="natural language processing", keywords="artificial Intelligence", keywords="dental education", keywords="higher education", keywords="learning assignments", keywords="dental students", keywords="AI pedagogy", keywords="dentistry", keywords="university", abstract="Background: The recent artificial intelligence tool ChatGPT seems to offer a range of benefits in academic education while also raising concerns. Relevant literature encompasses issues of plagiarism and academic dishonesty, as well as pedagogy and educational affordances; yet, no real-life implementation of ChatGPT in the educational process has been reported to our knowledge so far. Objective: This mixed methods study aimed to evaluate the implementation of ChatGPT in the educational process, both quantitatively and qualitatively. Methods: In March 2023, a total of 77 second-year dental students of the European University Cyprus were divided into 2 groups and asked to compose a learning assignment on ``Radiation Biology and Radiation Protection in the Dental Office,'' working collaboratively in small subgroups, as part of the educational semester program of the Dentomaxillofacial Radiology module. Careful planning ensured a seamless integration of ChatGPT, addressing potential challenges. One group searched the internet for scientific resources to perform the task and the other group used ChatGPT for this purpose. Both groups developed a PowerPoint (Microsoft Corp) presentation based on their research and presented it in class. The ChatGPT group students additionally registered all interactions with the language model during the prompting process and evaluated the final outcome; they also answered an open-ended evaluation questionnaire, including questions on their learning experience. Finally, all students undertook a knowledge examination on the topic, and the grades between the 2 groups were compared statistically, whereas the free-text comments of the questionnaires were thematically analyzed. Results: Out of the 77 students, 39 were assigned to the ChatGPT group and 38 to the literature research group. Seventy students undertook the multiple choice question knowledge examination, and examination grades ranged from 5 to 10 on the 0-10 grading scale. The Mann-Whitney U test showed that students of the ChatGPT group performed significantly better (P=.045) than students of the literature research group. 
The evaluation questionnaires revealed the benefits (human-like interface, immediate response, and wide knowledge base), the limitations (need for rephrasing the prompts to get a relevant answer, general content, false citations, and incapability to provide images or videos), and the prospects (in education, clinical practice, continuing education, and research) of ChatGPT. Conclusions: Students using ChatGPT for their learning assignments performed significantly better in the knowledge examination than their fellow students who used the literature research methodology. Students adapted quickly to the technological environment of the language model, recognized its opportunities and limitations, and used it creatively and efficiently. Implications for practice: the study underscores the adaptability of students to technological innovations including ChatGPT and its potential to enhance educational outcomes. Educators should consider integrating ChatGPT into curriculum design; awareness programs are warranted to educate both students and educators about the limitations of ChatGPT, encouraging critical engagement and responsible use. ", doi="10.2196/51344", url="https://mededu.jmir.org/2024/1/e51344", url="http://www.ncbi.nlm.nih.gov/pubmed/38111256" } @Article{info:doi/10.2196/50842, author="Haddad, Firas and Saade, S. Joanna", title="Performance of ChatGPT on Ophthalmology-Related Questions Across Various Examination Levels: Observational Study", journal="JMIR Med Educ", year="2024", month="Jan", day="18", volume="10", pages="e50842", keywords="ChatGPT", keywords="artificial intelligence", keywords="AI", keywords="board examinations", keywords="ophthalmology", keywords="testing", abstract="Background: ChatGPT and language learning models have gained attention recently for their ability to answer questions on various examinations across various disciplines. The question of whether ChatGPT could be used to aid in medical education is yet to be answered, particularly in the field of ophthalmology. Objective: The aim of this study is to assess the ability of ChatGPT-3.5 (GPT-3.5) and ChatGPT-4.0 (GPT-4.0) to answer ophthalmology-related questions across different levels of ophthalmology training. Methods: Questions from the United States Medical Licensing Examination (USMLE) steps 1 (n=44), 2 (n=60), and 3 (n=28) were extracted from AMBOSS, and 248 questions (64 easy, 122 medium, and 62 difficult questions) were extracted from the book, Ophthalmology Board Review Q\&A, for the Ophthalmic Knowledge Assessment Program and the Board of Ophthalmology (OB) Written Qualifying Examination (WQE). Questions were prompted identically and inputted to GPT-3.5 and GPT-4.0. Results: GPT-3.5 achieved a total of 55\% (n=210) of correct answers, while GPT-4.0 achieved a total of 70\% (n=270) of correct answers. GPT-3.5 answered 75\% (n=33) of questions correctly in USMLE step 1, 73.33\% (n=44) in USMLE step 2, 60.71\% (n=17) in USMLE step 3, and 46.77\% (n=116) in the OB-WQE. GPT-4.0 answered 70.45\% (n=31) of questions correctly in USMLE step 1, 90.32\% (n=56) in USMLE step 2, 96.43\% (n=27) in USMLE step 3, and 62.90\% (n=156) in the OB-WQE. GPT-3.5 performed poorer as examination levels advanced (P<.001), while GPT-4.0 performed better on USMLE steps 2 and 3 and worse on USMLE step 1 and the OB-WQE (P<.001). The coefficient of correlation (r) between ChatGPT answering correctly and human users answering correctly was 0.21 (P=.01) for GPT-3.5 as compared to --0.31 (P<.001) for GPT-4.0. 
GPT-3.5 performed similarly across difficulty levels, while GPT-4.0 performed more poorly with an increase in the difficulty level. Both GPT models performed significantly better on certain topics than on others. Conclusions: ChatGPT is far from being considered a part of mainstream medical education. Future models with higher accuracy are needed for the platform to be effective in medical education. ", doi="10.2196/50842", url="https://mededu.jmir.org/2024/1/e50842", url="http://www.ncbi.nlm.nih.gov/pubmed/38236632" } @Article{info:doi/10.2196/50174, author="Nguyen, Tina", title="ChatGPT in Medical Education: A Precursor for Automation Bias?", journal="JMIR Med Educ", year="2024", month="Jan", day="17", volume="10", pages="e50174", keywords="ChatGPT", keywords="artificial intelligence", keywords="AI", keywords="medical students", keywords="residents", keywords="medical school curriculum", keywords="medical education", keywords="automation bias", keywords="large language models", keywords="LLMs", keywords="bias", doi="10.2196/50174", url="https://mededu.jmir.org/2024/1/e50174", url="http://www.ncbi.nlm.nih.gov/pubmed/38231545" } @Article{info:doi/10.2196/53961, author="Holderried, Friederike and Stegemann--Philipps, Christian and Herschbach, Lea and Moldt, Julia-Astrid and Nevins, Andrew and Griewatz, Jan and Holderried, Martin and Herrmann-Werner, Anne and Festl-Wietek, Teresa and Mahling, Moritz", title="A Generative Pretrained Transformer (GPT)--Powered Chatbot as a Simulated Patient to Practice History Taking: Prospective, Mixed Methods Study", journal="JMIR Med Educ", year="2024", month="Jan", day="16", volume="10", pages="e53961", keywords="simulated patient", keywords="GPT", keywords="generative pretrained transformer", keywords="ChatGPT", keywords="history taking", keywords="medical education", keywords="documentation", keywords="history", keywords="simulated", keywords="simulation", keywords="simulations", keywords="NLP", keywords="natural language processing", keywords="artificial intelligence", keywords="interactive", keywords="chatbot", keywords="chatbots", keywords="conversational agent", keywords="conversational agents", keywords="answer", keywords="answers", keywords="response", keywords="responses", keywords="human computer", keywords="human machine", keywords="usability", keywords="satisfaction", abstract="Background: Communication is a core competency of medical professionals and of utmost importance for patient safety. Although medical curricula emphasize communication training, traditional formats, such as real or simulated patient interactions, can present psychological stress and are limited in repetition. The recent emergence of large language models (LLMs), such as generative pretrained transformer (GPT), offers an opportunity to overcome these restrictions Objective: The aim of this study was to explore the feasibility of a GPT-driven chatbot to practice history taking, one of the core competencies of communication. Methods: We developed an interactive chatbot interface using GPT-3.5 and a specific prompt including a chatbot-optimized illness script and a behavioral component. Following a mixed methods approach, we invited medical students to voluntarily practice history taking. To determine whether GPT provides suitable answers as a simulated patient, the conversations were recorded and analyzed using quantitative and qualitative approaches. 
We analyzed the extent to which the questions and answers aligned with the provided script, as well as the medical plausibility of the answers. Finally, the students filled out the Chatbot Usability Questionnaire (CUQ). Results: A total of 28 students practiced with our chatbot (mean age 23.4, SD 2.9 years). We recorded a total of 826 question-answer pairs (QAPs), with a median of 27.5 QAPs per conversation and 94.7\% (n=782) pertaining to history taking. When questions were explicitly covered by the script (n=502, 60.3\%), the GPT-provided answers were mostly based on explicit script information (n=471, 94.4\%). For questions not covered by the script (n=195, 23.4\%), the GPT answers contained fictitious information in 56.4\% (n=110) of cases. Regarding plausibility, 842 (97.9\%) of 860 QAPs were rated as plausible. The 14 (2.1\%) implausible answers fell into the categories of being socially desirable, leaving the role identity, ignoring script information, illogical reasoning, and calculation error. Despite these results, the CUQ revealed an overall positive user experience (77/100 points). Conclusions: Our data showed that LLMs, such as GPT, can provide a simulated patient experience and yield a good user experience and a majority of plausible answers. Our analysis revealed that GPT-provided answers use either explicit script information or are based on available information, which can be understood as abductive reasoning. Although rare, the GPT-based chatbot provides implausible information in some instances, with the major tendency being socially desirable instead of medically plausible information. ", doi="10.2196/53961", url="https://mededu.jmir.org/2024/1/e53961", url="http://www.ncbi.nlm.nih.gov/pubmed/38227363" } @Article{info:doi/10.2196/51388, author="Kuo, I-Hsien Nicholas and Perez-Concha, Oscar and Hanly, Mark and Mnatzaganian, Emmanuel and Hao, Brandon and Di Sipio, Marcus and Yu, Guolin and Vanjara, Jash and Valerie, Cerelia Ivy and de Oliveira Costa, Juliana and Churches, Timothy and Lujic, Sanja and Hegarty, Jo and Jorm, Louisa and Barbieri, Sebastiano", title="Enriching Data Science and Health Care Education: Application and Impact of Synthetic Data Sets Through the Health Gym Project", journal="JMIR Med Educ", year="2024", month="Jan", day="16", volume="10", pages="e51388", keywords="medical education", keywords="generative model", keywords="generative adversarial networks", keywords="privacy", keywords="antiretroviral therapy (ART)", keywords="human immunodeficiency virus (HIV)", keywords="data science", keywords="educational purposes", keywords="accessibility", keywords="data privacy", keywords="data sets", keywords="sepsis", keywords="hypotension", keywords="HIV", keywords="science education", keywords="health care AI", doi="10.2196/51388", url="https://mededu.jmir.org/2024/1/e51388", url="http://www.ncbi.nlm.nih.gov/pubmed/38227356" } @Article{info:doi/10.2196/49970, author="Long, Cai and Lowe, Kayle and Zhang, Jessica and Santos, dos Andr{\'e} and Alanazi, Alaa and O'Brien, Daniel and Wright, D.
Erin and Cote, David", title="A Novel Evaluation Model for Assessing ChatGPT on Otolaryngology--Head and Neck Surgery Certification Examinations: Performance Study", journal="JMIR Med Educ", year="2024", month="Jan", day="16", volume="10", pages="e49970", keywords="medical licensing", keywords="otolaryngology", keywords="otology", keywords="laryngology", keywords="ear", keywords="nose", keywords="throat", keywords="ENT", keywords="surgery", keywords="surgical", keywords="exam", keywords="exams", keywords="response", keywords="responses", keywords="answer", keywords="answers", keywords="chatbot", keywords="chatbots", keywords="examination", keywords="examinations", keywords="medical education", keywords="otolaryngology/head and neck surgery", keywords="OHNS", keywords="artificial intelligence", keywords="AI", keywords="ChatGPT", keywords="medical examination", keywords="large language models", keywords="language model", keywords="LLM", keywords="LLMs", keywords="wide range information", keywords="patient safety", keywords="clinical implementation", keywords="safety", keywords="machine learning", keywords="NLP", keywords="natural language processing", abstract="Background: ChatGPT is among the most popular large language models (LLMs), exhibiting proficiency in various standardized tests, including multiple-choice medical board examinations. However, its performance on otolaryngology--head and neck surgery (OHNS) certification examinations and open-ended medical board certification examinations has not been reported. Objective: We aimed to evaluate the performance of ChatGPT on OHNS board examinations and propose a novel method to assess an AI model's performance on open-ended medical board examination questions. Methods: Twenty-one open-ended questions were adopted from the Royal College of Physicians and Surgeons of Canada's sample examination to query ChatGPT on April 11, 2023, with and without prompts. A new model, named Concordance, Validity, Safety, Competency (CVSC), was developed to evaluate its performance. Results: In an open-ended question assessment, ChatGPT achieved a passing mark (an average of 75\% across 3 trials) in the attempts and demonstrated higher accuracy with prompts. The model demonstrated high concordance (92.06\%) and satisfactory validity. While demonstrating considerable consistency in regenerating answers, it often provided only partially correct responses. Notably, concerning features such as hallucinations and self-conflicting answers were observed. Conclusions: ChatGPT achieved a passing score in the sample examination and demonstrated the potential to pass the OHNS certification examination of the Royal College of Physicians and Surgeons of Canada. Some concerns remain due to its hallucinations, which could pose risks to patient safety. Further adjustments are necessary to yield safer and more accurate answers for clinical implementation. 
", doi="10.2196/49970", url="https://mededu.jmir.org/2024/1/e49970", url="http://www.ncbi.nlm.nih.gov/pubmed/38227351" } @Article{info:doi/10.2196/47339, author="Al-Worafi, Mohammed Yaser and Goh, Wen Khang and Hermansyah, Andi and Tan, Siang Ching and Ming, Chiau Long", title="The Use of ChatGPT for Education Modules on Integrated Pharmacotherapy of Infectious Disease: Educators' Perspectives", journal="JMIR Med Educ", year="2024", month="Jan", day="12", volume="10", pages="e47339", keywords="innovation and technology", keywords="quality education", keywords="sustainable communities", keywords="innovation and infrastructure", keywords="partnerships for the goals", keywords="sustainable education", keywords="social justice", keywords="ChatGPT", keywords="artificial intelligence", keywords="feasibility", abstract="Background: Artificial Intelligence (AI) plays an important role in many fields, including medical education, practice, and research. Many medical educators started using ChatGPT at the end of 2022 for many purposes. Objective: The aim of this study was to explore the potential uses, benefits, and risks of using ChatGPT in education modules on integrated pharmacotherapy of infectious disease. Methods: A content analysis was conducted to investigate the applications of ChatGPT in education modules on integrated pharmacotherapy of infectious disease. Questions pertaining to curriculum development, syllabus design, lecture note preparation, and examination construction were posed during data collection. Three experienced professors rated the appropriateness and precision of the answers provided by ChatGPT. The consensus rating was considered. The professors also discussed the prospective applications, benefits, and risks of ChatGPT in this educational setting. Results: ChatGPT demonstrated the ability to contribute to various aspects of curriculum design, with ratings ranging from 50\% to 92\% for appropriateness and accuracy. However, there were limitations and risks associated with its use, including incomplete syllabi, the absence of essential learning objectives, and the inability to design valid questionnaires and qualitative studies. It was suggested that educators use ChatGPT as a resource rather than relying primarily on its output. There are recommendations for effectively incorporating ChatGPT into the curriculum of the education modules on integrated pharmacotherapy of infectious disease. Conclusions: Medical and health sciences educators can use ChatGPT as a guide in many aspects related to the development of the curriculum of the education modules on integrated pharmacotherapy of infectious disease, syllabus design, lecture notes preparation, and examination preparation with caution. ", doi="10.2196/47339", url="https://mededu.jmir.org/2024/1/e47339", url="http://www.ncbi.nlm.nih.gov/pubmed/38214967" } @Article{info:doi/10.2196/51308, author="Zaleski, L. Amanda and Berkowsky, Rachel and Craig, Thomas Kelly Jean and Pescatello, S. Linda", title="Comprehensiveness, Accuracy, and Readability of Exercise Recommendations Provided by an AI-Based Chatbot: Mixed Methods Study", journal="JMIR Med Educ", year="2024", month="Jan", day="11", volume="10", pages="e51308", keywords="exercise prescription", keywords="health literacy", keywords="large language model", keywords="patient education", keywords="artificial intelligence", keywords="AI", keywords="chatbot", abstract="Background: Regular physical activity is critical for health and disease prevention. 
Yet, health care providers and patients face barriers to implement evidence-based lifestyle recommendations. The potential to augment care with the increased availability of artificial intelligence (AI) technologies is limitless; however, the suitability of AI-generated exercise recommendations has yet to be explored. Objective: The purpose of this study was to assess the comprehensiveness, accuracy, and readability of individualized exercise recommendations generated by a novel AI chatbot. Methods: A coding scheme was developed to score AI-generated exercise recommendations across ten categories informed by gold-standard exercise recommendations, including (1) health condition--specific benefits of exercise, (2) exercise preparticipation health screening, (3) frequency, (4) intensity, (5) time, (6) type, (7) volume, (8) progression, (9) special considerations, and (10) references to the primary literature. The AI chatbot was prompted to provide individualized exercise recommendations for 26 clinical populations using an open-source application programming interface. Two independent reviewers coded AI-generated content for each category and calculated comprehensiveness (\%) and factual accuracy (\%) on a scale of 0\%-100\%. Readability was assessed using the Flesch-Kincaid formula. Qualitative analysis identified and categorized themes from AI-generated output. Results: AI-generated exercise recommendations were 41.2\% (107/260) comprehensive and 90.7\% (146/161) accurate, with the majority (8/15, 53\%) of inaccuracy related to the need for exercise preparticipation medical clearance. Average readability level of AI-generated exercise recommendations was at the college level (mean 13.7, SD 1.7), with an average Flesch reading ease score of 31.1 (SD 7.7). Several recurring themes and observations of AI-generated output included concern for liability and safety, preference for aerobic exercise, and potential bias and direct discrimination against certain age-based populations and individuals with disabilities. Conclusions: There were notable gaps in the comprehensiveness, accuracy, and readability of AI-generated exercise recommendations. Exercise and health care professionals should be aware of these limitations when using and endorsing AI-based technologies as a tool to support lifestyle change involving exercise. ", doi="10.2196/51308", url="https://mededu.jmir.org/2024/1/e51308", url="http://www.ncbi.nlm.nih.gov/pubmed/38206661" } @Article{info:doi/10.2196/51247, author="Weidener, Lukas and Fischer, Michael", title="Artificial Intelligence in Medicine: Cross-Sectional Study Among Medical Students on Application, Education, and Ethical Aspects", journal="JMIR Med Educ", year="2024", month="Jan", day="5", volume="10", pages="e51247", keywords="artificial intelligence", keywords="AI technology", keywords="medicine", keywords="medical education", keywords="medical curriculum", keywords="medical school", keywords="AI ethics", keywords="ethics", abstract="Background: The use of artificial intelligence (AI) in medicine not only directly impacts the medical profession but is also increasingly associated with various potential ethical aspects. In addition, the expanding use of AI and AI-based applications such as ChatGPT demands a corresponding shift in medical education to adequately prepare future practitioners for the effective use of these tools and address the associated ethical challenges they present. 
Objective: This study aims to explore how medical students from Germany, Austria, and Switzerland perceive the use of AI in medicine and the teaching of AI and AI ethics in medical education in accordance with their use of AI-based chat applications, such as ChatGPT. Methods: This cross-sectional study, conducted from June 15 to July 15, 2023, surveyed medical students across Germany, Austria, and Switzerland using a web-based survey. This study aimed to assess students' perceptions of AI in medicine and the integration of AI and AI ethics into medical education. The survey, which included 53 items across 6 sections, was developed and pretested. Data analysis used descriptive statistics (median, mode, IQR, total number, and percentages) and either the chi-square or Mann-Whitney U tests, as appropriate. Results: Surveying 487 medical students across Germany, Austria, and Switzerland revealed limited formal education on AI or AI ethics within medical curricula, although 38.8\% (189/487) had prior experience with AI-based chat applications, such as ChatGPT. Despite varied prior exposures, 71.7\% (349/487) anticipated a positive impact of AI on medicine. There was widespread consensus (385/487, 74.9\%) on the need for AI and AI ethics instruction in medical education, although the current offerings were deemed inadequate. Regarding the AI ethics education content, all proposed topics were rated as highly relevant. Conclusions: This study revealed a pronounced discrepancy between the use of AI-based (chat) applications, such as ChatGPT, among medical students in Germany, Austria, and Switzerland and the teaching of AI in medical education. To adequately prepare future medical professionals, there is an urgent need to integrate the teaching of AI and AI ethics into the medical curricula. ", doi="10.2196/51247", url="https://mededu.jmir.org/2024/1/e51247", url="http://www.ncbi.nlm.nih.gov/pubmed/38180787" } @Article{info:doi/10.2196/51148, author="Knoedler, Leonard and Alfertshofer, Michael and Knoedler, Samuel and Hoch, C. Cosima and Funk, F. Paul and Cotofana, Sebastian and Maheta, Bhagvat and Frank, Konstantin and Br{\'e}bant, Vanessa and Prantl, Lukas and Lamby, Philipp", title="Pure Wisdom or Potemkin Villages? A Comparison of ChatGPT 3.5 and ChatGPT 4 on USMLE Step 3 Style Questions: Quantitative Analysis", journal="JMIR Med Educ", year="2024", month="Jan", day="5", volume="10", pages="e51148", keywords="ChatGPT", keywords="United States Medical Licensing Examination", keywords="artificial intelligence", keywords="USMLE", keywords="USMLE Step 1", keywords="OpenAI", keywords="medical education", keywords="clinical decision-making", abstract="Background: The United States Medical Licensing Examination (USMLE) has been critical in medical education since 1992, testing various aspects of a medical student's knowledge and skills through different steps, based on their training level. Artificial intelligence (AI) tools, including chatbots like ChatGPT, are emerging technologies with potential applications in medicine. However, comprehensive studies analyzing ChatGPT's performance on USMLE Step 3 in large-scale scenarios and comparing different versions of ChatGPT are limited. Objective: This paper aimed to analyze ChatGPT's performance on USMLE Step 3 practice test questions to better elucidate the strengths and weaknesses of AI use in medical education and deduce evidence-based strategies to counteract AI cheating. 
Methods: A total of 2069 USMLE Step 3 practice questions were extracted from the AMBOSS study platform. After excluding 229 image-based questions, a total of 1840 text-based questions were further categorized and entered into ChatGPT 3.5, while a subset of 229 questions were entered into ChatGPT 4. Responses were recorded, and the accuracy of ChatGPT answers as well as its performance in different test question categories and for different difficulty levels were compared between both versions. Results: Overall, ChatGPT 4 demonstrated a statistically significant superior performance compared to ChatGPT 3.5, achieving an accuracy of 84.7\% (194/229) and 56.9\% (1047/1840), respectively. A noteworthy correlation was observed between the length of test questions and the performance of ChatGPT 3.5 ($\rho$=--0.069; P=.003), which was absent in ChatGPT 4 (P=.87). Additionally, the difficulty of test questions, as categorized by AMBOSS hammer ratings, showed a statistically significant correlation with performance for both ChatGPT versions, with $\rho$=--0.289 for ChatGPT 3.5 and $\rho$=--0.344 for ChatGPT 4. ChatGPT 4 surpassed ChatGPT 3.5 in all levels of test question difficulty, except for the 2 highest difficulty tiers (4 and 5 hammers), where statistical significance was not reached. Conclusions: In this study, ChatGPT 4 demonstrated remarkable proficiency in taking the USMLE Step 3, with an accuracy rate of 84.7\% (194/229), outshining ChatGPT 3.5 with an accuracy rate of 56.9\% (1047/1840). Although ChatGPT 4 performed exceptionally, it encountered difficulties in questions requiring the application of theoretical concepts, particularly in cardiology and neurology. These insights are pivotal for the development of examination strategies that are resilient to AI and underline the promising role of AI in the realm of medical education and diagnostics. ", doi="10.2196/51148", url="https://mededu.jmir.org/2024/1/e51148", url="http://www.ncbi.nlm.nih.gov/pubmed/38180782" } @Article{info:doi/10.2196/51183, author="Blease, Charlotte and Torous, John and McMillan, Brian and H{\"a}gglund, Maria and Mandl, D. Kenneth", title="Generative Language Models and Open Notes: Exploring the Promise and Limitations", journal="JMIR Med Educ", year="2024", month="Jan", day="4", volume="10", pages="e51183", keywords="ChatGPT", keywords="generative language models", keywords="large language models", keywords="medical education", keywords="Open Notes", keywords="online record access", keywords="patient-centered care", keywords="empathy", keywords="language model", keywords="documentation", keywords="communication tool", keywords="clinical documentation", doi="10.2196/51183", url="https://mededu.jmir.org/2024/1/e51183", url="http://www.ncbi.nlm.nih.gov/pubmed/38175688" } @Article{info:doi/10.2196/50869, author="Erren, C.
Thomas", title="Patients, Doctors, and Chatbots", journal="JMIR Med Educ", year="2024", month="Jan", day="4", volume="10", pages="e50869", keywords="chatbot", keywords="ChatGPT", keywords="medical advice", keywords="ethics", keywords="patients", keywords="doctors", doi="10.2196/50869", url="https://mededu.jmir.org/2024/1/e50869", url="http://www.ncbi.nlm.nih.gov/pubmed/38175695" } @Article{info:doi/10.2196/51199, author="Koranteng, Erica and Rao, Arya and Flores, Efren and Lev, Michael and Landman, Adam and Dreyer, Keith and Succi, Marc", title="Empathy and Equity: Key Considerations for Large Language Model Adoption in Health Care", journal="JMIR Med Educ", year="2023", month="Dec", day="28", volume="9", pages="e51199", keywords="ChatGPT", keywords="AI", keywords="artificial intelligence", keywords="large language models", keywords="LLMs", keywords="ethics", keywords="empathy", keywords="equity", keywords="bias", keywords="language model", keywords="health care application", keywords="patient care", keywords="care", keywords="development", keywords="framework", keywords="model", keywords="ethical implication", doi="10.2196/51199", url="https://mededu.jmir.org/2023/1/e51199", url="http://www.ncbi.nlm.nih.gov/pubmed/38153778" } @Article{info:doi/10.2196/48904, author="Liao, Wenxiong and Liu, Zhengliang and Dai, Haixing and Xu, Shaochen and Wu, Zihao and Zhang, Yiyang and Huang, Xiaoke and Zhu, Dajiang and Cai, Hongmin and Li, Quanzheng and Liu, Tianming and Li, Xiang", title="Differentiating ChatGPT-Generated and Human-Written Medical Texts: Quantitative Study", journal="JMIR Med Educ", year="2023", month="Dec", day="28", volume="9", pages="e48904", keywords="ChatGPT", keywords="medical ethics", keywords="linguistic analysis", keywords="text classification", keywords="artificial intelligence", keywords="medical texts", keywords="machine learning", abstract="Background: Large language models, such as ChatGPT, are capable of generating grammatically perfect and human-like text content, and a large number of ChatGPT-generated texts have appeared on the internet. However, medical texts, such as clinical notes and diagnoses, require rigorous validation, and erroneous medical content generated by ChatGPT could potentially lead to disinformation that poses significant harm to health care and the general public. Objective: This study is among the first on responsible artificial intelligence--generated content in medicine. We focus on analyzing the differences between medical texts written by human experts and those generated by ChatGPT and designing machine learning workflows to effectively detect and differentiate medical texts generated by ChatGPT. Methods: We first constructed a suite of data sets containing medical texts written by human experts and generated by ChatGPT. We analyzed the linguistic features of these 2 types of content and uncovered differences in vocabulary, parts-of-speech, dependency, sentiment, perplexity, and other aspects. Finally, we designed and implemented machine learning methods to detect medical text generated by ChatGPT. The data and code used in this paper are published on GitHub. Results: Medical texts written by humans were more concrete, more diverse, and typically contained more useful information, while medical texts generated by ChatGPT paid more attention to fluency and logic and usually expressed general terminologies rather than effective information specific to the context of the problem. 
A bidirectional encoder representations from transformers--based model effectively detected medical texts generated by ChatGPT, and the F1 score exceeded 95\%. Conclusions: Although text generated by ChatGPT is grammatically perfect and human-like, the linguistic characteristics of generated medical texts were different from those written by human experts. Medical text generated by ChatGPT could be effectively detected by the proposed machine learning algorithms. This study provides a pathway toward trustworthy and accountable use of large language models in medicine. ", doi="10.2196/48904", url="https://mededu.jmir.org/2023/1/e48904", url="http://www.ncbi.nlm.nih.gov/pubmed/38153785" } @Article{info:doi/10.2196/50373, author="Knopp, I. Michelle and Warm, J. Eric and Weber, Danielle and Kelleher, Matthew and Kinnear, Benjamin and Schumacher, J. Daniel and Santen, A. Sally and Mendon{\c{c}}a, Eneida and Turner, Laurah", title="AI-Enabled Medical Education: Threads of Change, Promising Futures, and Risky Realities Across Four Potential Future Worlds", journal="JMIR Med Educ", year="2023", month="Dec", day="25", volume="9", pages="e50373", keywords="artificial intelligence", keywords="medical education", keywords="scenario planning", keywords="future of healthcare", keywords="ethics and AI", keywords="future", keywords="scenario", keywords="ChatGPT", keywords="generative", keywords="GPT-4", keywords="ethic", keywords="ethics", keywords="ethical", keywords="strategic planning", keywords="Open-AI", keywords="OpenAI", keywords="privacy", keywords="autonomy", keywords="autonomous", abstract="Background: The rapid trajectory of artificial intelligence (AI) development and advancement is quickly outpacing society's ability to determine its future role. As AI continues to transform various aspects of our lives, one critical question arises for medical education: what will be the nature of education, teaching, and learning in a future world where the acquisition, retention, and application of knowledge in the traditional sense are fundamentally altered by AI? Objective: The purpose of this perspective is to plan for the intersection of health care and medical education in the future. Methods: We used GPT-4 and scenario-based strategic planning techniques to craft 4 hypothetical future worlds influenced by AI's integration into health care and medical education. This method, used by organizations such as Shell and the Accreditation Council for Graduate Medical Education, assesses readiness for alternative futures and effectively manages uncertainty, risk, and opportunity. The detailed scenarios provide insights into potential environments the medical profession may face and lay the foundation for hypothesis generation and idea-building regarding responsible AI implementation. Results: The following 4 worlds were created using OpenAI's GPT model: AI Harmony, AI conflict, The world of Ecological Balance, and Existential Risk. Risks include disinformation and misinformation, loss of privacy, widening inequity, erosion of human autonomy, and ethical dilemmas. Benefits involve improved efficiency, personalized interventions, enhanced collaboration, early detection, and accelerated research. Conclusions: To ensure responsible AI use, the authors suggest focusing on 3 key areas: developing a robust ethical framework, fostering interdisciplinary collaboration, and investing in education and training. 
A strong ethical framework emphasizes patient safety, privacy, and autonomy while promoting equity and inclusivity. Interdisciplinary collaboration encourages cooperation among various experts in developing and implementing AI technologies, ensuring that they address the complex needs and challenges in health care and medical education. Investing in education and training prepares professionals and trainees with necessary skills and knowledge to effectively use and critically evaluate AI technologies. The integration of AI in health care and medical education presents a critical juncture between transformative advancements and significant risks. By working together to address both immediate and long-term risks and consequences, we can ensure that AI integration leads to a more equitable, sustainable, and prosperous future for both health care and medical education. As we engage with AI technologies, our collective actions will ultimately determine the state of the future of health care and medical education to harness AI's power while ensuring the safety and well-being of humanity. ", doi="10.2196/50373", url="https://mededu.jmir.org/2023/1/e50373", url="http://www.ncbi.nlm.nih.gov/pubmed/38145471" } @Article{info:doi/10.2196/51302, author="Alkhaaldi, I. Saif M. and Kassab, H. Carl and Dimassi, Zakia and Oyoun Alsoud, Leen and Al Fahim, Maha and Al Hageh, Cynthia and Ibrahim, Halah", title="Medical Student Experiences and Perceptions of ChatGPT and Artificial Intelligence: Cross-Sectional Study", journal="JMIR Med Educ", year="2023", month="Dec", day="22", volume="9", pages="e51302", keywords="medical education", keywords="ChatGPT", keywords="artificial intelligence", keywords="large language models", keywords="LLMs", keywords="AI", keywords="medical student", keywords="medical students", keywords="cross-sectional study", keywords="training", keywords="technology", keywords="medicine", keywords="health care professionals", keywords="risk", keywords="education", abstract="Background: Artificial intelligence (AI) has the potential to revolutionize the way medicine is learned, taught, and practiced, and medical education must prepare learners for these inevitable changes. Academic medicine has, however, been slow to embrace recent AI advances. Since its launch in November 2022, ChatGPT has emerged as a fast and user-friendly large language model that can assist health care professionals, medical educators, students, trainees, and patients. While many studies focus on the technology's capabilities, potential, and risks, there is a gap in studying the perspective of end users. Objective: The aim of this study was to gauge the experiences and perspectives of graduating medical students on ChatGPT and AI in their training and future careers. Methods: A cross-sectional web-based survey of recently graduated medical students was conducted in an international academic medical center between May 5, 2023, and June 13, 2023. Descriptive statistics were used to tabulate variable frequencies. Results: Of 325 applicants to the residency programs, 265 completed the survey (an 81.5\% response rate). The vast majority of respondents denied using ChatGPT in medical school, with 20.4\% (n=54) using it to help complete written assessments and only 9.4\% using the technology in their clinical work (n=25). More students planned to use it during residency, primarily for exploring new medical topics and research (n=168, 63.4\%) and exam preparation (n=151, 57\%). 
Male students were significantly more likely to believe that AI will improve diagnostic accuracy (n=47, 51.7\% vs n=69, 39.7\%; P=.001), reduce medical error (n=53, 58.2\% vs n=71, 40.8\%; P=.002), and improve patient care (n=60, 65.9\% vs n=95, 54.6\%; P=.007). Previous experience with AI was significantly associated with positive AI perception in terms of improving patient care, decreasing medical errors and misdiagnoses, and increasing the accuracy of diagnoses (P=.001, P<.001, P=.008, respectively). Conclusions: The surveyed medical students had minimal formal and informal experience with AI tools and limited perceptions of the potential uses of AI in health care but had overall positive views of ChatGPT and AI and were optimistic about the future of AI in medical education and health care. Structured curricula and formal policies and guidelines are needed to adequately prepare medical learners for the forthcoming integration of AI in medicine. ", doi="10.2196/51302", url="https://mededu.jmir.org/2023/1/e51302", url="http://www.ncbi.nlm.nih.gov/pubmed/38133911" } @Article{info:doi/10.2196/50658, author="Tangadulrat, Pasin and Sono, Supinya and Tangtrakulwanich, Boonsin", title="Using ChatGPT for Clinical Practice and Medical Education: Cross-Sectional Survey of Medical Students' and Physicians' Perceptions", journal="JMIR Med Educ", year="2023", month="Dec", day="22", volume="9", pages="e50658", keywords="ChatGPT", keywords="AI", keywords="artificial intelligence", keywords="medical education", keywords="medical students", keywords="student", keywords="students", keywords="intern", keywords="interns", keywords="resident", keywords="residents", keywords="knee osteoarthritis", keywords="survey", keywords="surveys", keywords="questionnaire", keywords="questionnaires", keywords="chatbot", keywords="chatbots", keywords="conversational agent", keywords="conversational agents", keywords="attitude", keywords="attitudes", keywords="opinion", keywords="opinions", keywords="perception", keywords="perceptions", keywords="perspective", keywords="perspectives", keywords="acceptance", abstract="Background: ChatGPT is a well-known large language model--based chatbot. It could be used in the medical field in many aspects. However, some physicians are still unfamiliar with ChatGPT and are concerned about its benefits and risks. Objective: We aim to evaluate the perception of physicians and medical students toward using ChatGPT in the medical field. Methods: A web-based questionnaire was sent to medical students, interns, residents, and attending staff with questions regarding their perception toward using ChatGPT in clinical practice and medical education. Participants were also asked to rate their perception of ChatGPT's generated response about knee osteoarthritis. Results: Participants included 124 medical students, 46 interns, 37 residents, and 32 attending staff. After reading ChatGPT's response, 132 of the 239 (55.2\%) participants had a positive rating about using ChatGPT for clinical practice. The proportion of positive answers was significantly lower in graduated physicians (48/115, 42\%) compared with medical students (84/124, 68\%; P<.001). Participants listed a lack of a patient-specific treatment plan, updated evidence, and a language barrier as ChatGPT's pitfalls. Regarding using ChatGPT for medical education, the proportion of positive responses was also significantly lower in graduate physicians (71/115, 62\%) compared to medical students (103/124, 83.1\%; P<.001). 
Participants were concerned that ChatGPT's response was too superficial, might lack scientific evidence, and might need expert verification. Conclusions: Medical students generally had a positive perception of using ChatGPT for guiding treatment and medical education, whereas graduated doctors were more cautious in this regard. Nonetheless, both medical students and graduated doctors positively perceived using ChatGPT for creating patient educational materials. ", doi="10.2196/50658", url="https://mededu.jmir.org/2023/1/e50658", url="http://www.ncbi.nlm.nih.gov/pubmed/38133908" } @Article{info:doi/10.2196/49183, author="Buhr, Raphael Christoph and Smith, Harry and Huppertz, Tilman and Bahr-Hamm, Katharina and Matthias, Christoph and Blaikie, Andrew and Kelsey, Tom and Kuhn, Sebastian and Eckrich, Jonas", title="ChatGPT Versus Consultants: Blinded Evaluation on Answering Otorhinolaryngology Case--Based Questions", journal="JMIR Med Educ", year="2023", month="Dec", day="5", volume="9", pages="e49183", keywords="large language models", keywords="LLMs", keywords="LLM", keywords="artificial intelligence", keywords="AI", keywords="ChatGPT", keywords="otorhinolaryngology", keywords="ORL", keywords="digital health", keywords="chatbots", keywords="global health", keywords="low- and middle-income countries", keywords="telemedicine", keywords="telehealth", keywords="language model", keywords="chatbot", abstract="Background: Large language models (LLMs), such as ChatGPT (Open AI), are increasingly used in medicine and supplement standard search engines as information sources. This leads to more ``consultations'' of LLMs about personal medical symptoms. Objective: This study aims to evaluate ChatGPT's performance in answering clinical case--based questions in otorhinolaryngology (ORL) in comparison to ORL consultants' answers. Methods: We used 41 case-based questions from established ORL study books and past German state examinations for doctors. The questions were answered by both ORL consultants and ChatGPT 3. ORL consultants rated all responses, except their own, on medical adequacy, conciseness, coherence, and comprehensibility using a 6-point Likert scale. They also identified (in a blinded setting) if the answer was created by an ORL consultant or ChatGPT. Additionally, the character count was compared. Due to the rapidly evolving pace of technology, a comparison between responses generated by ChatGPT 3 and ChatGPT 4 was included to give an insight into the evolving potential of LLMs. Results: Ratings in all categories were significantly higher for ORL consultants (P<.001). Although inferior to the scores of the ORL consultants, ChatGPT's scores were relatively higher in semantic categories (conciseness, coherence, and comprehensibility) compared to medical adequacy. ORL consultants identified ChatGPT as the source correctly in 98.4\% (121/123) of cases. ChatGPT's answers had a significantly higher character count compared to ORL consultants (P<.001). Comparison between responses generated by ChatGPT 3 and ChatGPT 4 showed a slight improvement in medical accuracy as well as a better coherence of the answers provided. Contrarily, neither the conciseness (P=.06) nor the comprehensibility (P=.08) improved significantly despite the significant increase in the mean amount of characters by 52.5\% (n= (1470-964)/964; P<.001). Conclusions: While ChatGPT provided longer answers to medical problems, medical adequacy and conciseness were significantly lower compared to ORL consultants' answers. 
LLMs have potential as augmentative tools for medical care, but their ``consultation'' for medical problems carries a high risk of misinformation as their high semantic quality may mask contextual deficits. ", doi="10.2196/49183", url="https://mededu.jmir.org/2023/1/e49183", url="http://www.ncbi.nlm.nih.gov/pubmed/38051578" } @Article{info:doi/10.2196/51243, author="Spallek, Sophia and Birrell, Louise and Kershaw, Stephanie and Devine, Krogh Emma and Thornton, Louise", title="Can we use ChatGPT for Mental Health and Substance Use Education? Examining Its Quality and Potential Harms", journal="JMIR Med Educ", year="2023", month="Nov", day="30", volume="9", pages="e51243", keywords="artificial intelligence", keywords="generative artificial intelligence", keywords="large language models", keywords="ChatGPT", keywords="medical education", keywords="health education", keywords="patient education handout", keywords="preventive health services", keywords="educational intervention", keywords="mental health", keywords="substance use", abstract="Background: The use of generative artificial intelligence, more specifically large language models (LLMs), is proliferating, and as such, it is vital to consider both the value and potential harms of its use in medical education. Their efficiency in a variety of writing styles makes LLMs, such as ChatGPT, attractive for tailoring educational materials. However, this technology can feature biases and misinformation, which can be particularly harmful in medical education settings, such as mental health and substance use education. This viewpoint investigates if ChatGPT is sufficient for 2 common health education functions in the field of mental health and substance use: (1) answering users' direct queries and (2) aiding in the development of quality consumer educational health materials. Objective: This viewpoint includes a case study to provide insight into the accessibility, biases, and quality of ChatGPT's query responses and educational health materials. We aim to provide guidance for the general public and health educators wishing to utilize LLMs. Methods: We collected real world queries from 2 large-scale mental health and substance use portals and engineered a variety of prompts to use on GPT-4 Pro with the Bing BETA internet browsing plug-in. The outputs were evaluated with tools from the Sydney Health Literacy Lab to determine the accessibility, the adherence to Mindframe communication guidelines to identify biases, and author assessments on quality, including tailoring to audiences, duty of care disclaimers, and evidence-based internet references. Results: GPT-4's outputs had good face validity, but upon detailed analysis were substandard in comparison to expert-developed materials. Without engineered prompting, the reading level, adherence to communication guidelines, and use of evidence-based websites were poor. Therefore, all outputs still required cautious human editing and oversight. Conclusions: GPT-4 is currently not reliable enough for direct-consumer queries, but educators and researchers can use it for creating educational materials with caution. Materials created with LLMs should disclose the use of generative artificial intelligence and be evaluated on their efficacy with the target audience. 
", doi="10.2196/51243", url="https://mededu.jmir.org/2023/1/e51243", url="http://www.ncbi.nlm.nih.gov/pubmed/38032714" } @Article{info:doi/10.2196/47274, author="Wong, Shin-Yee Rebecca and Ming, Chiau Long and Raja Ali, Affendi Raja", title="The Intersection of ChatGPT, Clinical Medicine, and Medical Education", journal="JMIR Med Educ", year="2023", month="Nov", day="21", volume="9", pages="e47274", keywords="ChatGPT", keywords="clinical research", keywords="large language model", keywords="artificial intelligence", keywords="ethical considerations", keywords="AI", keywords="OpenAI", doi="10.2196/47274", url="https://mededu.jmir.org/2023/1/e47274", url="http://www.ncbi.nlm.nih.gov/pubmed/37988149" } @Article{info:doi/10.2196/49877, author="Scherr, Riley and Halaseh, F. Faris and Spina, Aidin and Andalib, Saman and Rivera, Ronald", title="ChatGPT Interactive Medical Simulations for Early Clinical Education: Case Study", journal="JMIR Med Educ", year="2023", month="Nov", day="10", volume="9", pages="e49877", keywords="ChatGPT", keywords="medical school simulations", keywords="preclinical curriculum", keywords="artificial intelligence", keywords="AI", keywords="AI in medical education", keywords="medical education", keywords="simulation", keywords="generative", keywords="curriculum", keywords="clinical education", keywords="simulations", abstract="Background: The transition to clinical clerkships can be difficult for medical students, as it requires the synthesis and application of preclinical information into diagnostic and therapeutic decisions. ChatGPT---a generative language model with many medical applications due to its creativity, memory, and accuracy---can help students in this transition. Objective: This paper models ChatGPT 3.5's ability to perform interactive clinical simulations and shows this tool's benefit to medical education. Methods: Simulation starting prompts were refined using ChatGPT 3.5 in Google Chrome. Starting prompts were selected based on assessment format, stepwise progression of simulation events and questions, free-response question type, responsiveness to user inputs, postscenario feedback, and medical accuracy of the feedback. The chosen scenarios were advanced cardiac life support and medical intensive care (for sepsis and pneumonia). Results: Two starting prompts were chosen. Prompt 1 was developed through 3 test simulations and used successfully in 2 simulations. Prompt 2 was developed through 10 additional test simulations and used successfully in 1 simulation. Conclusions: ChatGPT is capable of creating simulations for early clinical education. These simulations let students practice novel parts of the clinical curriculum, such as forming independent diagnostic and therapeutic impressions over an entire patient encounter. Furthermore, the simulations can adapt to user inputs in a way that replicates real life more accurately than premade question bank clinical vignettes. Finally, ChatGPT can create potentially unlimited free simulations with specific feedback, which increases access for medical students with lower socioeconomic status and underresourced medical schools. However, no tool is perfect, and ChatGPT is no exception; there are concerns about simulation accuracy and replicability that need to be addressed to further optimize ChatGPT's performance as an educational resource. 
", doi="10.2196/49877", url="https://mededu.jmir.org/2023/1/e49877", url="http://www.ncbi.nlm.nih.gov/pubmed/37948112" } @Article{info:doi/10.2196/49459, author="Abuyaman, Omar", title="Strengths and Weaknesses of ChatGPT Models for Scientific Writing About Medical Vitamin B12: Mixed Methods Study", journal="JMIR Form Res", year="2023", month="Nov", day="10", volume="7", pages="e49459", keywords="AI", keywords="ChatGPT", keywords="GPT-4", keywords="GPT-3.5", keywords="vitamin B12", keywords="artificial intelligence", keywords="language editing", keywords="wide range information", keywords="AI solutions", keywords="scientific content", abstract="Background: ChatGPT is a large language model developed by OpenAI designed to generate human-like responses to prompts. Objective: This study aims to evaluate the ability of GPT-4 to generate scientific content and assist in scientific writing using medical vitamin B12 as the topic. Furthermore, the study will compare the performance of GPT-4 to its predecessor, GPT-3.5. Methods: The study examined responses from GPT-4 and GPT-3.5 to vitamin B12--related prompts, focusing on their quality and characteristics and comparing them to established scientific literature. Results: The results indicated that GPT-4 can potentially streamline scientific writing through its ability to edit language and write abstracts, keywords, and abbreviation lists. However, significant limitations of ChatGPT were revealed, including its inability to identify and address bias, inability to include recent information, lack of transparency, and inclusion of inaccurate information. Additionally, it cannot check for plagiarism or provide proper references. The accuracy of GPT-4's answers was found to be superior to GPT-3.5. Conclusions: ChatGPT can be considered a helpful assistant in the writing process but not a replacement for a scientist's expertise. Researchers must remain aware of its limitations and use it appropriately. The improvements in consecutive ChatGPT versions suggest the possibility of overcoming some present limitations in the near future. ", doi="10.2196/49459", url="https://formative.jmir.org/2023/1/e49459", url="http://www.ncbi.nlm.nih.gov/pubmed/37948100" } @Article{info:doi/10.2196/47191, author="Surapaneni, Mohan Krishna", title="Assessing the Performance of ChatGPT in Medical Biochemistry Using Clinical Case Vignettes: Observational Study", journal="JMIR Med Educ", year="2023", month="Nov", day="7", volume="9", pages="e47191", keywords="ChatGPT", keywords="artificial intelligence", keywords="medical education", keywords="medical Biochemistry", keywords="biochemistry", keywords="chatbot", keywords="case study", keywords="case scenario", keywords="medical exam", keywords="medical examination", keywords="computer generated", abstract="Background: ChatGPT has gained global attention recently owing to its high performance in generating a wide range of information and retrieving any kind of data instantaneously. ChatGPT has also been tested for the United States Medical Licensing Examination (USMLE) and has successfully cleared it. Thus, its usability in medical education is now one of the key discussions worldwide. Objective: The objective of this study is to evaluate the performance of ChatGPT in medical biochemistry using clinical case vignettes. Methods: The performance of ChatGPT was evaluated in medical biochemistry using 10 clinical case vignettes. Clinical case vignettes were randomly selected and inputted in ChatGPT along with the response options. 
We tested the responses for each clinical case twice. The answers generated by ChatGPT were saved and checked using our reference material. Results: ChatGPT generated correct answers for 4 questions on the first attempt. For the other cases, there were differences in responses generated by ChatGPT in the first and second attempts. In the second attempt, ChatGPT provided correct answers for 6 questions and incorrect answers for 4 questions out of the 10 cases that were used. But, to our surprise, for case 3, different answers were obtained with multiple attempts. We believe this to have happened owing to the complexity of the case, which involved addressing various critical medical aspects related to amino acid metabolism in a balanced approach. Conclusions: According to the findings of our study, ChatGPT may not be considered an accurate information provider for application in medical education to improve learning and assessment. However, our study was limited by a small sample size (10 clinical case vignettes) and the use of the publicly available version of ChatGPT (version 3.5). Although artificial intelligence (AI) has the capability to transform medical education, we emphasize the validation of such data produced by such AI systems for correctness and dependability before it could be implemented in practice. ", doi="10.2196/47191", url="https://mededu.jmir.org/2023/1/e47191", url="http://www.ncbi.nlm.nih.gov/pubmed/37934568" } @Article{info:doi/10.2196/47532, author="Ito, Naoki and Kadomatsu, Sakina and Fujisawa, Mineto and Fukaguchi, Kiyomitsu and Ishizawa, Ryo and Kanda, Naoki and Kasugai, Daisuke and Nakajima, Mikio and Goto, Tadahiro and Tsugawa, Yusuke", title="The Accuracy and Potential Racial and Ethnic Biases of GPT-4 in the Diagnosis and Triage of Health Conditions: Evaluation Study", journal="JMIR Med Educ", year="2023", month="Nov", day="2", volume="9", pages="e47532", keywords="GPT-4", keywords="racial and ethnic bias", keywords="typical clinical vignettes", keywords="diagnosis", keywords="triage", keywords="artificial intelligence", keywords="AI", keywords="race", keywords="clinical vignettes", keywords="physician", keywords="efficiency", keywords="decision-making", keywords="bias", keywords="GPT", abstract="Background: Whether GPT-4, the conversational artificial intelligence, can accurately diagnose and triage health conditions and whether it presents racial and ethnic biases in its decisions remain unclear. Objective: We aim to assess the accuracy of GPT-4 in the diagnosis and triage of health conditions and whether its performance varies by patient race and ethnicity. Methods: We compared the performance of GPT-4 and physicians, using 45 typical clinical vignettes, each with a correct diagnosis and triage level, in February and March 2023. For each of the 45 clinical vignettes, GPT-4 and 3 board-certified physicians provided the most likely primary diagnosis and triage level (emergency, nonemergency, or self-care). Independent reviewers evaluated the diagnoses as ``correct'' or ``incorrect.'' Physician diagnosis was defined as the consensus of the 3 physicians. We evaluated whether the performance of GPT-4 varies by patient race and ethnicity, by adding the information on patient race and ethnicity to the clinical vignettes. Results: The accuracy of diagnosis was comparable between GPT-4 and physicians (the percentage of correct diagnosis was 97.8\% (44/45; 95\% CI 88.2\%-99.9\%) for GPT-4 and 91.1\% (41/45; 95\% CI 78.8\%-97.5\%) for physicians; P=.38). 
GPT-4 provided appropriate reasoning for 97.8\% (44/45) of the vignettes. The appropriateness of triage was comparable between GPT-4 and physicians (GPT-4: 30/45, 66.7\%; 95\% CI 51.0\%-80.0\%; physicians: 30/45, 66.7\%; 95\% CI 51.0\%-80.0\%; P=.99). The performance of GPT-4 in diagnosing health conditions did not vary among different races and ethnicities (Black, White, Asian, and Hispanic), with an accuracy of 100\% (95\% CI 78.2\%-100\%). P values, compared to the GPT-4 output without incorporating race and ethnicity information, were all .99. The accuracy of triage was not significantly different even if patients' race and ethnicity information was added. The accuracy of triage was 62.2\% (95\% CI 46.5\%-76.2\%; P=.50) for Black patients; 66.7\% (95\% CI 51.0\%-80.0\%; P=.99) for White patients; 66.7\% (95\% CI 51.0\%-80.0\%; P=.99) for Asian patients, and 62.2\% (95\% CI 46.5\%-76.2\%; P=.69) for Hispanic patients. P values were calculated by comparing the outputs with and without conditioning on race and ethnicity. Conclusions: GPT-4's ability to diagnose and triage typical clinical vignettes was comparable to that of board-certified physicians. The performance of GPT-4 did not vary by patient race and ethnicity. These findings should be informative for health systems looking to introduce conversational artificial intelligence to improve the efficiency of patient diagnosis and triage. ", doi="10.2196/47532", url="https://mededu.jmir.org/2023/1/e47532", url="http://www.ncbi.nlm.nih.gov/pubmed/37917120" } @Article{info:doi/10.2196/51421, author="Baglivo, Francesco and De Angelis, Luigi and Casigliani, Virginia and Arzilli, Guglielmo and Privitera, Pierpaolo Gaetano and Rizzo, Caterina", title="Exploring the Possible Use of AI Chatbots in Public Health Education: Feasibility Study", journal="JMIR Med Educ", year="2023", month="Nov", day="1", volume="9", pages="e51421", keywords="artificial intelligence", keywords="chatbots", keywords="medical education", keywords="vaccination", keywords="public health", keywords="medical students", keywords="large language model", keywords="generative AI", keywords="ChatGPT", keywords="Google Bard", keywords="AI chatbot", keywords="health education", keywords="health care", keywords="medical training", keywords="educational support tool", keywords="chatbot model", abstract="Background: Artificial intelligence (AI) is a rapidly developing field with the potential to transform various aspects of health care and public health, including medical training. During the ``Hygiene and Public Health'' course for fifth-year medical students, a practical training session was conducted on vaccination using AI chatbots as an educational supportive tool. Before receiving specific training on vaccination, the students were given a web-based test extracted from the Italian National Medical Residency Test. After completing the test, a critical correction of each question was performed assisted by AI chatbots. Objective: The main aim of this study was to identify whether AI chatbots can be considered educational support tools for training in public health. The secondary objective was to assess the performance of different AI chatbots on complex multiple-choice medical questions in the Italian language. 
Methods: A test composed of 15 multiple-choice questions on vaccination was extracted from the Italian National Medical Residency Test using targeted keywords and administered to medical students via Google Forms and to different AI chatbot models (Bing Chat, ChatGPT, Chatsonic, Google Bard, and YouChat). The correction of the test was conducted in the classroom, focusing on the critical evaluation of the explanations provided by the chatbot. A Mann-Whitney U test was conducted to compare the performances of medical students and AI chatbots. Student feedback was collected anonymously at the end of the training experience. Results: In total, 36 medical students and 5 AI chatbot models completed the test. The students achieved an average score of 8.22 (SD 2.65) out of 15, while the AI chatbots scored an average of 12.22 (SD 2.77). The results indicated a statistically significant difference in performance between the 2 groups (U=49.5, P<.001), with a large effect size (r=0.69). When divided by question type (direct, scenario-based, and negative), significant differences were observed in direct (P<.001) and scenario-based (P<.001) questions, but not in negative questions (P=.48). The students reported a high level of satisfaction (7.9/10) with the educational experience, expressing a strong desire to repeat the experience (7.6/10). Conclusions: This study demonstrated the efficacy of AI chatbots in answering complex medical questions related to vaccination and providing valuable educational support. Their performance significantly surpassed that of medical students in direct and scenario-based questions. The responsible and critical use of AI chatbots can enhance medical education, making it an essential aspect to integrate into the educational system. ", doi="10.2196/51421", url="https://mededu.jmir.org/2023/1/e51421", url="http://www.ncbi.nlm.nih.gov/pubmed/37910155" } @Article{info:doi/10.2196/48452, author="Kunitsu, Yuki", title="The Potential of GPT-4 as a Support Tool for Pharmacists: Analytical Study Using the Japanese National Examination for Pharmacists", journal="JMIR Med Educ", year="2023", month="Oct", day="30", volume="9", pages="e48452", keywords="natural language processing", keywords="generative pretrained transformer", keywords="GPT-4", keywords="ChatGPT", keywords="artificial intelligence", keywords="AI", keywords="chatbot", keywords="pharmacy", keywords="pharmacist", abstract="Background: The advancement of artificial intelligence (AI), as well as machine learning, has led to its application in various industries, including health care. AI chatbots, such as GPT-4, developed by OpenAI, have demonstrated potential in supporting health care professionals by providing medical information, answering examination questions, and assisting in medical education. However, the applicability of GPT-4 in the field of pharmacy remains unexplored. Objective: This study aimed to evaluate GPT-4's ability to answer questions from the Japanese National Examination for Pharmacists (JNEP) and assess its potential as a support tool for pharmacists in their daily practice. Methods: The question texts and answer choices from the 107th and 108th JNEP, held in February 2022 and February 2023, were input into GPT-4. As GPT-4 cannot process diagrams, questions that included diagram interpretation were not analyzed and were initially given a score of 0. The correct answer rates were calculated and compared with the passing criteria of each examination to evaluate GPT-4's performance. 
Results: For the 107th and 108th JNEP, GPT-4 achieved an accuracy rate of 64.5\% (222/344) and 62.9\% (217/345), respectively, for all questions. When considering only the questions that GPT-4 could answer, the accuracy rates increased to 78.2\% (222/284) and 75.3\% (217/287), respectively. The accuracy rates tended to be lower for physics, chemistry, and calculation questions. Conclusions: Although GPT-4 demonstrated the potential to answer questions from the JNEP and support pharmacists' capabilities, it also showed limitations in handling highly specialized questions, calculation questions, and questions requiring diagram recognition. Further evaluation is necessary to explore its applicability in real-world clinical settings, considering the complexities of patient scenarios and collaboration with health care professionals. By addressing these limitations, GPT-4 could become a more reliable tool for pharmacists in their daily practice. ", doi="10.2196/48452", url="https://mededu.jmir.org/2023/1/e48452", url="http://www.ncbi.nlm.nih.gov/pubmed/37837968" } @Article{info:doi/10.2196/48785, author="Preiksaitis, Carl and Rose, Christian", title="Opportunities, Challenges, and Future Directions of Generative Artificial Intelligence in Medical Education: Scoping Review", journal="JMIR Med Educ", year="2023", month="Oct", day="20", volume="9", pages="e48785", keywords="medical education", keywords="artificial intelligence", keywords="ChatGPT", keywords="Bard", keywords="AI", keywords="educator", keywords="scoping", keywords="review", keywords="learner", keywords="generative", abstract="Background: Generative artificial intelligence (AI) technologies are increasingly being utilized across various fields, with considerable interest and concern regarding their potential application in medical education. These technologies, such as Chat GPT and Bard, can generate new content and have a wide range of possible applications. Objective: This study aimed to synthesize the potential opportunities and limitations of generative AI in medical education. It sought to identify prevalent themes within recent literature regarding potential applications and challenges of generative AI in medical education and use these to guide future areas for exploration. Methods: We conducted a scoping review, following the framework by Arksey and O'Malley, of English language articles published from 2022 onward that discussed generative AI in the context of medical education. A literature search was performed using PubMed, Web of Science, and Google Scholar databases. We screened articles for inclusion, extracted data from relevant studies, and completed a quantitative and qualitative synthesis of the data. Results: Thematic analysis revealed diverse potential applications for generative AI in medical education, including self-directed learning, simulation scenarios, and writing assistance. However, the literature also highlighted significant challenges, such as issues with academic integrity, data accuracy, and potential detriments to learning. Based on these themes and the current state of the literature, we propose the following 3 key areas for investigation: developing learners' skills to evaluate AI critically, rethinking assessment methodology, and studying human-AI interactions. Conclusions: The integration of generative AI in medical education presents exciting opportunities, alongside considerable challenges. 
There is a need to develop new skills and competencies related to AI as well as thoughtful, nuanced approaches to examine the growing use of generative AI in medical education. ", doi="10.2196/48785", url="https://mededu.jmir.org/2023/1/e48785/" } @Article{info:doi/10.2196/48023, author="Yanagita, Yasutaka and Yokokawa, Daiki and Uchida, Shun and Tawara, Junsuke and Ikusaka, Masatomi", title="Accuracy of ChatGPT on Medical Questions in the National Medical Licensing Examination in Japan: Evaluation Study", journal="JMIR Form Res", year="2023", month="Oct", day="13", volume="7", pages="e48023", keywords="artificial intelligence", keywords="ChatGPT", keywords="GPT-4", keywords="AI", keywords="National Medical Licensing Examination", keywords="Japanese", keywords="NMLE", abstract="Background: ChatGPT (OpenAI) has gained considerable attention because of its natural and intuitive responses. ChatGPT sometimes writes plausible-sounding but incorrect or nonsensical answers, as stated by OpenAI as a limitation. However, considering that ChatGPT is an interactive AI that has been trained to reduce the output of unethical sentences, the reliability of the training data is high and the usefulness of the output content is promising. Fortunately, in March 2023, a new version of ChatGPT, GPT-4, was released, which, according to internal evaluations, was expected to increase the likelihood of producing factual responses by 40\% compared with its predecessor, GPT-3.5. The usefulness of this version of ChatGPT in English is widely appreciated. It is also increasingly being evaluated as a system for obtaining medical information in languages other than English. Although it does not reach a passing score on the national medical examination in Chinese, its accuracy is expected to gradually improve. Evaluation of ChatGPT with Japanese input is limited, although there have been reports on the accuracy of ChatGPT's answers to clinical questions regarding the Japanese Society of Hypertension guidelines and on the performance of the National Nursing Examination. Objective: The objective of this study is to evaluate whether ChatGPT can provide accurate diagnoses and medical knowledge for Japanese input. Methods: Questions from the National Medical Licensing Examination (NMLE) in Japan, administered by the Japanese Ministry of Health, Labour and Welfare in 2022, were used. All 400 questions were included. Exclusion criteria were figures and tables that ChatGPT could not recognize; only text questions were extracted. We instructed GPT-3.5 and GPT-4 to input the Japanese questions as they were and to output the correct answers for each question. The output of ChatGPT was verified by 2 general practice physicians. In case of discrepancies, they were checked by another physician to make a final decision. The overall performance was evaluated by calculating the percentage of correct answers output by GPT-3.5 and GPT-4. Results: Of the 400 questions, 292 were analyzed. Questions containing charts, which are not supported by ChatGPT, were excluded. The correct response rate for GPT-4 was 81.5\% (237/292), which was significantly higher than the rate for GPT-3.5, 42.8\% (125/292). Moreover, GPT-4 surpassed the passing standard (>72\%) for the NMLE, indicating its potential as a diagnostic and therapeutic decision aid for physicians. Conclusions: GPT-4 reached the passing standard for the NMLE in Japan, entered in Japanese, although it is limited to written questions. 
As the accelerated progress in the past few months has shown, the performance of the AI will improve as the large language model continues to learn more, and it may well become a decision support system for medical professionals by providing more accurate information. ", doi="10.2196/48023", url="https://formative.jmir.org/2023/1/e48023", url="http://www.ncbi.nlm.nih.gov/pubmed/37831496" } @Article{info:doi/10.2196/48039, author="Flores-Cohaila, A. Javier and Garc{\'i}a-Vicente, Abiga{\'i}l and Vizcarra-Jim{\'e}nez, F. Sonia and De la Cruz-Gal{\'a}n, P. Janith and Guti{\'e}rrez-Arratia, D. Jes{\'u}s and Quiroga Torres, Geraldine Blanca and Taype-Rondan, Alvaro", title="Performance of ChatGPT on the Peruvian National Licensing Medical Examination: Cross-Sectional Study", journal="JMIR Med Educ", year="2023", month="Sep", day="28", volume="9", pages="e48039", keywords="medical education", keywords="generative pre-trained transformer", keywords="ChatGPT", keywords="licensing examination", keywords="assessment", keywords="Peru", keywords="Examen Nacional de Medicina", keywords="ENAM", keywords="learning model", keywords="artificial intelligence", keywords="AI", keywords="medical examination", abstract="Background: ChatGPT has shown impressive performance in national medical licensing examinations, such as the United States Medical Licensing Examination (USMLE), even passing it with expert-level performance. However, there is a lack of research on its performance in low-income countries' national licensing medical examinations. In Peru, where almost one out of three examinees fails the national licensing medical examination, ChatGPT has the potential to enhance medical education. Objective: We aimed to assess the accuracy of ChatGPT using GPT-3.5 and GPT-4 on the Peruvian National Licensing Medical Examination (Examen Nacional de Medicina [ENAM]). Additionally, we sought to identify factors associated with incorrect answers provided by ChatGPT. Methods: We used the ENAM 2022 data set, which consisted of 180 multiple-choice questions, to evaluate the performance of ChatGPT. Various prompts were used, and accuracy was evaluated. The performance of ChatGPT was compared to that of a sample of 1025 examinees. Factors such as question type, Peruvian-specific knowledge, discrimination, difficulty, quality of questions, and subject were analyzed to determine their influence on incorrect answers. Questions that received incorrect answers underwent a three-step process involving different prompts to explore the potential impact of adding roles and context on ChatGPT's accuracy. Results: GPT-4 achieved an accuracy of 86\% on the ENAM, followed by GPT-3.5 with 77\%. The accuracy obtained by the 1025 examinees was 55\%. There was a fair agreement ($\kappa$=0.38) between GPT-3.5 and GPT-4. Moderate-to-high-difficulty questions were associated with incorrect answers in the crude and adjusted model for GPT-3.5 (odds ratio [OR] 6.6, 95\% CI 2.73-15.95) and GPT-4 (OR 33.23, 95\% CI 4.3-257.12). After reinputting questions that received incorrect answers, GPT-3.5 went from 41 (100\%) to 12 (29\%) incorrect answers, and GPT-4 from 25 (100\%) to 4 (16\%). Conclusions: Our study found that ChatGPT (GPT-3.5 and GPT-4) can achieve expert-level performance on the ENAM, outperforming most of our examinees. We found fair agreement between both GPT-3.5 and GPT-4. Incorrect answers were associated with the difficulty of questions, which may resemble human performance. 
Furthermore, by reinputting questions that initially received incorrect answers with different prompts containing additional roles and context, ChatGPT achieved improved accuracy. ", doi="10.2196/48039", url="https://mededu.jmir.org/2023/1/e48039", url="http://www.ncbi.nlm.nih.gov/pubmed/37768724" } @Article{info:doi/10.2196/50514, author="Huang, ST Ryan and Lu, Qi Kevin Jia and Meaney, Christopher and Kemppainen, Joel and Punnett, Angela and Leung, Fok-Han", title="Assessment of Resident and AI Chatbot Performance on the University of Toronto Family Medicine Residency Progress Test: Comparative Study", journal="JMIR Med Educ", year="2023", month="Sep", day="19", volume="9", pages="e50514", keywords="medical education", keywords="medical knowledge exam", keywords="artificial intelligence", keywords="AI", keywords="natural language processing", keywords="NLP", keywords="large language model", keywords="LLM", keywords="machine learning, ChatGPT", keywords="GPT-3.5", keywords="GPT-4", keywords="education", keywords="language model", keywords="education examination", keywords="testing", keywords="utility", keywords="family medicine", keywords="medical residents", keywords="test", keywords="community", abstract="Background: Large language model (LLM)--based chatbots are evolving at an unprecedented pace with the release of ChatGPT, specifically GPT-3.5, and its successor, GPT-4. Their capabilities in general-purpose tasks and language generation have advanced to the point of performing excellently on various educational examination benchmarks, including medical knowledge tests. Comparing the performance of these 2 LLM models to that of Family Medicine residents on a multiple-choice medical knowledge test can provide insights into their potential as medical education tools. Objective: This study aimed to quantitatively and qualitatively compare the performance of GPT-3.5, GPT-4, and Family Medicine residents in a multiple-choice medical knowledge test appropriate for the level of a Family Medicine resident. Methods: An official University of Toronto Department of Family and Community Medicine Progress Test consisting of multiple-choice questions was inputted into GPT-3.5 and GPT-4. The artificial intelligence chatbot's responses were manually reviewed to determine the selected answer, response length, response time, provision of a rationale for the outputted response, and the root cause of all incorrect responses (classified into arithmetic, logical, and information errors). The performance of the artificial intelligence chatbots were compared against a cohort of Family Medicine residents who concurrently attempted the test. Results: GPT-4 performed significantly better compared to GPT-3.5 (difference 25.0\%, 95\% CI 16.3\%-32.8\%; McNemar test: P<.001); it correctly answered 89/108 (82.4\%) questions, while GPT-3.5 answered 62/108 (57.4\%) questions correctly. Further, GPT-4 scored higher across all 11 categories of Family Medicine knowledge. In 86.1\% (n=93) of the responses, GPT-4 provided a rationale for why other multiple-choice options were not chosen compared to the 16.7\% (n=18) achieved by GPT-3.5. Qualitatively, for both GPT-3.5 and GPT-4 responses, logical errors were the most common, while arithmetic errors were the least common. The average performance of Family Medicine residents was 56.9\% (95\% CI 56.2\%-57.6\%). 
The performance of GPT-3.5 was similar to that of the average Family Medicine resident (P=.16), while the performance of GPT-4 exceeded that of the top-performing Family Medicine resident (P<.001). Conclusions: GPT-4 significantly outperforms both GPT-3.5 and Family Medicine residents on a multiple-choice medical knowledge test designed for Family Medicine residents. GPT-4 provides a logical rationale for its response choice, ruling out other answer choices efficiently and with concise justification. Its high degree of accuracy and advanced reasoning capabilities facilitate its potential applications in medical education, including the creation of exam questions and scenarios as well as serving as a resource for medical knowledge or information on community services. ", doi="10.2196/50514", url="https://mededu.jmir.org/2023/1/e50514", url="http://www.ncbi.nlm.nih.gov/pubmed/37725411" } @Article{info:doi/10.2196/47049, author="Khlaif, N. Zuheir and Mousa, Allam and Hattab, Kamal Muayad and Itmazi, Jamil and Hassan, A. Amjad and Sanmugam, Mageswaran and Ayyoub, Abedalkarim", title="The Potential and Concerns of Using AI in Scientific Research: ChatGPT Performance Evaluation", journal="JMIR Med Educ", year="2023", month="Sep", day="14", volume="9", pages="e47049", keywords="artificial intelligence", keywords="AI", keywords="ChatGPT", keywords="scientific research", keywords="research ethics", abstract="Background: Artificial intelligence (AI) has many applications in various aspects of our daily life, including health, criminal, education, civil, business, and liability law. One aspect of AI that has gained significant attention is natural language processing (NLP), which refers to the ability of computers to understand and generate human language. Objective: This study aims to examine the potential for, and concerns of, using AI in scientific research. For this purpose, high-impact research articles were generated by analyzing the quality of reports generated by ChatGPT and assessing the application's impact on the research framework, data analysis, and the literature review. The study also explored concerns around ownership and the integrity of research when using AI-generated text. Methods: A total of 4 articles were generated using ChatGPT, and thereafter evaluated by 23 reviewers. The researchers developed an evaluation form to assess the quality of the articles generated. Additionally, 50 abstracts were generated using ChatGPT and their quality was evaluated. The data were subjected to ANOVA and thematic analysis to analyze the qualitative data provided by the reviewers. Results: When using detailed prompts and providing the context of the study, ChatGPT would generate high-quality research that could be published in high-impact journals. However, ChatGPT had a minor impact on developing the research framework and data analysis. The primary area needing improvement was the development of the literature review. Moreover, reviewers expressed concerns around ownership and the integrity of the research when using AI-generated text. Nonetheless, ChatGPT has a strong potential to increase human productivity in research and can be used in academic writing. Conclusions: AI-generated text has the potential to improve the quality of high-impact research articles. 
The findings of this study suggest that decision makers and researchers should focus more on the methodology part of the research, which includes research design, developing research tools, and analyzing data in depth, to draw strong theoretical and practical implications, thereby establishing a revolution in scientific research in the era of AI. The practical implications of this study can be used in different fields such as medical education to deliver materials to develop the basic competencies for both medicine students and faculty members. ", doi="10.2196/47049", url="https://mededu.jmir.org/2023/1/e47049", url="http://www.ncbi.nlm.nih.gov/pubmed/37707884" } @Article{info:doi/10.2196/48254, author="Sallam, Malik and Salim, A. Nesreen and Barakat, Muna and Al-Mahzoum, Kholoud and Al-Tammemi, B. Ala'a and Malaeb, Diana and Hallit, Rabih and Hallit, Souheil", title="Assessing Health Students' Attitudes and Usage of ChatGPT in Jordan: Validation Study", journal="JMIR Med Educ", year="2023", month="Sep", day="5", volume="9", pages="e48254", keywords="artificial intelligence", keywords="machine learning", keywords="education", keywords="technology", keywords="healthcare", keywords="survey", keywords="opinion", keywords="knowledge", keywords="practices", keywords="KAP", abstract="Background: ChatGPT is a conversational large language model that has the potential to revolutionize knowledge acquisition. However, the impact of this technology on the quality of education is still unknown considering the risks and concerns surrounding ChatGPT use. Therefore, it is necessary to assess the usability and acceptability of this promising tool. As an innovative technology, the intention to use ChatGPT can be studied in the context of the technology acceptance model (TAM). Objective: This study aimed to develop and validate a TAM-based survey instrument called TAME-ChatGPT (Technology Acceptance Model Edited to Assess ChatGPT Adoption) that could be employed to examine the successful integration and use of ChatGPT in health care education. Methods: The survey tool was created based on the TAM framework. It comprised 13 items for participants who heard of ChatGPT but did not use it and 23 items for participants who used ChatGPT. Using a convenient sampling approach, the survey link was circulated electronically among university students between February and March 2023. Exploratory factor analysis (EFA) was used to assess the construct validity of the survey instrument. Results: The final sample comprised 458 respondents, the majority among them undergraduate students (n=442, 96.5\%). Only 109 (23.8\%) respondents had heard of ChatGPT prior to participation and only 55 (11.3\%) self-reported ChatGPT use before the study. EFA analysis on the attitude and usage scales showed significant Bartlett tests of sphericity scores (P<.001) and adequate Kaiser-Meyer-Olkin measures (0.823 for the attitude scale and 0.702 for the usage scale), confirming the factorability of the correlation matrices. The EFA showed that 3 constructs explained a cumulative total of 69.3\% variance in the attitude scale, and these subscales represented perceived risks, attitude to technology/social influence, and anxiety. For the ChatGPT usage scale, EFA showed that 4 constructs explained a cumulative total of 72\% variance in the data and comprised the perceived usefulness, perceived risks, perceived ease of use, and behavior/cognitive factors. 
All the ChatGPT attitude and usage subscales showed good reliability with Cronbach $\alpha$ values >.78 for all the deduced subscales. Conclusions: The TAME-ChatGPT demonstrated good reliability, validity, and usefulness in assessing health care students' attitudes toward ChatGPT. The findings highlighted the importance of considering risk perceptions, usefulness, ease of use, attitudes toward technology, and behavioral factors when adopting ChatGPT as a tool in health care education. This information can aid the stakeholders in creating strategies to support the optimal and ethical use of ChatGPT and to identify the potential challenges hindering its successful implementation. Future research is recommended to guide the effective adoption of ChatGPT in health care education. ", doi="10.2196/48254", url="https://mededu.jmir.org/2023/1/e48254", url="http://www.ncbi.nlm.nih.gov/pubmed/37578934" } @Article{info:doi/10.2196/46482, author="Roos, Jonas and Kasapovic, Adnan and Jansen, Tom and Kaczmarczyk, Robert", title="Artificial Intelligence in Medical Education: Comparative Analysis of ChatGPT, Bing, and Medical Students in Germany", journal="JMIR Med Educ", year="2023", month="Sep", day="4", volume="9", pages="e46482", keywords="medical education", keywords="state examinations", keywords="exams", keywords="large language models", keywords="artificial intelligence", keywords="ChatGPT", abstract="Background: Large language models (LLMs) have demonstrated significant potential in diverse domains, including medicine. Nonetheless, there is a scarcity of studies examining their performance in medical examinations, especially those conducted in languages other than English, and in direct comparison with medical students. Analyzing the performance of LLMs in state medical examinations can provide insights into their capabilities and limitations and evaluate their potential role in medical education and examination preparation.? Objective: This study aimed to assess and compare the performance of 3 LLMs, GPT-4, Bing, and GPT-3.5-Turbo, in the German Medical State Examinations of 2022 and to evaluate their performance relative to that of medical students.? Methods: The LLMs were assessed on a total of 630 questions from the spring and fall German Medical State Examinations of 2022. The performance was evaluated with and without media-related questions. Statistical analyses included 1-way ANOVA and independent samples t tests for pairwise comparisons. The relative strength of the LLMs in comparison with that of the students was also evaluated.? Results: GPT-4 achieved the highest overall performance, correctly answering 88.1\% of questions, closely followed by Bing (86.0\%) and GPT-3.5-Turbo (65.7\%). The students had an average correct answer rate of 74.6\%. Both GPT-4 and Bing significantly outperformed the students in both examinations. When media questions were excluded, Bing achieved the highest performance of 90.7\%, closely followed by GPT-4 (90.4\%), while GPT-3.5-Turbo lagged (68.2\%). There was a significant decline in the performance of GPT-4 and Bing in the fall 2022 examination, which was attributed to a higher proportion of media-related questions and a potential increase in question difficulty.? Conclusions: LLMs, particularly GPT-4 and Bing, demonstrate potential as valuable tools in medical education and for pretesting examination questions. 
Their high performance, even relative to that of medical students, indicates promising avenues for further development and integration into the educational and clinical landscape. ", doi="10.2196/46482", url="https://mededu.jmir.org/2023/1/e46482", url="http://www.ncbi.nlm.nih.gov/pubmed/37665620" } @Article{info:doi/10.2196/51494, author="Leung, I. Tiffany and Sagar, Ankita and Shroff, Swati and Henry, L. Tracey", title="Can AI Mitigate Bias in Writing Letters of Recommendation?", journal="JMIR Med Educ", year="2023", month="Aug", day="23", volume="9", pages="e51494", keywords="sponsorship", keywords="implicit bias", keywords="gender bias", keywords="bias", keywords="letters of recommendation", keywords="artificial intelligence", keywords="large language models", keywords="medical education", keywords="career advancement", keywords="tenure and promotion", keywords="promotion", keywords="leadership", doi="10.2196/51494", url="https://mededu.jmir.org/2023/1/e51494", url="http://www.ncbi.nlm.nih.gov/pubmed/37610808" } @Article{info:doi/10.2196/48433, author="Hsu, Hsing-Yu and Hsu, Kai-Cheng and Hou, Shih-Yen and Wu, Ching-Lung and Hsieh, Yow-Wen and Cheng, Yih-Dih", title="Examining Real-World Medication Consultations and Drug-Herb Interactions: ChatGPT Performance Evaluation", journal="JMIR Med Educ", year="2023", month="Aug", day="21", volume="9", pages="e48433", keywords="ChatGPT", keywords="large language model", keywords="natural language processing", keywords="real-world medication consultation questions", keywords="NLP", keywords="drug-herb interactions", keywords="pharmacist", keywords="LLM", keywords="language models", keywords="chat generative pre-trained transformer", abstract="Background: Since OpenAI released ChatGPT, with its strong capability in handling natural language tasks and its user-friendly interface, it has garnered significant attention. Objective: A prospective analysis is required to evaluate the accuracy and appropriateness of medication consultation responses generated by ChatGPT. Methods: A prospective cross-sectional study was conducted by the pharmacy department of a medical center in Taiwan. The test data set comprised retrospective medication consultation questions collected from February 1, 2023, to February 28, 2023, along with common questions about drug-herb interactions. Two distinct sets of questions were tested: real-world medication consultation questions and common questions about interactions between traditional Chinese and Western medicines. We used the conventional double-review mechanism. The appropriateness of each response from ChatGPT was assessed by 2 experienced pharmacists. In the event of a discrepancy between the assessments, a third pharmacist stepped in to make the final decision. Results: Of 293 real-world medication consultation questions, a random selection of 80 was used to evaluate ChatGPT's performance. ChatGPT exhibited a higher appropriateness rate in responding to public medication consultation questions compared to those asked by health care providers in a hospital setting (31/51, 61\% vs 20/51, 39\%; P=.01). Conclusions: The findings from this study suggest that ChatGPT could potentially be used for answering basic medication consultation questions. Our analysis of the erroneous information allowed us to identify potential medical risks associated with certain questions; this problem deserves our close attention. 
", doi="10.2196/48433", url="https://mededu.jmir.org/2023/1/e48433", url="http://www.ncbi.nlm.nih.gov/pubmed/37561097" } @Article{info:doi/10.2196/47427, author="Lee, Hyeonhoon", title="Using ChatGPT as a Learning Tool in Acupuncture Education: Comparative Study", journal="JMIR Med Educ", year="2023", month="Aug", day="17", volume="9", pages="e47427", keywords="ChatGPT", keywords="educational tool", keywords="artificial intelligence", keywords="acupuncture", keywords="AI", keywords="personalized education", keywords="students", abstract="Background: ChatGPT (Open AI) is a state-of-the-art artificial intelligence model with potential applications in the medical fields of clinical practice, research, and education. Objective: This study aimed to evaluate the potential of ChatGPT as an educational tool in college acupuncture programs, focusing on its ability to support students in learning acupuncture point selection, treatment planning, and decision-making. Methods: We collected case studies published in Acupuncture in Medicine between June 2022 and May 2023. Both ChatGPT-3.5 and ChatGPT-4 were used to generate suggestions for acupuncture points based on case presentations. A Wilcoxon signed-rank test was conducted to compare the number of acupuncture points generated by ChatGPT-3.5 and ChatGPT-4, and the overlapping ratio of acupuncture points was calculated. Results: Among the 21 case studies, 14 studies were included for analysis. ChatGPT-4 generated significantly more acupuncture points (9.0, SD 1.1) compared to ChatGPT-3.5 (5.6, SD 0.6; P<.001). The overlapping ratios of acupuncture points for ChatGPT-3.5 (0.40, SD 0.28) and ChatGPT-4 (0.34, SD 0.27; P=.67) were not significantly different. Conclusions: ChatGPT may be a useful educational tool for acupuncture students, providing valuable insights into personalized treatment plans. However, it cannot fully replace traditional diagnostic methods, and further studies are needed to ensure its safe and effective implementation in acupuncture education. ", doi="10.2196/47427", url="https://mededu.jmir.org/2023/1/e47427", url="http://www.ncbi.nlm.nih.gov/pubmed/37590034" } @Article{info:doi/10.2196/48978, author="Borchert, J. Robin and Hickman, R. Charlotte and Pepys, Jack and Sadler, J. Timothy", title="Performance of ChatGPT on the Situational Judgement Test---A Professional Dilemmas--Based Examination for Doctors in the United Kingdom", journal="JMIR Med Educ", year="2023", month="Aug", day="7", volume="9", pages="e48978", keywords="ChatGPT", keywords="language models", keywords="Situational Judgement Test", keywords="medical education", keywords="artificial intelligence", keywords="language model", keywords="exam", keywords="examination", keywords="SJT", keywords="judgement", keywords="reasoning", keywords="communication", keywords="chatbot", abstract="Background: ChatGPT is a large language model that has performed well on professional examinations in the fields of medicine, law, and business. However, it is unclear how ChatGPT would perform on an examination assessing professionalism and situational judgement for doctors. Objective: We evaluated the performance of ChatGPT on the Situational Judgement Test (SJT): a national examination taken by all final-year medical students in the United Kingdom. This examination is designed to assess attributes such as communication, teamwork, patient safety, prioritization skills, professionalism, and ethics. 
Methods: All questions from the UK Foundation Programme Office's (UKFPO's) 2023 SJT practice examination were inputted into ChatGPT. For each question, ChatGPT's answers and rationales were recorded and assessed on the basis of the official UK Foundation Programme Office scoring template. Questions were categorized into domains of Good Medical Practice on the basis of the domains referenced in the rationales provided in the scoring sheet. Questions without clear domain links were screened by reviewers and assigned one or multiple domains. ChatGPT's overall performance, as well as its performance across the domains of Good Medical Practice, was evaluated. Results: Overall, ChatGPT performed well, scoring 76\% on the SJT but scoring full marks on only a few questions (9\%), which may reflect possible flaws in ChatGPT's situational judgement or inconsistencies in the reasoning across questions (or both) in the examination itself. ChatGPT demonstrated consistent performance across the 4 outlined domains in Good Medical Practice for doctors. Conclusions: Further research is needed to understand the potential applications of large language models, such as ChatGPT, in medical education for standardizing questions and providing consistent rationales for examinations assessing professionalism and ethics. ", doi="10.2196/48978", url="https://mededu.jmir.org/2023/1/e48978", url="http://www.ncbi.nlm.nih.gov/pubmed/37548997" } @Article{info:doi/10.2196/50336, author="Gilson, Aidan and Safranek, W. Conrad and Huang, Thomas and Socrates, Vimig and Chi, Ling and Taylor, Andrew Richard and Chartash, David", title="Authors' Reply to: Variability in Large Language Models' Responses to Medical Licensing and Certification Examinations", journal="JMIR Med Educ", year="2023", month="Jul", day="13", volume="9", pages="e50336", keywords="natural language processing", keywords="NLP", keywords="MedQA", keywords="generative pre-trained transformer", keywords="GPT", keywords="medical education", keywords="chatbot", keywords="artificial intelligence", keywords="AI", keywords="education technology", keywords="ChatGPT", keywords="conversational agent", keywords="machine learning", keywords="large language models", keywords="knowledge assessment", doi="10.2196/50336", url="https://mededu.jmir.org/2023/1/e50336", url="http://www.ncbi.nlm.nih.gov/pubmed/37440299" } @Article{info:doi/10.2196/48305, author="Epstein, H. Richard and Dexter, Franklin", title="Variability in Large Language Models' Responses to Medical Licensing and Certification Examinations. Comment on ``How Does ChatGPT Perform on the United States Medical Licensing Examination? The Implications of Large Language Models for Medical Education and Knowledge Assessment''", journal="JMIR Med Educ", year="2023", month="Jul", day="13", volume="9", pages="e48305", keywords="natural language processing", keywords="NLP", keywords="MedQA", keywords="generative pre-trained transformer", keywords="GPT", keywords="medical education", keywords="chatbot", keywords="artificial intelligence", keywords="AI", keywords="education technology", keywords="ChatGPT", keywords="Google Bard", keywords="conversational agent", keywords="machine learning", keywords="large language models", keywords="knowledge assessment", doi="10.2196/48305", url="https://mededu.jmir.org/2023/1/e48305", url="http://www.ncbi.nlm.nih.gov/pubmed/37440293" } @Article{info:doi/10.2196/46344, author="Seth, Puneet and Hueppchen, Nancy and Miller, D. 
Steven and Rudzicz, Frank and Ding, Jerry and Parakh, Kapil and Record, D. Janet", title="Data Science as a Core Competency in Undergraduate Medical Education in the Age of Artificial Intelligence in Health Care", journal="JMIR Med Educ", year="2023", month="Jul", day="11", volume="9", pages="e46344", keywords="data science", keywords="medical education", keywords="machine learning", keywords="health data", keywords="artificial intelligence", keywords="AI", keywords="application", keywords="health care delivery", keywords="health care", keywords="develop", keywords="medical educators", keywords="physician", keywords="education", keywords="training", keywords="barriers", keywords="optimize", keywords="integration", keywords="competency", doi="10.2196/46344", url="https://mededu.jmir.org/2023/1/e46344", url="http://www.ncbi.nlm.nih.gov/pubmed/37432728" } @Article{info:doi/10.2196/46939, author="Nov, Oded and Singh, Nina and Mann, Devin", title="Putting ChatGPT's Medical Advice to the (Turing) Test: Survey Study", journal="JMIR Med Educ", year="2023", month="Jul", day="10", volume="9", pages="e46939", keywords="artificial intelligence", keywords="AI", keywords="ChatGPT", keywords="large language model", keywords="patient-provider interaction", keywords="chatbot", keywords="feasibility", keywords="ethics", keywords="privacy", keywords="language model", keywords="machine learning", abstract="Background: Chatbots are being piloted to draft responses to patient questions, but patients' ability to distinguish between provider and chatbot responses and patients' trust in chatbots' functions are not well established. Objective: This study aimed to assess the feasibility of using ChatGPT (Chat Generative Pre-trained Transformer) or a similar artificial intelligence--based chatbot for patient-provider communication. Methods: A survey study was conducted in January 2023. Ten representative, nonadministrative patient-provider interactions were extracted from the electronic health record. Patients' questions were entered into ChatGPT with a request for the chatbot to respond using approximately the same word count as the human provider's response. In the survey, each patient question was followed by a provider- or ChatGPT-generated response. Participants were informed that 5 responses were provider generated and 5 were chatbot generated. Participants were asked---and incentivized financially---to correctly identify the response source. Participants were also asked about their trust in chatbots' functions in patient-provider communication, using a Likert scale from 1-5. Results: A US-representative sample of 430 study participants aged 18 and older were recruited on Prolific, a crowdsourcing platform for academic studies. In all, 426 participants filled out the full survey. After removing participants who spent less than 3 minutes on the survey, 392 respondents remained. Overall, 53.3\% (209/392) of respondents analyzed were women, and the average age was 47.1 (range 18-91) years. The correct classification of responses ranged from 49\% (192/392) to 85.7\% (336/392) for different questions. On average, chatbot responses were identified correctly in 65.5\% (1284/1960) of the cases, and human provider responses were identified correctly in 65.1\% (1276/1960) of the cases. On average, responses toward patients' trust in chatbots' functions were weakly positive (mean Likert score 3.4 out of 5), with lower trust as the health-related complexity of the task in the questions increased. 
Conclusions: ChatGPT responses to patient questions were weakly distinguishable from provider responses. Laypeople appear to trust the use of chatbots to answer lower-risk health questions. It is important to continue studying patient-chatbot interaction as chatbots move from administrative to more clinical roles in health care. ", doi="10.2196/46939", url="https://mededu.jmir.org/2023/1/e46939", url="http://www.ncbi.nlm.nih.gov/pubmed/37428540" } @Article{info:doi/10.2196/48002, author="Takagi, Soshi and Watari, Takashi and Erabi, Ayano and Sakaguchi, Kota", title="Performance of GPT-3.5 and GPT-4 on the Japanese Medical Licensing Examination: Comparison Study", journal="JMIR Med Educ", year="2023", month="Jun", day="29", volume="9", pages="e48002", keywords="ChatGPT", keywords="Chat Generative Pre-trained Transformer", keywords="GPT-4", keywords="Generative Pre-trained Transformer 4", keywords="artificial intelligence", keywords="AI", keywords="medical education", keywords="Japanese Medical Licensing Examination", keywords="medical licensing", keywords="clinical support", keywords="learning model", abstract="Background: The competence of ChatGPT (Chat Generative Pre-Trained Transformer) in non-English languages is not well studied. Objective: This study compared the performances of GPT-3.5 (Generative Pre-trained Transformer) and GPT-4 on the Japanese Medical Licensing Examination (JMLE) to evaluate the reliability of these models for clinical reasoning and medical knowledge in non-English languages. Methods: This study used the default mode of ChatGPT, which is based on GPT-3.5; the GPT-4 model of ChatGPT Plus; and the 117th JMLE in 2023. A total of 254 questions were included in the final analysis, which were categorized into 3 types, namely general, clinical, and clinical sentence questions. Results: The results indicated that GPT-4 outperformed GPT-3.5 in terms of accuracy, particularly for general, clinical, and clinical sentence questions. GPT-4 also performed better on difficult questions and specific disease questions. Furthermore, GPT-4 achieved the passing criteria for the JMLE, indicating its reliability for clinical reasoning and medical knowledge in non-English languages. Conclusions: GPT-4 could become a valuable tool for medical education and clinical support in non--English-speaking regions, such as Japan. 
", doi="10.2196/48002", url="https://mededu.jmir.org/2023/1/e48002", url="http://www.ncbi.nlm.nih.gov/pubmed/37384388" } @Article{info:doi/10.2196/48163, author="Karabacak, Mert and Ozkara, Berksu Burak and Margetis, Konstantinos and Wintermark, Max and Bisdas, Sotirios", title="The Advent of Generative Language Models in Medical Education", journal="JMIR Med Educ", year="2023", month="Jun", day="6", volume="9", pages="e48163", keywords="generative language model", keywords="artificial intelligence", keywords="medical education", keywords="ChatGPT", keywords="academic integrity", keywords="AI-driven feedback", keywords="stimulation", keywords="evaluation", keywords="technology", keywords="learning environment", keywords="medical student", doi="10.2196/48163", url="https://mededu.jmir.org/2023/1/e48163", url="http://www.ncbi.nlm.nih.gov/pubmed/37279048" } @Article{info:doi/10.2196/48291, author="Abd-alrazaq, Alaa and AlSaad, Rawan and Alhuwail, Dari and Ahmed, Arfan and Healy, Mark Padraig and Latifi, Syed and Aziz, Sarah and Damseh, Rafat and Alabed Alrazak, Sadam and Sheikh, Javaid", title="Large Language Models in Medical Education: Opportunities, Challenges, and Future Directions", journal="JMIR Med Educ", year="2023", month="Jun", day="1", volume="9", pages="e48291", keywords="large language models", keywords="artificial intelligence", keywords="medical education", keywords="ChatGPT", keywords="GPT-4", keywords="generative AI", keywords="students", keywords="educators", doi="10.2196/48291", url="https://mededu.jmir.org/2023/1/e48291", url="http://www.ncbi.nlm.nih.gov/pubmed/37261894" } @Article{info:doi/10.2196/47737, author="Giannos, Panagiotis and Delardas, Orestis", title="Performance of ChatGPT on UK Standardized Admission Tests: Insights From the BMAT, TMUA, LNAT, and TSA Examinations", journal="JMIR Med Educ", year="2023", month="Apr", day="26", volume="9", pages="e47737", keywords="standardized admissions tests", keywords="GPT", keywords="ChatGPT", keywords="medical education", keywords="medicine", keywords="law", keywords="natural language processing", keywords="BMAT", keywords="TMUA", keywords="LNAT", keywords="TSA", abstract="Background: Large language models, such as ChatGPT by OpenAI, have demonstrated potential in various applications, including medical education. Previous studies have assessed ChatGPT's performance in university or professional settings. However, the model's potential in the context of standardized admission tests remains unexplored. Objective: This study evaluated ChatGPT's performance on standardized admission tests in the United Kingdom, including the BioMedical Admissions Test (BMAT), Test of Mathematics for University Admission (TMUA), Law National Aptitude Test (LNAT), and Thinking Skills Assessment (TSA), to understand its potential as an innovative tool for education and test preparation. Methods: Recent public resources (2019-2022) were used to compile a data set of 509 questions from the BMAT, TMUA, LNAT, and TSA covering diverse topics in aptitude, scientific knowledge and applications, mathematical thinking and reasoning, critical thinking, problem-solving, reading comprehension, and logical reasoning. This evaluation assessed ChatGPT's performance using the legacy GPT-3.5 model, focusing on multiple-choice questions for consistency. 
The model's performance was analyzed based on question difficulty, the proportion of correct responses when aggregating exams from all years, and a comparison of test scores between papers of the same exam using binomial distribution and paired-sample (2-tailed) t tests. Results: The proportion of correct responses was significantly lower than incorrect ones in BMAT section 2 (P<.001) and TMUA paper 1 (P<.001) and paper 2 (P<.001). No significant differences were observed in BMAT section 1 (P=.2), TSA section 1 (P=.7), or LNAT papers 1 and 2, section A (P=.3). ChatGPT performed better in BMAT section 1 than section 2 (P=.047), with a maximum candidate ranking of 73\% compared to a minimum of 1\%. In the TMUA, it engaged with questions but had limited accuracy and no performance difference between papers (P=.6), with candidate rankings below 10\%. In the LNAT, it demonstrated moderate success, especially in paper 2's questions; however, student performance data were unavailable. TSA performance varied across years with generally moderate results and fluctuating candidate rankings. Similar trends were observed for easy to moderate difficulty questions (BMAT section 1, P=.3; BMAT section 2, P=.04; TMUA paper 1, P<.001; TMUA paper 2, P=.003; TSA section 1, P=.8; and LNAT papers 1 and 2, section A, P>.99) and hard to challenging ones (BMAT section 1, P=.7; BMAT section 2, P<.001; TMUA paper 1, P=.007; TMUA paper 2, P<.001; TSA section 1, P=.3; and LNAT papers 1 and 2, section A, P=.2). Conclusions: ChatGPT shows promise as a supplementary tool for subject areas and test formats that assess aptitude, problem-solving and critical thinking, and reading comprehension. However, its limitations in areas such as scientific and mathematical knowledge and applications highlight the need for continuous development and integration with conventional learning strategies in order to fully harness its potential. ", doi="10.2196/47737", url="https://mededu.jmir.org/2023/1/e47737", url="http://www.ncbi.nlm.nih.gov/pubmed/37099373" } @Article{info:doi/10.2196/46599, author="Thirunavukarasu, James Arun and Hassan, Refaat and Mahmood, Shathar and Sanghera, Rohan and Barzangi, Kara and El Mukashfi, Mohanned and Shah, Sachin", title="Trialling a Large Language Model (ChatGPT) in General Practice With the Applied Knowledge Test: Observational Study Demonstrating Opportunities and Limitations in Primary Care", journal="JMIR Med Educ", year="2023", month="Apr", day="21", volume="9", pages="e46599", keywords="ChatGPT", keywords="large language model", keywords="natural language processing", keywords="decision support techniques", keywords="artificial intelligence", keywords="AI", keywords="deep learning", keywords="primary care", keywords="general practice", keywords="family medicine", keywords="chatbot", abstract="Background: Large language models exhibiting human-level performance in specialized tasks are emerging; examples include Generative Pretrained Transformer 3.5, which underlies the processing of ChatGPT. Rigorous trials are required to understand the capabilities of emerging technology, so that innovation can be directed to benefit patients and practitioners. Objective: Here, we evaluated the strengths and weaknesses of ChatGPT in primary care using the Membership of the Royal College of General Practitioners Applied Knowledge Test (AKT) as a medium. Methods: AKT questions were sourced from a web-based question bank and 2 AKT practice papers. 
In total, 674 unique AKT questions were inputted to ChatGPT, with the model's answers recorded and compared to correct answers provided by the Royal College of General Practitioners. Each question was inputted twice in separate ChatGPT sessions, with answers on repeated trials compared to gauge consistency. Subject difficulty was gauged by referring to examiners' reports from 2018 to 2022. Novel explanations from ChatGPT---defined as information provided that was not inputted within the question or multiple answer choices---were recorded. Performance was analyzed with respect to subject, difficulty, question source, and novel model outputs to explore ChatGPT's strengths and weaknesses. Results: Average overall performance of ChatGPT was 60.17\%, which is below the mean passing mark in the last 2 years (70.42\%). Accuracy differed between sources (P=.04 and .06). ChatGPT's performance varied with subject category (P=.02 and .02), but variation did not correlate with difficulty (Spearman $\rho$=--0.241 and --0.238; P=.19 and .20). The proclivity of ChatGPT to provide novel explanations did not affect accuracy (P>.99 and .23). Conclusions: Large language models are approaching human expert--level performance, although further development is required to match the performance of qualified primary care physicians in the AKT. Validated high-performance models may serve as assistants or autonomous clinical tools to ameliorate the general practice workforce crisis. ", doi="10.2196/46599", url="https://mededu.jmir.org/2023/1/e46599", url="http://www.ncbi.nlm.nih.gov/pubmed/37083633" } @Article{info:doi/10.2196/43110, author="Adams, C. Lisa and Busch, Felix and Truhn, Daniel and Makowski, R. Marcus and Aerts, L. Hugo J. W. and Bressem, K. Keno", title="What Does DALL-E 2 Know About Radiology?", journal="J Med Internet Res", year="2023", month="Mar", day="16", volume="25", pages="e43110", keywords="DALL-E", keywords="creating images from text", keywords="image creation", keywords="image generation", keywords="transformer language model", keywords="machine learning", keywords="generative model", keywords="radiology", keywords="x-ray", keywords="artificial intelligence", keywords="medical imaging", keywords="text-to-image", keywords="diagnostic imaging", doi="10.2196/43110", url="https://www.jmir.org/2023/1/e43110", url="http://www.ncbi.nlm.nih.gov/pubmed/36927634" } @Article{info:doi/10.2196/46876, author="Sabry Abdel-Messih, Mary and Kamel Boulos, N. 
Maged", title="ChatGPT in Clinical Toxicology", journal="JMIR Med Educ", year="2023", month="Mar", day="8", volume="9", pages="e46876", keywords="ChatGPT", keywords="clinical toxicology", keywords="organophosphates", keywords="artificial intelligence", keywords="AI", keywords="medical education", doi="10.2196/46876", url="https://mededu.jmir.org/2023/1/e46876", url="http://www.ncbi.nlm.nih.gov/pubmed/36867743" } @Article{info:doi/10.2196/46885, author="Eysenbach, Gunther", title="The Role of ChatGPT, Generative Language Models, and Artificial Intelligence in Medical Education: A Conversation With ChatGPT and a Call for Papers", journal="JMIR Med Educ", year="2023", month="Mar", day="6", volume="9", pages="e46885", keywords="artificial intelligence", keywords="AI", keywords="ChatGPT", keywords="generative language model", keywords="medical education", keywords="interview", keywords="future of education", doi="10.2196/46885", url="https://mededu.jmir.org/2023/1/e46885", url="http://www.ncbi.nlm.nih.gov/pubmed/36863937" } @Article{info:doi/10.2196/45312, author="Gilson, Aidan and Safranek, W. Conrad and Huang, Thomas and Socrates, Vimig and Chi, Ling and Taylor, Andrew Richard and Chartash, David", title="How Does ChatGPT Perform on the United States Medical Licensing Examination (USMLE)? The Implications of Large Language Models for Medical Education and Knowledge Assessment", journal="JMIR Med Educ", year="2023", month="Feb", day="8", volume="9", pages="e45312", keywords="natural language processing", keywords="NLP", keywords="MedQA", keywords="generative pre-trained transformer", keywords="GPT", keywords="medical education", keywords="chatbot", keywords="artificial intelligence", keywords="education technology", keywords="ChatGPT", keywords="conversational agent", keywords="machine learning", keywords="USMLE", abstract="Background: Chat Generative Pre-trained Transformer (ChatGPT) is a 175-billion-parameter natural language processing model that can generate conversation-style responses to user input. Objective: This study aimed to evaluate the performance of ChatGPT on questions within the scope of the United States Medical Licensing Examination (USMLE) Step 1 and Step 2 exams, as well as to analyze responses for user interpretability. Methods: We used 2 sets of multiple-choice questions to evaluate ChatGPT's performance, each with questions pertaining to Step 1 and Step 2. The first set was derived from AMBOSS, a commonly used question bank for medical students, which also provides statistics on question difficulty and the performance on an exam relative to the user base. The second set was the National Board of Medical Examiners (NBME) free 120 questions. ChatGPT's performance was compared to 2 other large language models, GPT-3 and InstructGPT. The text output of each ChatGPT response was evaluated across 3 qualitative metrics: logical justification of the answer selected, presence of information internal to the question, and presence of information external to the question. Results: Of the 4 data sets, AMBOSS-Step1, AMBOSS-Step2, NBME-Free-Step1, and NBME-Free-Step2, ChatGPT achieved accuracies of 44\% (44/100), 42\% (42/100), 64.4\% (56/87), and 57.8\% (59/102), respectively. ChatGPT outperformed InstructGPT by 8.15\% on average across all data sets, and GPT-3 performed similarly to random chance. The model demonstrated a significant decrease in performance as question difficulty increased (P=.01) within the AMBOSS-Step1 data set. 
We found that logical justification for ChatGPT's answer selection was present in 100\% of outputs of the NBME data sets. Internal information to the question was present in 96.8\% (183/189) of all questions. The presence of information external to the question was 44.5\% and 27\% lower for incorrect answers relative to correct answers on the NBME-Free-Step1 (P<.001) and NBME-Free-Step2 (P=.001) data sets, respectively. Conclusions: ChatGPT marks a significant improvement in natural language processing models on the tasks of medical question answering. By performing at a greater than 60\% threshold on the NBME-Free-Step-1 data set, we show that the model achieves the equivalent of a passing score for a third-year medical student. Additionally, we highlight ChatGPT's capacity to provide logic and informational context across the majority of answers. These facts taken together make a compelling case for the potential applications of ChatGPT as an interactive medical education tool to support learning. ", doi="10.2196/45312", url="https://mededu.jmir.org/2023/1/e45312", url="http://www.ncbi.nlm.nih.gov/pubmed/36753318" }
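Several of the studies indexed above (eg, Roos et al; Giannos and Delardas; Gilson et al) summarize large language model examination performance as a proportion of correctly answered questions judged against a benchmark or passing mark. As a minimal sketch of how such a comparison can be computed, the short Python example below applies an exact binomial test to the NBME-Free-Step1 counts quoted in the final abstract; the 60% null proportion, the use of scipy.stats.binomtest, and all variable names are assumptions made for illustration and do not reproduce any cited study's actual analysis.

# Illustrative sketch only (assumed setup, not any study's code): test whether an
# LLM's exam accuracy exceeds an approximate passing benchmark with an exact
# binomial test, using counts reported in the Gilson et al abstract (doi 10.2196/45312).
from scipy.stats import binomtest

n_correct, n_questions = 56, 87     # NBME-Free-Step1 result reported in the abstract
passing_benchmark = 0.60            # approximate passing threshold cited in the abstract

accuracy = n_correct / n_questions  # about 0.644
result = binomtest(n_correct, n_questions, p=passing_benchmark, alternative="greater")

print(f"accuracy = {accuracy:.3f}")
print(f"one-sided exact binomial P value vs {passing_benchmark:.0%} benchmark = {result.pvalue:.3f}")

An exact test is used here rather than a normal approximation because the per-data-set question counts reported in these abstracts are modest (roughly 87 to 630 items).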