<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id><journal-id journal-id-type="publisher-id">mededu</journal-id><journal-id journal-id-type="index">20</journal-id><journal-title>JMIR Medical Education</journal-title><abbrev-journal-title>JMIR Med Educ</abbrev-journal-title><issn pub-type="epub">2369-3762</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v11i1e69521</article-id><article-id pub-id-type="doi">10.2196/69521</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Chatbots&#x2019; Role in Generating Single Best Answer Questions for Undergraduate Medical Student Assessment: Comparative Analysis</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Abouzeid</surname><given-names>Enjy</given-names></name><degrees>MBChB, MSc, PhD</degrees><xref ref-type="aff" rid="aff1"/><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Wassef</surname><given-names>Rita</given-names></name><degrees>MBBCh, MSc, MD</degrees><xref ref-type="aff" rid="aff1"/><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Jawwad</surname><given-names>Ayesha</given-names></name><degrees>BDS, MPH</degrees><xref ref-type="aff" 
rid="aff1"/><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Harris</surname><given-names>Patricia</given-names></name><degrees>BSc(Hons), PhD</degrees><xref ref-type="aff" rid="aff1"/><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>School of Medicine, University of Ulster</institution><addr-line>Northland Road</addr-line><addr-line>Derry-Londonderry</addr-line><country>United Kingdom</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Bahattab</surname><given-names>Awsan</given-names></name></contrib><contrib contrib-type="editor"><name name-style="western"><surname>Lesselroth</surname><given-names>Blake</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Hin Lai</surname><given-names>U</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Kiyak</surname><given-names>Yavuz Selim</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Enjy Abouzeid, MBChB, MSc, PhD, School of Medicine, University of Ulster, Northland Rd, Derry-Londonderry, BT48 7JL, United Kingdom, 44 7516989748; <email>e.abouzeid@ulster.ac.uk</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>all authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>30</day><month>5</month><year>2025</year></pub-date><volume>11</volume><elocation-id>e69521</elocation-id><history><date date-type="received"><day>02</day><month>12</month><year>2024</year></date><date date-type="rev-recd"><day>22</day><month>04</month><year>2025</year></date><date date-type="accepted"><day>30</day><month>04</month><year>2025</year></date></history><copyright-statement>&#x00A9; 
Enjy Abouzeid, Rita Wassef, Ayesha Jawwad, Patricia Harris. Originally published in JMIR Medical Education (<ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org">https://mededu.jmir.org</ext-link>), 30.5.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org/">https://mededu.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mededu.jmir.org/2025/1/e69521"/><abstract><sec><title>Background</title><p>Programmatic assessment supports flexible learning and individual progression but challenges educators to develop frequent assessments reflecting different competencies. The continuous creation of large volumes of assessment items, in a consistent format and comparatively restricted time, is laborious. The application of technological innovations, including artificial intelligence (AI), has been tried to address this challenge. 
A major concern raised is the validity of the information produced by AI tools, and if not properly verified, it can produce inaccurate and therefore inappropriate assessments.</p></sec><sec><title>Objective</title><p>This study was designed to examine the content validity and consistency of different AI chatbots in creating single best answer (SBA) questions, a refined format of multiple choice questions better suited to assess higher levels of knowledge, for undergraduate medical students.</p></sec><sec sec-type="methods"><title>Methods</title><p>This study followed 3 steps. First, 3 researchers used a unified prompt script to generate 10 SBA questions across 4 chatbot platforms. Second, assessors evaluated the chatbot outputs for consistency by identifying similarities and differences between users and across chatbots. With 3 assessors and 10 learning objectives, the maximum possible score for any individual chatbot was 30. Third, 7 assessors internally moderated the questions using a rating scale developed by the research team to evaluate scientific accuracy and educational quality.</p></sec><sec sec-type="results"><title>Results</title><p>In response to the prompts, all chatbots generated 10 questions each, except Bing, which failed to respond to 1 prompt. ChatGPT-4 exhibited the highest variation in question generation but did not fully satisfy the &#x201C;cover test.&#x201D; Gemini performed well across most evaluation criteria, except for item balance, and relied heavily on the vignette for answers but showed a preference for one answer option. Bing scored low in most evaluation areas but generated appropriately structured lead-in questions. SBA questions from GPT-3.5, Gemini, and ChatGPT-4 had similar Item Content Validity Index and Scale Level Content Validity Index values, while the Krippendorff alpha coefficient was low (0.016). Bing performed poorly in content clarity, overall validity, and item construction accuracy. 
A 2-way ANOVA without replication revealed statistically significant differences among chatbots and domains (<italic>P</italic>&#x003C;.05). However, the Tukey-Kramer HSD (honestly significant difference) post hoc test showed no significant pairwise differences between individual chatbots, as all comparisons had <italic>P</italic> values &#x003E;.05 and overlapping CIs.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>AI chatbots can aid the production of questions aligned with learning objectives, and individual chatbots have their own strengths and weaknesses. Nevertheless, all require expert evaluation to ensure their suitability for use. Using AI to generate SBA prompts us to reconsider Bloom&#x2019;s taxonomy of the cognitive domain, which traditionally positions creation as the highest level of cognition.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>assessment</kwd><kwd>Bing</kwd><kwd>ChatGPT</kwd><kwd>Gemini</kwd><kwd>medical education</kwd><kwd>single best answer</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Across disciplines of education, including medical education, programmatic assessment offers flexible learning modalities that pave the road for individual progression. However, it represents a challenge to educators, as they are required to develop frequent assessments that reflect different competencies, thus necessitating the continuous creation of examination content in a comparatively restricted time [<xref ref-type="bibr" rid="ref1">1</xref>]. For many years, multiple choice questions (MCQs) have been adopted in medical education for assessing knowledge and clinical reasoning skills in high-stakes undergraduate and postgraduate medical exams. MCQs are reliable, objective, standardized, equitable, and efficient formats for testing large volumes of content in a limited time. 
A main problem with MCQs is that producing high-quality questions is time-consuming, from drafting the question that includes a clinical vignette or stem, a lead-in question, a correct answer, and distractors to validation of content and detection of potential flaws [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. To tackle this dilemma, the application of many technological innovations, including artificial intelligence (AI), has been tried [<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>AI refers to machines mimicking the human brain in performing intellectual tasks. This originates from the imitation game developed by the British mathematician Alan Turing, who posed the universally famous question &#x201C;Can machines think?&#x201D; [<xref ref-type="bibr" rid="ref4">4</xref>]. Since then, many AI research laboratories have invested time, effort, and money to answer this question. One particular AI research laboratory known as OpenAI, based in California, United States, has revolutionized our world at the end of 2022 by launching an AI-based large language model (LLM) software (GPT-3.5) that uses natural language processing to engage in human-like conversations and making it freely available for the public [<xref ref-type="bibr" rid="ref5">5</xref>]. Within a few weeks after its release, the OpenAI chatbot, known as ChatGPT, had gained much attention in many fields, including medical education. It became the fastest-growing app of all time with more than 120 million users in just a few months after its launch [<xref ref-type="bibr" rid="ref6">6</xref>]. This led competitors to develop and launch other chatbots. Microsoft launched Bing Chat AI in February 2023, followed by Google releasing Gemini in March 2023 [<xref ref-type="bibr" rid="ref7">7</xref>]. 
A newer, improved version of ChatGPT (ChatGPT Plus), which uses the GPT-4 Turbo language model, has been developed by OpenAI and launched as a paid subscription version by the end of 2023 [<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>In terms of assessment in medical education, ChatGPT has been the most extensively studied chatbot. It was found to be able to quickly and accurately apply known concepts in medicine to novel problems, including reflection prompts and examination questions, and to mimic human writing styles, introducing a potential threat to the validity of traditional forms of medical student assessment, including short answer assessment [<xref ref-type="bibr" rid="ref8">8</xref>]. It even successfully passed the USMLE (United States Medical Licensing Examination) [<xref ref-type="bibr" rid="ref9">9</xref>]. Similarly, ChatGPT-4 was able to achieve a mean of more than 75% in the newly derived undergraduate medical exit examination: UKMLA (United Kingdom Medical Licensing Assessment) [<xref ref-type="bibr" rid="ref10">10</xref>]. Its application has been described across multiple areas of academic assessment, for example, developing innovative assessments, grading submitted work, and providing feedback [<xref ref-type="bibr" rid="ref11">11</xref>]. Nevertheless, concerns persist around the validity of the information provided by all AI tools. Sample [<xref ref-type="bibr" rid="ref12">12</xref>] argued that if the chatbot response is not properly verified, it can be misleading and result in &#x201C;junk science.&#x201D;</p><p>Additionally, the broad availability of LLMs such as ChatGPT, Gemini, and Bing has facilitated extensive comparative studies across various domains. For example, 1 study evaluated these models using case vignettes in physiology and found that ChatGPT-3.5 outperformed Bing and Google Bard (an old version of Gemini), indicating its superior effectiveness in case-based learning [<xref ref-type="bibr" rid="ref13">13</xref>]. 
Another study, using the clinicopathological conferences method, compared the ability of AI chatbots to infer neuropathological diagnoses from clinical summaries. The findings revealed that Google Bard and ChatGPT-3.5 correctly diagnosed 76% of cases, while ChatGPT-4 achieved a higher accuracy rate, correctly identifying 84% of cases [<xref ref-type="bibr" rid="ref14">14</xref>]. Similarly, a comparison of ChatGPT-3.5, Google Bard, and Microsoft Bing in hematology cases highlighted significant performance differences, with ChatGPT achieving the highest accuracy [<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>Recent studies have explored the use of AI in generating MCQs and single best answer (SBA) questions for medical examinations, highlighting its potential applications and limitations. For instance, Zuckerman et al [<xref ref-type="bibr" rid="ref16">16</xref>] examined ChatGPT&#x2019;s role in assessment writing, while K&#x0131;yak et al [<xref ref-type="bibr" rid="ref17">17</xref>] and Mistry et al [<xref ref-type="bibr" rid="ref18">18</xref>] investigated AI-generated MCQs in pharmacotherapy and radiology board exams, respectively.</p><p>Despite these contributions, the ability of AI to generate valid SBA questions, an assessment format that better evaluates higher-order cognitive skills such as data interpretation, problem-solving, and decision-making [<xref ref-type="bibr" rid="ref19">19</xref>], remains an area requiring further exploration. Additionally, a critical consideration is the variation in AI-generated outputs and the potential for examination candidates to predict examination items based on curriculum learning objectives (LOBs). 
Given the significance of these issues, this study aims to examine the content validity and consistency of different chatbots in generating SBAs for undergraduate medical education.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Context</title><p>The Graduate Entry Medical Programme at Ulster University&#x2019;s School of Medicine is a 4-year program. Similar to most UK medical schools, students undergo assessment through a series of SBA papers comprising over 1500 questions across the program. Managing this extensive assessment requirement has prompted the exploration of innovative solutions to support the assessment team.</p><p>To ensure assessment standards, the school has implemented a rigorous quality assurance process. Questions are first created by designated clinical or academic authors who have been trained and provided with a &#x201C;house style&#x201D; to follow. Questions then undergo internal review by other clinical or academic staff before external review by external examiners to ensure they meet rigorous requirements. Post hoc psychometric analysis of question performance is also used to drive evidence-based review and enhancement. This meticulous review process aims to uphold the integrity and effectiveness of assessments used to make high-stakes progression decisions and forms part of a wider suite of quality processes to deliver against the assessment strategy.</p></sec><sec id="s2-2"><title>Study Design</title><p>This exploratory comparative study was conducted between December 2023 and May 2024; we continued to follow the school&#x2019;s established quality assurance process, but the designated first authors of the questions were AI chatbots. 
This includes 3 versions of AI chatbots: ChatGPT which will be referred to as ChatGPT-3.5 in this study, Google Gemini, and Microsoft Bing AI, in addition to the subscription-only version of OpenAI: ChatGPT-4 that provides access to GPT-4 Turbo, which is advertised as a more powerful and faster version of GPT-4. During this study, Google changed the name of its platform from Bard to Gemini. For consistency, this paper will refer to the current name: Gemini. <xref ref-type="fig" rid="figure1">Figure 1</xref> depicts the full study design, which included three main phases: (1) Generation of questions using various AI chatbots, (2) Assessment of the consistency of the chatbot outputs, and (3) Evaluation of the quality of the questions generated.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>The study design. SBA: single best answer.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v11i1e69521_fig01.png"/></fig></sec><sec id="s2-3"><title>Generation of Questions Using Various AI Chatbots</title><p>In phase one, the research team randomly selected year 1 curriculum LOBs (n=10) to create SBA questions for. These objectives were selected using stratified random sampling from the official list of LOBs for second-semester educational units. Three researchers were involved, and each one created a new account for each of the 4 chatbot platforms. All researchers used the same predefined prompts (see below) around the same time (end of December 2023) to request 10 questions from each chatbot, one for each LOB. The 10 prompts were entered one by one in the same conversation with each chatbot. 
All the questions were compiled into a shared Microsoft Excel (Microsoft Corp) spreadsheet for analysis in steps 2 and 3.</p><p>To allow a fair comparison, the same prompt was used in each chatbot, which specified SBA features:</p><list list-type="bullet"><list-item><p>You are a university lecturer in a UK medical school. Generate an MCQ on &#x201C;the learning objective,&#x201D; with the following criteria:</p><list list-type="bullet"><list-item><p>The question is in a clinical vignette format.</p></list-item><list-item><p>The question is designed to assess the knowledge (&#x00B1;clinical judgment) of undergraduate medical students.</p></list-item><list-item><p>The question meets the standard for a medical graduate examination.</p></list-item><list-item><p>Five choices are allowed for each question.</p></list-item><list-item><p>Only 1 correct answer</p></list-item><list-item><p>Tag the correct answer.</p></list-item><list-item><p>Justify the correct answer.</p></list-item></list></list-item></list></sec><sec id="s2-4"><title>Assessment of the Consistency and Quality (Item Flaws) of the Chatbot Outputs</title><p>In the second phase, researchers involved in the previous step assessed each chatbot&#x2019;s output consistency and technical flaws. Consistency was evaluated based on the similarity between the outputs generated across the 3 researchers, including any bias in the correct answer allocation (eg, favoring option &#x201C;A&#x201D; as the correct answer). 
Similarity was evaluated based on specific elements of the output and accordingly classified into one of three categories: (1) exact questions: when the outputs contain the same wording, condition, and lead-in question; (2) similar questions: when the outputs share common elements such as patient characteristics, age, condition, presentation, or lead-in question; (3) different questions: when the outputs do not have any content in common.</p><p>Technical item flaws assessed the overall construct and structure of the questions produced by the chatbots using 7 previously published criteria for determining the quality of SBAs [<xref ref-type="bibr" rid="ref20">20</xref>]. The 7 criteria include judgments on whether the questions: follow the SBA structural format, satisfy the &#x201C;cover test&#x201D; rule where the question should be answerable solely from the vignette or stem and lead-in (with the answers &#x201C;covered&#x201D;), test the application of knowledge rather than recall isolated facts, have item balance (which ensures a balance in information between the stem, lead-in, and options), test 1 idea, are dependent on the vignette to reach the correct answer, and have an appropriate lead-in length. The researchers used a defined scale to evaluate how often or to what extent each criterion was met across the 3 researchers&#x2019; outputs. Each criterion was scored on a scale from 0 to 3 for each of the 10 LOB prompts. In this scale, 0 meant none, 1 meant 1 SBA, 2 meant 2 SBAs, and 3 meant all 3 SBAs, representing the number of questions produced by each chatbot that met the criterion. With 3 assessors and 10 LOBs, the maximum possible score for any individual chatbot was 30.</p></sec><sec id="s2-5"><title>Assessment of the Content Validity and Accuracy of the Questions Generated</title><p>In phase 3, samples of questions generated by the chatbots were distributed to various internal assessors as per our normal quality review process. 
The questions were selected using stratified random sampling to select 1 of the 3 questions generated by each chatbot for each LOB, yielding a total of 39 questions. Alongside this, a content validation evaluation form, developed by the research team, was used to ensure consistent review between assessors, providing assessors with clear expectations and an understanding of the task. The assessors are faculty members with expertise in the curriculum content. Each question was evaluated by 7 assessors.</p><p>Considering published recommendations for content validation [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>], 20 internal assessors were invited, of which 7 consented to participate. The internal assessors critically reviewed the questions based on several criteria to ensure their quality and alignment with educational objectives. This includes content clarity and validity; accuracy of information, answers, and justification; and educational accuracy. Each of these elements was scored on a Likert scale of 1 to 4 (with 1 representing the lowest level of construct and 4 the highest level of the construct; <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s2-6"><title>Statistical Analysis</title><p>Quantitative data was analyzed through scores obtained from the rating scale using IBM SPSS Statistics (version 26; IBM Corp). Subsequently, 2 content validity indexes were computed: the Item Content Validity Index (I-CVI) and the Scale Level Content Validity Index (S-CVI). Percentages and frequencies were calculated for the questions&#x2019; scores to provide further insights into the data. A 2-way ANOVA without replication was conducted to assess differences in chatbot performance across 6 domains. Post hoc comparisons were performed using the Tukey-Kramer HSD (honestly significant difference) test to identify specific group differences. 
The average ratings provided by 7 evaluators were used for each chatbot and each criterion. The Krippendorff alpha [<xref ref-type="bibr" rid="ref23">23</xref>] was used to assess interrater reliability, using the K-Alpha Calculator [<xref ref-type="bibr" rid="ref24">24</xref>]. A coefficient value of 0.8 is considered satisfactory [<xref ref-type="bibr" rid="ref23">23</xref>]. However, the low Krippendorff alpha suggested a need for further refinement of the rating scheme or additional training for raters to improve reliability.</p></sec><sec id="s2-7"><title>Ethical Considerations</title><p>Participants were informed that their responses would be anonymized and that they could withdraw from this study at any point without penalty. Informed consent was obtained from all participants before data collection. Only those who provided explicit consent were included in this study. This study received ethical approval from the Ulster University Centre for Higher Education Research and Practice Ethics Committee and the Learning Enhancement Directorate Ethics Filter Committee (LEDEC; formerly CHERP; LEDEC-24-004). All data were anonymized during the analysis phase to ensure confidentiality and to protect participants&#x2019; identities. Staff members who chose not to participate experienced no disadvantage or impact on their professional standing. No financial or material compensation was offered to participants for their involvement in this research.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Generation of Questions</title><p>In response to the predefined prompts provided to the chatbots, 3 of them (free ChatGPT, ChatGPT Plus, and Gemini) generated 10 questions each, for a total of 30 across the 3 researchers. Bing could not respond to the prompt for LOB9 and thus generated 9 questions, for a total of 27 across the 3 researchers. 
Thus, 117 questions were generated (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>).</p></sec><sec id="s3-2"><title>Assessment of Consistency Within Chatbots and Technical Item Flaws Among the Outputs</title><p>Consistency within chatbots was evaluated based on the similarity of outputs between the 3 researchers and any bias in the allocation of the correct answer option. Bing had the highest degree of similarity between items generated by multiple users (4 exact question matches and 20 similar ones), while ChatGPT-4 had the highest degree of variation (<xref ref-type="table" rid="table1">Table 1</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Similarity between the questions generated by different chatbots.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Gemini (N=30), n (%)</td><td align="left" valign="bottom">Bing (N=27), n (%)</td><td align="left" valign="bottom">ChatGPT-3.5 (N=30), n (%)</td><td align="left" valign="bottom">ChatGPT-4 (N=30), n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Exact questions</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">4 (14.81)</td><td align="left" valign="top">2 (6.67)</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">Similar questions</td><td align="left" valign="top">24 (80)</td><td align="left" valign="top">20 (74.07)</td><td align="left" valign="top">22 (73.33)</td><td align="left" valign="top">22 (73.33)</td></tr><tr><td align="left" valign="top">Different questions</td><td align="left" valign="top">6 (20)</td><td align="left" valign="top">3 (11.11)</td><td align="left" valign="top">6 (20)</td><td align="left" valign="top">8 (26.67)</td></tr></tbody></table></table-wrap><p>The original predefined prompt did not request answer options to be given in any particular order. 
Therefore, for assessing potential bias in the correct answer allocation, 3 scenarios were modeled (<xref ref-type="table" rid="table2">Table 2</xref>):</p><list list-type="bullet"><list-item><p>Any bias or preference in the correct answer allocation based on the raw chatbot output.</p></list-item><list-item><p>Any bias or preference in the correct answer allocation based on the chatbot output when the researchers manually ordered answers into alphabetical order.</p></list-item><list-item><p>Any bias or preference in the correct answer allocation based on a new output, where each chatbot was prompted to produce 30 new SBA questions with answers alphabetically.</p></list-item></list><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Assessment of possible bias or preference in correct answer allocation.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top" colspan="2">Options</td><td align="left" valign="top">Gemini (N=30), n (%)</td><td align="left" valign="top">Bing (N=27), n (%)</td><td align="left" valign="top">ChatGPT-3.5 (N=30), n (%)</td><td align="left" valign="top">ChatGPT-4 (N=30), n (%)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="6">Original chatbot output</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">A</td><td align="left" valign="top">5 (16.67)</td><td align="left" valign="top">6 (22.22)</td><td align="left" valign="top">9 (30)</td><td align="left" valign="top">11 (36.67)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">B</td><td align="left" valign="top">12 (40)</td><td align="left" valign="top">4 (14.81)</td><td align="left" valign="top">10 (33.33)</td><td align="left" valign="top">10 (33.33)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">C</td><td align="left" valign="top">6 (20)</td><td align="left" valign="top">10 (37.04)</td><td align="left" valign="top">7 (23.33)</td><td 
align="left" valign="top">4 (13.33)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">D</td><td align="left" valign="top">5 (16.67)</td><td align="left" valign="top">6 (22.22)</td><td align="left" valign="top">3 (10)</td><td align="left" valign="top">4 (13.33)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">E</td><td align="left" valign="top">2 (6.67)</td><td align="left" valign="top">1 (3.7)</td><td align="left" valign="top">1 (3.33)</td><td align="left" valign="top">1 (3.33)</td></tr><tr><td align="left" valign="top" colspan="6">Manual reordering of chatbot output into alphabetical order</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">A</td><td align="left" valign="top">4 (13.33)</td><td align="left" valign="top">8 (29.63)</td><td align="left" valign="top">8 (26.67)</td><td align="left" valign="top">6 (20)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">B</td><td align="left" valign="top">10 (33.33)</td><td align="left" valign="top">3 (11.11)</td><td align="left" valign="top">3 (10)</td><td align="left" valign="top">7 (23.33)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">C</td><td align="left" valign="top">3 (10)</td><td align="left" valign="top">5 (18.52)</td><td align="left" valign="top">7 (23.33)</td><td align="left" valign="top">5 (16.67)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">D</td><td align="left" valign="top">9 (30)</td><td align="left" valign="top">4 (14.81)</td><td align="left" valign="top">6 (20)</td><td align="left" valign="top">5 (16.67)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">E</td><td align="left" valign="top">4 (13.33)</td><td align="left" valign="top">7 (25.93)</td><td align="left" valign="top">6 (20)</td><td align="left" valign="top">7 (23.33)</td></tr></tbody></table></table-wrap><p>Gemini, ChatGPT-3.5, and ChatGPT-4 occasionally provided answer 
options in alphabetical order when not specifically prompted. Gemini consistently demonstrated a preference for the correct answer to be listed as option B. ChatGPT-3.5 and ChatGPT-4 appeared to favor options A, B, and C. Bing appeared to favor options A and E.</p><p>Regarding the technical item flaws among the outputs, the chatbots performed similarly in terms of following an SBA format (<xref ref-type="fig" rid="figure2">Figure 2A</xref>) and achieving the &#x201C;cover test&#x201D; satisfaction (<xref ref-type="fig" rid="figure2">Figure 2B</xref>), although ChatGPT-4 scored slightly lower on satisfying the cover test. Overall, Gemini performed well across most items, except for item balance. Notably, Gemini stood out by creating questions with a lead-in that relied heavily on the vignette for the answer (<xref ref-type="fig" rid="figure2">Figure 2F</xref>). Bing scored low across most evaluation items but performed well in generating a lead-in question of appropriate length (<xref ref-type="fig" rid="figure2">Figure 2G</xref>). ChatGPT Plus, which required a paid subscription, did not outperform the other chatbots in any item. The evaluation item &#x201C;questions test the application of knowledge rather than recall of isolated facts&#x201D; received the lowest scores across all the chatbots (<xref ref-type="fig" rid="figure2">Figure 2C</xref>), with Gemini achieving the highest score among them.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Technical item flaws among the chatbots: (<bold>A</bold>) single best answer format, (<bold>B</bold>) satisfy the &#x201C;cover test&#x201D; rule, (<bold>C</bold>) test the application of knowledge rather than recall isolated facts, (<bold>D</bold>) questions were balanced, (<bold>E</bold>) lead-in question tests one idea, (<bold>F</bold>) questions depend on the vignette to reach an answer, and (<bold>G</bold>) appropriate lead-in question length. 
The total number of questions generated by Bing was 27.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v11i1e69521_fig02.png"/></fig></sec><sec id="s3-3"><title>Assessment of Content Validity and Accuracy</title><p>Seven internal assessors evaluated item clarity and relevance, deriving the I-CVI for individual SBA items and the S-CVI (following the Universal Agreement method) to assess the overall content validity for questions from each chatbot (<xref ref-type="table" rid="table3">Table 3</xref>). Items with I-CVI&#x003E;0.79 and scales with S-CVI/UA&#x003E;0.8 can be interpreted as acceptable [<xref ref-type="bibr" rid="ref20">20</xref>].</p><p>Assessors also evaluated items for content clarity and 4 elements of accuracy: vignette information, answers, justifications, and educational accuracy, on a scale from 1 to 4 (<xref ref-type="table" rid="table4">Tables 4</xref> and <xref ref-type="table" rid="table5">5</xref>). The Krippendorff alpha coefficient was low, 0.016, with a 95% bootstrap CI of &#x2212;0.066 to 0.116.</p><p>As depicted in <xref ref-type="table" rid="table3">Tables 3</xref> and <xref ref-type="table" rid="table4">4</xref>, SBA questions from 3 chatbots (ChatGPT, Gemini, and ChatGPT Plus) had similar content clarity and S-CVI values. In comparison to the other chatbots, Bing performed worst in content clarity, overall (scale) validity, and all elements of item accuracy. ChatGPT Plus, which required a paid subscription, did not outperform the other chatbots except in the measure of educational accuracy. Further statistical analysis was performed using the 2-way ANOVA without replication, which showed statistically significant differences among chatbots and domains (<italic>P</italic>&#x003C;.05). 
However, the Tukey-Kramer HSD post hoc test revealed no significant pairwise differences between individual chatbots, as all comparisons had <italic>P</italic> values&#x003E;.05 and overlapping CIs. Thus, although the chatbots&#x2019; performance varied overall, specific chatbot differences were not statistically significant.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Item-content validity and scale-content validity across the chatbots.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Item number</td><td align="left" valign="bottom">Gemini</td><td align="left" valign="bottom">Bing</td><td align="left" valign="bottom">ChatGPT-3.5</td><td align="left" valign="bottom">ChatGPT-4</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="5">I-CVI<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>4</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" 
valign="top">1</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>5</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.83</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>6</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.71</td><td align="left" valign="top">0.85</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>7</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.85</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>8</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.85</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>9</td><td align="left" valign="top">0.85</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.85</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>10</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.85</td></tr><tr><td align="left" valign="top">S-CVI/UA<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">0.91</td><td align="left" valign="top">0.83</td><td 
align="left" valign="top">0.9</td><td align="left" valign="top">0.91</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>I-CVI: Item Content Validity Index.</p></fn><fn id="table3fn2"><p><sup>b</sup>Not applicable.</p></fn><fn id="table3fn3"><p><sup>c</sup>S-CVI/UA: Scale Level Content Validity Index.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Average score for content clarity and accuracy of items across the chatbots.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Content clarity<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="bottom">Accuracy of information<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="bottom">Accuracy of answers<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" valign="bottom">Accuracy of justification<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td><td align="left" valign="bottom">Educational accuracy<sup><xref ref-type="table-fn" rid="table4fn5">e</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Gemini</td><td align="left" valign="top">3.68</td><td align="left" valign="top">3.71</td><td align="left" valign="top">3.8</td><td align="left" valign="top">3.91</td><td align="left" valign="top">3.49</td></tr><tr><td align="left" valign="top">Bing</td><td align="left" valign="top">3.41</td><td align="left" valign="top">3.3</td><td align="left" valign="top">3.49</td><td align="left" valign="top">3.47</td><td align="left" valign="top">3.2</td></tr><tr><td align="left" valign="top">ChatGPT-3.5</td><td align="left" valign="top">3.75</td><td align="left" valign="top">3.71</td><td align="left" valign="top">3.84</td><td align="left" valign="top">3.9</td><td align="left" valign="top">3.5</td></tr><tr><td align="left" 
valign="top">ChatGPT-4</td><td align="left" valign="top">3.71</td><td align="left" valign="top">3.66</td><td align="left" valign="top">3.81</td><td align="left" valign="top">3.82</td><td align="left" valign="top">3.56</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>Content clarity refers to the extent to which the question is clearly written, free of ambiguity, and easily understood by the intended audience.</p></fn><fn id="table4fn2"><p><sup>b</sup>Accuracy of information verifies that the facts, concepts, and explanations presented are scientifically and contextually correct.</p></fn><fn id="table4fn3"><p><sup>c</sup>Accuracy of answers ensures that the correct response is indeed accurate, while the distractors remain plausible yet distinguishable.</p></fn><fn id="table4fn4"><p><sup>d</sup>Accuracy of justification evaluates whether the rationale provided for correct and incorrect answers is logically sound, evidence-based, and supports a deeper understanding of the topic.</p></fn><fn id="table4fn5"><p><sup>e</sup>Educational accuracy assesses whether the question is appropriately challenging to the student level, measures higher cognitive levels (such as application or analysis), and adheres to best practices in assessment design.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Two-way ANOVA table.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Source of variation</td><td align="left" valign="bottom">Sum of squares due to the source</td><td align="left" valign="bottom"><italic>df</italic></td><td align="left" valign="bottom">Mean sum of squares due to the source</td><td align="left" valign="bottom"><italic>F</italic> test</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">Average content clarity and accuracy scores</td><td align="left" 
valign="top">0.304357</td><td align="left" valign="top">2</td><td align="left" valign="top">0.152178</td><td align="left" valign="top">24.26587</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Chatbots</td><td align="left" valign="top">17.9744</td><td align="left" valign="top">4</td><td align="left" valign="top">4.493601</td><td align="left" valign="top">716.5349</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Error</td><td align="left" valign="top">0.05017</td><td align="left" valign="top">8</td><td align="left" valign="top">0.006271</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Total</td><td align="left" valign="top">18.32893</td><td align="left" valign="top">14</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Interpretation of Findings</title><p>This study was designed to examine the content validity and consistency of SBA questions generated by different chatbots in the context of undergraduate medical education. The findings revealed that no single chatbot excelled in all studied domains nor demonstrated a universal superiority over other chatbots, but rather showed unique strengths of some chatbots in specific areas and highlighted their notable limitations in other ones. This emphasizes the importance of critically assessing the output of chatbots in a context-sensitive manner. Bing produced items that were least suitable for inclusion in medical student assessment. 
These findings echo previous studies, which also show Bing to generate less valid MCQs in comparison to other chatbots [<xref ref-type="bibr" rid="ref25">25</xref>]. ChatGPT-4 showed the greatest variation in responses across users (suggesting higher protection against examination candidates predicting potential assessment items), and had strong performance in content clarity and accuracy, though it also exhibited some less effective question design practices, such as poorer performance in the &#x201C;cover test&#x201D; rule. These findings align with the results of Doughty et al [<xref ref-type="bibr" rid="ref26">26</xref>], who found that GPT-4&#x2019;s ability to generate effective MCQs was nearly on par with human performance, in which 81.7% of the generated MCQs met all evaluation criteria, suggesting that fewer than 1 in 5 questions would need revision by instructors. However, in cases where ChatGPT-4 failed to meet a quality standard, this was typically the only issue with the question. Gemini performed well across all evaluations, matching ChatGPT Plus&#x2019;s strong index score for content validity, and excelled in creating questions where the lead-in tested 1 item and relied heavily on the vignette for the answer. Although slightly behind both ChatGPT versions in content clarity, Gemini scored the highest in providing accurate justifications for the correct answer.</p><p>This variation across chatbots is consistent with results from studies where chatbots were asked to answer questions. Kumari et al [<xref ref-type="bibr" rid="ref15">15</xref>] found significant differences in solving hematology case vignettes using LLMs. ChatGPT achieved the highest score, followed by Google Gemini and then Microsoft Bing. In line with this, Dhanvijay et al [<xref ref-type="bibr" rid="ref13">13</xref>] reported that ChatGPT-3.5 scored the highest, Bing the lowest, and Bard (Gemini) ranked in the middle when solving case vignettes in physiology. 
When chatbots were tested on their ability to answer SBA questions, ChatGPT-4 and Microsoft Copilot (Bing) outperformed Google Gemini [<xref ref-type="bibr" rid="ref27">27</xref>]. Overall, these results suggest that OpenAI&#x2019;s ChatGPT shows strong potential in the medical education field. However, it is worth noting that none of the models were able to answer all questions correctly, and in our study, all platforms had some flaws when generating SBAs.</p><p>Additionally, this study&#x2019;s results reveal several key insights and revelations concerning SBA questions produced by AI chatbots. First, we observed that chatbots often exhibit a correct answer bias toward particular options. Recent studies have identified that LLMs tend to display positional bias when handling MCQs [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]. Radford et al [<xref ref-type="bibr" rid="ref30">30</xref>] and Li and Gao [<xref ref-type="bibr" rid="ref31">31</xref>] found that this susceptibility to positional bias is pronounced in the GPT-2 family; however, a more recent technical report for GPT-4 suggests AI&#x2019;s performance in MCQ remains susceptible to the position of the correct answer among the choices [<xref ref-type="bibr" rid="ref32">32</xref>], a pattern referred to as &#x201C;anchored bias.&#x201D; To minimize this inherent bias that appears to occur across AI platforms, when using AI to generate MCQ or SBA, we would recommend not stipulating an order for answer options in the prompt.</p><p>Furthermore, assessment literature emphasizes that high-quality SBA questions should assess the higher levels of Bloom&#x2019;s taxonomy to encourage students&#x2019; critical thinking and complex problem-solving [<xref ref-type="bibr" rid="ref33">33</xref>]. 
Our study revealed that chatbots were not always successful in crafting questions that engaged these advanced cognitive levels, and this was an area of relative weakness when evaluating items. Gemini scored highest, followed by ChatGPT Plus, ChatGPT-3.5, and then Bing. Similar findings regarding ChatGPT&#x2019;s limitations were reported by Herrmann-Werner et al [<xref ref-type="bibr" rid="ref34">34</xref>]. Likewise, studies by Klang et al [<xref ref-type="bibr" rid="ref35">35</xref>] and Liu et al [<xref ref-type="bibr" rid="ref36">36</xref>] also emphasized GPT-4&#x2019;s limited ability to integrate knowledge and apply clinical reasoning, highlighting challenges in logical reasoning, which could limit AI&#x2019;s ability to generate questions that test this concept. However, it should be noted that while human-written questions were rated higher in direct comparisons, the score gap was narrow and largely insignificant, suggesting that AI tools still hold potential as educational aids [<xref ref-type="bibr" rid="ref2">2</xref>].</p><p>Our analysis also revealed some technical flaws, variations, and inconsistencies in item construction within all chatbots. These flaws highlight instances of overconfidence and inadequacies in question design, suggesting an inability of the chatbots to evaluate their output&#x2019;s consistency, relevance, and complexity. Flawed MCQs hinder the accurate and meaningful interpretation of test scores and negatively impact student pass rates. Therefore, identifying and addressing technical flaws in MCQs can enhance their quality and reliability [<xref ref-type="bibr" rid="ref37">37</xref>]. Similarly, Klang et al [<xref ref-type="bibr" rid="ref35">35</xref>] reported that approximately 15% of questions generated using detailed prompts required corrections, primarily due to content inaccuracies or methodological shortcomings. 
These revisions often involved addressing a lack of sensitivity in certain topics, such as failing to include specific details such as age, gender, or geographical context in the questions or answers.</p><p>Most of the questions tested recall and comprehension levels, but Gemini included some that assessed the application of knowledge. In contrast, Bing struggled to generate questions on specific topics. These findings can be explained as critical thinking at higher levels involves considering evidence, context, conceptualization, methods, and the criteria required for judgment [<xref ref-type="bibr" rid="ref38">38</xref>]. AI models are trained on large datasets of text, but they may not fully understand the context or underlying concepts behind the content. Higher-order thinking skills, such as application, analysis, and synthesis, require deeper comprehension and reasoning that AI might not be able to simulate effectively.</p><p>Thus, using AI to generate SBAs encourages us to reconsider Bloom&#x2019;s taxonomy of the cognitive domains [<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref40">40</xref>], which traditionally positions &#x201C;creation&#x201D; as the highest level of cognition. In the era of AI, evaluation might be considered the most critical level of cognition [<xref ref-type="bibr" rid="ref41">41</xref>]. While AI chatbots can often produce well-written questions aligned with LOBs, they still require expert evaluation to ensure their suitability for use. Future research should compare AI-generated outputs with those from subject matter experts to assess accuracy and relevance. Evaluating AI&#x2019;s ability to test higher-order cognition in Bloom&#x2019;s taxonomy is also crucial. 
As AI evolves, ongoing validation is essential to ensure reliability and effectiveness in assessments.</p><p>Despite the methodological rigor and innovative approach of this study, some limitations need to be highlighted to improve the interpretation of the findings presented here. First, the researchers or assessors generated or evaluated only 30 questions per chatbot. Variation was observed in the content validity and accuracy between the SBAs produced by an individual chatbot. Therefore, this sample may not sufficiently represent the wide range of possible outputs, potentially limiting the generalizability and robustness of the findings. Second, the accuracy of the chatbots&#x2019; responses may have been compromised by the absence of reference materials, which could have negatively affected their performance. Finally, this study is limited by low interrater reliability and the use of measures that are not specifically designed to assess MCQ quality. Future research should consider using validated tools to enhance evaluation accuracy.</p></sec><sec id="s4-2"><title>Conclusions</title><p>Chatbot platforms varied in their ability to generate educational questions. ChatGPT models produced the most variable outputs, reducing predictability while maintaining strong content clarity and accuracy with minimal answer bias. Gemini performed similarly but showed a strong preference for 1 option, while Bing had the least variation and the lowest content clarity and accuracy. ChatGPT-4 did not significantly improve question quality but maximized variability. Technical flaws were present across all platforms, with many questions poorly linked to vignettes. Most tested recall and comprehension, though Gemini included some application-level items, whereas Bing struggled with specific topics.</p><p>These findings highlight AI&#x2019;s limitations in generating higher-order thinking questions, reinforcing the need for expert evaluation. 
This challenges Bloom&#x2019;s taxonomy&#x2019;s traditional cognitive hierarchy, suggesting that &#x201C;evaluation&#x201D; may be more critical than &#x201C;creation&#x201D; in AI-assisted assessments.</p></sec></sec></body><back><ack><p>The authors extend their gratitude to the internal assessors from the School of Medicine, Ulster University, who dedicated their time to evaluating the questions.</p></ack><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">HSD</term><def><p>honestly significant difference</p></def></def-item><def-item><term id="abb3">I-CVI</term><def><p>Item Content Validity Index</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">LOB</term><def><p>learning objective</p></def></def-item><def-item><term id="abb6">MCQ</term><def><p>multiple choice question</p></def></def-item><def-item><term id="abb7">S-CVI</term><def><p>Scale Level Content Validity Index</p></def></def-item><def-item><term id="abb8">SBA</term><def><p>single best answer</p></def></def-item><def-item><term id="abb9">UKMLA</term><def><p>United Kingdom Medical Licensing Assessment</p></def></def-item><def-item><term id="abb10">USMLE</term><def><p>United States Medical Licensing Examination</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pugh</surname><given-names>D</given-names> </name><name name-style="western"><surname>De Champlain</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gierl</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lai</surname><given-names>H</given-names> </name><name 
name-style="western"><surname>Touchie</surname><given-names>C</given-names> </name></person-group><article-title>Using cognitive models to develop quality multiple-choice questions</article-title><source>Med Teach</source><year>2016</year><month>08</month><volume>38</volume><issue>8</issue><fpage>838</fpage><lpage>843</lpage><pub-id pub-id-type="doi">10.3109/0142159X.2016.1150989</pub-id><pub-id pub-id-type="medline">26998566</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cheung</surname><given-names>BHH</given-names> </name><name name-style="western"><surname>Lau</surname><given-names>GKK</given-names> </name><name name-style="western"><surname>Wong</surname><given-names>GTC</given-names> </name><etal/></person-group><article-title>ChatGPT versus human in generating medical graduate exam multiple choice questions-a multinational prospective study (Hong Kong S.A.R., Singapore, Ireland, and the United Kingdom)</article-title><source>PLoS One</source><year>2023</year><volume>18</volume><issue>8</issue><fpage>e0290691</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0290691</pub-id><pub-id pub-id-type="medline">37643186</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rodriguez-Torrealba</surname><given-names>R</given-names> </name><name name-style="western"><surname>Garcia-Lopez</surname><given-names>E</given-names> </name><name name-style="western"><surname>Garcia-Cabot</surname><given-names>A</given-names> </name></person-group><article-title>End-to-end generation of multiple-choice questions using text-to-text transfer transformer models</article-title><source>Expert Syst Appl</source><year>2022</year><month>12</month><volume>208</volume><fpage>118258</fpage><pub-id 
pub-id-type="doi">10.1016/j.eswa.2022.118258</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Turing</surname><given-names>AM</given-names> </name></person-group><article-title>I.&#x2014;computing machinery and intelligence</article-title><source>Mind</source><year>1950</year><month>10</month><day>1</day><volume>LIX</volume><issue>236</issue><fpage>433</fpage><lpage>460</lpage><pub-id pub-id-type="doi">10.1093/mind/LIX.236.433</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rudolph</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>S</given-names> </name></person-group><article-title>ChatGPT: bullshit spewer or the end of traditional assessments in higher education?</article-title><source>JALT</source><year>2023</year><access-date>2025-05-14</access-date><volume>6</volume><issue>1</issue><fpage>342</fpage><lpage>363</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://journals.sfu.ca/jalt/index.php/jalt/issue/view/31">https://journals.sfu.ca/jalt/index.php/jalt/issue/view/31</ext-link></comment><pub-id pub-id-type="doi">10.37074/jalt.2023.6.1.9</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rudolph</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>S</given-names> </name></person-group><article-title>War of the chatbots: Bard, Bing Chat, ChatGPT, Ernie and beyond. 
The new AI gold rush and its impact on higher education</article-title><source>JALT</source><year>2023</year><access-date>2025-05-14</access-date><volume>6</volume><issue>1</issue><fpage>364</fpage><lpage>389</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://journals.sfu.ca/jalt/index.php/jalt/issue/view/31">https://journals.sfu.ca/jalt/index.php/jalt/issue/view/31</ext-link></comment><pub-id pub-id-type="doi">10.37074/jalt.2023.6.1.23</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Giannakopoulos</surname><given-names>K</given-names> </name><name name-style="western"><surname>Kavadella</surname><given-names>A</given-names> </name><name name-style="western"><surname>Stamatopoulos</surname><given-names>V</given-names> </name><name name-style="western"><surname>Kaklamanos</surname><given-names>EG</given-names> </name><name name-style="western"><surname>Salim</surname><given-names>AA</given-names> </name></person-group><article-title>Evaluation of the performance of generative AI large language models ChatGPT, Google Bard, and Microsoft Bing chat in supporting evidence-based dentistry: comparative mixed methods study</article-title><source>J Med Internet Res</source><year>2023</year><month>12</month><day>28</day><volume>25</volume><fpage>e51580</fpage><pub-id pub-id-type="doi">10.2196/51580</pub-id><pub-id pub-id-type="medline">38009003</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Morjaria</surname><given-names>L</given-names> </name><name name-style="western"><surname>Burns</surname><given-names>L</given-names> </name><name name-style="western"><surname>Bracken</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Examining the threat of ChatGPT to the validity of short 
answer assessments in an undergraduate medical program</article-title><source>J Med Educ Curric Dev</source><year>2023</year><volume>10</volume><fpage>23821205231204178</fpage><pub-id pub-id-type="doi">10.1177/23821205231204178</pub-id><pub-id pub-id-type="medline">37780034</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gilson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Safranek</surname><given-names>CW</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>T</given-names> </name><etal/></person-group><article-title>How does ChatGPT perform on the United States Medical Licensing Examination (USMLE)? The implications of large language models for medical education and knowledge assessment</article-title><source>JMIR Med Educ</source><year>2023</year><month>02</month><day>8</day><volume>9</volume><fpage>e45312</fpage><pub-id pub-id-type="doi">10.2196/45312</pub-id><pub-id pub-id-type="medline">36753318</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lai</surname><given-names>UH</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>KS</given-names> </name><name name-style="western"><surname>Hsu</surname><given-names>TY</given-names> </name><name name-style="western"><surname>Kan</surname><given-names>JKC</given-names> </name></person-group><article-title>Evaluating the performance of ChatGPT-4 on the United Kingdom Medical Licensing Assessment</article-title><source>Front Med (Lausanne)</source><year>2023</year><volume>10</volume><fpage>1240915</fpage><pub-id pub-id-type="doi">10.3389/fmed.2023.1240915</pub-id><pub-id pub-id-type="medline">37795422</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>O&#x2019;Connor</surname><given-names>S</given-names> </name></person-group><article-title>Open artificial intelligence platforms in nursing education: tools for academic progress or abuse?</article-title><source>Nurse Educ Pract</source><year>2023</year><month>01</month><volume>66</volume><fpage>103537</fpage><pub-id pub-id-type="doi">10.1016/j.nepr.2022.103537</pub-id><pub-id pub-id-type="medline">36549229</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Sample</surname><given-names>I</given-names> </name></person-group><article-title>Science journals ban listing of ChatGPT as co-author on papers</article-title><source>The Guardian</source><year>2023</year><access-date>2025-05-14</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.theguardian.com/science/2023/jan/26/science-journals-ban-listing-of-chatgpt-as-co-author-on-papers">https://www.theguardian.com/science/2023/jan/26/science-journals-ban-listing-of-chatgpt-as-co-author-on-papers</ext-link></comment></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dhanvijay</surname><given-names>AKD</given-names> </name><name name-style="western"><surname>Pinjar</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Dhokane</surname><given-names>N</given-names> </name><name name-style="western"><surname>Sorte</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Kumari</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mondal</surname><given-names>H</given-names> </name></person-group><article-title>Performance of large language models (ChatGPT, Bing Search, and Google Bard) 
in solving case vignettes in physiology</article-title><source>Cureus</source><year>2023</year><month>08</month><volume>15</volume><issue>8</issue><fpage>e42972</fpage><pub-id pub-id-type="doi">10.7759/cureus.42972</pub-id><pub-id pub-id-type="medline">37671207</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Koga</surname><given-names>S</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>NB</given-names> </name><name name-style="western"><surname>Dickson</surname><given-names>DW</given-names> </name></person-group><article-title>Evaluating the performance of large language models: ChatGPT and Google Bard in generating differential diagnoses in clinicopathological conferences of neurodegenerative disorders</article-title><source>Brain Pathol</source><year>2024</year><month>05</month><volume>34</volume><issue>3</issue><fpage>e13207</fpage><pub-id pub-id-type="doi">10.1111/bpa.13207</pub-id><pub-id pub-id-type="medline">37553205</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kumari</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kumari</surname><given-names>A</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Large language models in hematology case solving: a comparative study of ChatGPT-3.5, Google Bard, and Microsoft Bing</article-title><source>Cureus</source><year>2023</year><month>08</month><volume>15</volume><issue>8</issue><fpage>e43861</fpage><pub-id pub-id-type="doi">10.7759/cureus.43861</pub-id><pub-id pub-id-type="medline">37736448</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Zuckerman</surname><given-names>M</given-names> </name><name name-style="western"><surname>Flood</surname><given-names>R</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>RJB</given-names> </name><etal/></person-group><article-title>ChatGPT for assessment writing</article-title><source>Med Teach</source><year>2023</year><month>11</month><volume>45</volume><issue>11</issue><fpage>1224</fpage><lpage>1227</lpage><pub-id pub-id-type="doi">10.1080/0142159X.2023.2249239</pub-id><pub-id pub-id-type="medline">37789636</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>K&#x0131;yak</surname><given-names>YS</given-names> </name><name name-style="western"><surname>Co&#x015F;kun</surname><given-names>&#x00D6;</given-names> </name><name name-style="western"><surname>Budako&#x011F;lu</surname><given-names>I&#x0130;</given-names> </name><name name-style="western"><surname>Uluo&#x011F;lu</surname><given-names>C</given-names> </name></person-group><article-title>ChatGPT for generating multiple-choice questions: evidence on the use of artificial intelligence in automatic item generation for a rational pharmacotherapy exam</article-title><source>Eur J Clin Pharmacol</source><year>2024</year><month>05</month><volume>80</volume><issue>5</issue><fpage>729</fpage><lpage>735</lpage><pub-id pub-id-type="doi">10.1007/s00228-024-03649-x</pub-id><pub-id pub-id-type="medline">38353690</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mistry</surname><given-names>NP</given-names> </name><name name-style="western"><surname>Saeed</surname><given-names>H</given-names> </name><name name-style="western"><surname>Rafique</surname><given-names>S</given-names> 
</name><name name-style="western"><surname>Le</surname><given-names>T</given-names> </name><name name-style="western"><surname>Obaid</surname><given-names>H</given-names> </name><name name-style="western"><surname>Adams</surname><given-names>SJ</given-names> </name></person-group><article-title>Large language models as tools to generate radiology board-style multiple-choice questions</article-title><source>Acad Radiol</source><year>2024</year><month>09</month><volume>31</volume><issue>9</issue><fpage>3872</fpage><lpage>3878</lpage><pub-id pub-id-type="doi">10.1016/j.acra.2024.06.046</pub-id><pub-id pub-id-type="medline">39013736</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tan</surname><given-names>LT</given-names> </name><name name-style="western"><surname>McAleer</surname><given-names>JJA</given-names> </name><collab>Final FRCR Examination Board</collab></person-group><article-title>The introduction of single best answer questions as a test of knowledge in the final examination for the fellowship of the Royal College of Radiologists in Clinical Oncology</article-title><source>Clin Oncol (R Coll Radiol)</source><year>2008</year><month>10</month><volume>20</volume><issue>8</issue><fpage>571</fpage><lpage>576</lpage><pub-id pub-id-type="doi">10.1016/j.clon.2008.05.010</pub-id><pub-id pub-id-type="medline">18585017</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Case</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Swanson</surname><given-names>DB</given-names> </name></person-group><article-title>Writing one-best-answer questions for the basic and clinical sciences</article-title><source>Constructing Written Test Questions for the Basic and Clinical 
Sciences</source><year>2016</year><publisher-name>National Board of Medical Examiners</publisher-name><fpage>31</fpage><lpage>66</lpage></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yusoff</surname><given-names>MSB</given-names> </name></person-group><article-title>ABC of content validation and content validity index calculation</article-title><source>EIMJ</source><year>2019</year><access-date>2025-05-14</access-date><volume>11</volume><issue>2</issue><fpage>49</fpage><lpage>54</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://eduimed.usm.my/EIMJvol11no2.html">https://eduimed.usm.my/EIMJvol11no2.html</ext-link></comment><pub-id pub-id-type="doi">10.21315/eimj2019.11.2.6</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cook</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Beckman</surname><given-names>TJ</given-names> </name></person-group><article-title>Current concepts in validity and reliability for psychometric instruments: theory and application</article-title><source>Am J Med</source><year>2006</year><month>02</month><volume>119</volume><issue>2</issue><fpage>166</fpage><pub-id pub-id-type="doi">10.1016/j.amjmed.2005.10.036</pub-id><pub-id pub-id-type="medline">16443422</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Krippendorff</surname><given-names>K</given-names> </name></person-group><source>Content Analysis: An Introduction to Its Methodology</source><year>2019</year><edition>4</edition><publisher-name>SAGE Publications</publisher-name><pub-id pub-id-type="doi">10.4135/9781071878781</pub-id></nlm-citation></ref><ref 
id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Marzi</surname><given-names>G</given-names> </name><name name-style="western"><surname>Balzano</surname><given-names>M</given-names> </name><name name-style="western"><surname>Marchiori</surname><given-names>D</given-names> </name></person-group><article-title>K-Alpha Calculator-Krippendorff&#x2019;s Alpha Calculator: a user-friendly tool for computing Krippendorff&#x2019;s Alpha inter-rater reliability coefficient</article-title><source>MethodsX</source><year>2024</year><month>06</month><volume>12</volume><fpage>102545</fpage><pub-id pub-id-type="doi">10.1016/j.mex.2023.102545</pub-id><pub-id pub-id-type="medline">39669968</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Agarwal</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sharma</surname><given-names>P</given-names> </name><name name-style="western"><surname>Goswami</surname><given-names>A</given-names> </name></person-group><article-title>Analysing the applicability of ChatGPT, Bard, and Bing to generate reasoning-based multiple-choice questions in medical physiology</article-title><source>Cureus</source><year>2023</year><month>06</month><volume>15</volume><issue>6</issue><fpage>e40977</fpage><pub-id pub-id-type="doi">10.7759/cureus.40977</pub-id><pub-id pub-id-type="medline">37519497</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Doughty</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wan</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Bompelli</surname><given-names>A</given-names> 
</name><etal/></person-group><article-title>A comparative study of AI-generated (GPT-4) and human-crafted MCQs in programming education</article-title><access-date>2025-05-14</access-date><conf-name>ACE 2024</conf-name><conf-date>Jan 29 to Feb 2, 2024</conf-date><conf-loc>Sydney, New South Wales, Australia</conf-loc><fpage>114</fpage><lpage>123</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/proceedings/10.1145/3636243">https://dl.acm.org/doi/proceedings/10.1145/3636243</ext-link></comment><pub-id pub-id-type="doi">10.1145/3636243.3636256</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rossettini</surname><given-names>G</given-names> </name><name name-style="western"><surname>Rodeghiero</surname><given-names>L</given-names> </name><name name-style="western"><surname>Corradi</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Comparative accuracy of ChatGPT-4, Microsoft Copilot and Google Gemini in the Italian entrance test for healthcare sciences degrees: a cross-sectional study</article-title><source>BMC Med Educ</source><year>2024</year><month>06</month><day>26</day><volume>24</volume><issue>1</issue><fpage>694</fpage><pub-id pub-id-type="doi">10.1186/s12909-024-05630-9</pub-id><pub-id pub-id-type="medline">38926809</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Pezeshkpour</surname><given-names>P</given-names> </name><name name-style="western"><surname>Hruschka</surname><given-names>E</given-names> </name></person-group><article-title>Positional bias in large language models when handling multiple-choice questions</article-title><source>arXiv</source><access-date>2025-05-14</access-date><comment>Preprint posted online on  Aug 22, 
2023</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2308.11483">https://arxiv.org/abs/2308.11483</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.2308.11483</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zheng</surname><given-names>J</given-names> </name><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>R</given-names> </name></person-group><article-title>Investigating option position biases in large language models</article-title><source>arXiv</source><access-date>2025-05-14</access-date><comment>Preprint posted online on Sep 7, 2023</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2309.03882">https://arxiv.org/abs/2309.03882</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.2309.03882</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Radford</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Child</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Language models are unsupervised multitask learners</article-title><source>OpenAI Blog</source><year>2019</year><access-date>2025-05-17</access-date><volume>1</volume><issue>8</issue><fpage>9</fpage><comment><ext-link ext-link-type="uri" xlink:href="https://api.semanticscholar.org/CorpusID:160025533">https://api.semanticscholar.org/CorpusID:160025533</ext-link></comment></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>Li</surname><given-names>R</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>Y</given-names> </name></person-group><article-title>Anchored answers: unravelling positional bias in GPT-2&#x2019;s multiple-choice questions</article-title><source>arXiv</source><access-date>2025-05-14</access-date><comment>Preprint posted online on  May 6, 2024</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2405.03205">https://arxiv.org/abs/2405.03205</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.2405.03205</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Achiam</surname><given-names>J</given-names> </name><name name-style="western"><surname>Adler</surname><given-names>S</given-names> </name><name name-style="western"><surname>Agarwal</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ahmad</surname><given-names>L</given-names> </name><name name-style="western"><surname>Akkaya</surname><given-names>I</given-names> </name><name name-style="western"><surname>Aleman</surname><given-names>FL</given-names> </name><etal/></person-group><article-title>GPT-4 technical report</article-title><source>arXiv</source><access-date>2025-05-14</access-date><comment>Preprint posted online on  Mar 15, 2023</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2303.08774">https://arxiv.org/abs/2303.08774</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.08774</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Walsh</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Harris</surname><given-names>BHL</given-names> </name><name 
name-style="western"><surname>Smith</surname><given-names>PE</given-names> </name></person-group><article-title>Single best answer question-writing tips for clinicians</article-title><source>Postgrad Med J</source><year>2017</year><month>02</month><day>1</day><volume>93</volume><issue>1096</issue><fpage>76</fpage><lpage>81</lpage><pub-id pub-id-type="doi">10.1136/postgradmedj-2015-133893</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Herrmann-Werner</surname><given-names>A</given-names> </name><name name-style="western"><surname>Festl-Wietek</surname><given-names>T</given-names> </name><name name-style="western"><surname>Holderried</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Assessing ChatGPT&#x2019;s mastery of Bloom&#x2019;s taxonomy using psychosomatic medicine exam questions: mixed-methods study</article-title><source>J Med Internet Res</source><year>2024</year><month>01</month><day>23</day><volume>26</volume><fpage>e52113</fpage><pub-id pub-id-type="doi">10.2196/52113</pub-id><pub-id pub-id-type="medline">38261378</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Klang</surname><given-names>E</given-names> </name><name name-style="western"><surname>Portugez</surname><given-names>P</given-names> </name><name name-style="western"><surname>Gross</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Advantages and pitfalls in utilizing artificial intelligence for crafting medical examinations: a medical education pilot study with GPT-4</article-title><source>BMC Med Educ</source><year>2023</year><month>10</month><day>17</day><volume>23</volume><issue>1</issue><fpage>772</fpage><pub-id pub-id-type="doi">10.1186/s12909-023-04752-w</pub-id><pub-id 
pub-id-type="medline">37848913</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ning</surname><given-names>R</given-names> </name><name name-style="western"><surname>Teng</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name></person-group><article-title>Evaluating the logical reasoning ability of ChatGPT and GPT-4</article-title><source>arXiv</source><access-date>2025-05-14</access-date><comment>Preprint posted online on  Apr 7, 2023</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2304.03439">https://arxiv.org/abs/2304.03439</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.2304.03439</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Khan</surname><given-names>HF</given-names> </name><name name-style="western"><surname>Danish</surname><given-names>KF</given-names> </name><name name-style="western"><surname>Awan</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Anwar</surname><given-names>M</given-names> </name></person-group><article-title>Identification of technical item flaws leads to improvement of the quality of single best multiple choice questions</article-title><source>Pak J Med Sci</source><year>2013</year><month>05</month><volume>29</volume><issue>3</issue><fpage>715</fpage><lpage>718</lpage><pub-id pub-id-type="doi">10.12669/pjms.293.2993</pub-id><pub-id 
pub-id-type="medline">24353614</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dergaa</surname><given-names>I</given-names> </name><name name-style="western"><surname>Chamari</surname><given-names>K</given-names> </name><name name-style="western"><surname>Zmijewski</surname><given-names>P</given-names> </name><name name-style="western"><surname>Ben Saad</surname><given-names>H</given-names> </name></person-group><article-title>From human writing to artificial intelligence generated text: examining the prospects and potential threats of ChatGPT in academic writing</article-title><source>Biol Sport</source><year>2023</year><month>04</month><volume>40</volume><issue>2</issue><fpage>615</fpage><lpage>622</lpage><pub-id pub-id-type="doi">10.5114/biolsport.2023.125623</pub-id><pub-id pub-id-type="medline">37077800</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Krathwohl</surname><given-names>DR</given-names> </name></person-group><article-title>A revision of Bloom&#x2019;s taxonomy: an overview</article-title><source>Theory Pract</source><year>2002</year><month>11</month><day>1</day><volume>41</volume><issue>4</issue><fpage>212</fpage><lpage>218</lpage><pub-id pub-id-type="doi">10.1207/s15430421tip4104_2</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tutkun</surname><given-names>OF</given-names> </name><name name-style="western"><surname>G&#x00FC;zel</surname><given-names>G</given-names> </name><name name-style="western"><surname>K&#x00F6;ro&#x011F;lu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ilhan</surname><given-names>H</given-names> 
</name></person-group><article-title>Bloom&#x2019;s revised taxonomy and critics on it</article-title><source>Online J Couns Educ</source><year>2012</year><volume>1</volume><issue>3</issue><fpage>23</fpage><lpage>30</lpage></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Scheuer-Larsen</surname><given-names>C</given-names> </name><name name-style="western"><surname>Lauridsen</surname><given-names>PS</given-names> </name></person-group><article-title>Bloom&#x2019;s taxonomy in the interaction between artificial intelligence and human learning</article-title><source>Viden.AI</source><access-date>2025-05-14</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://viden.ai/en/blooms-taxonomy-and-ai">https://viden.ai/en/blooms-taxonomy-and-ai</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Further data on the assessment of questions generated.</p><media xlink:href="mededu_v11i1e69521_app1.xlsx" xlink:title="XLSX File, 35 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Questions generated.</p><media xlink:href="mededu_v11i1e69521_app2.xlsx" xlink:title="XLSX File, 85 KB"/></supplementary-material></app-group></back></article>