<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id><journal-id journal-id-type="publisher-id">mededu</journal-id><journal-id journal-id-type="index">20</journal-id><journal-title>JMIR Medical Education</journal-title><abbrev-journal-title>JMIR Med Educ</abbrev-journal-title><issn pub-type="epub">2369-3762</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v12i1e82702</article-id><article-id pub-id-type="doi">10.2196/82702</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Ambiguity Detection in Medical Exams via Large Language Models: Retrospective Cross-Sectional Pilot Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Lombardi</surname><given-names>Romain</given-names></name><degrees>MSc, MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Destere</surname><given-names>Alexandre</given-names></name><degrees>PharmD, PhD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Dellamonica</surname><given-names>Jean</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" 
rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>G&#x00E9;rard</surname><given-names>Alexandre O</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Jozwiak</surname><given-names>Mathieu</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff5">5</xref></contrib></contrib-group><aff id="aff1"><institution>Critical Care Unit, Pasteur 2 University Hospital</institution><addr-line>30 Voie Romaine</addr-line><addr-line>Nice</addr-line><country>France</country></aff><aff id="aff2"><institution>UR2CA, Unit&#x00E9; de Recherche Clinique C&#x00F4;te d'Azur, Universit&#x00E9; C&#x00F4;te d'Azur</institution><addr-line>Nice</addr-line><country>France</country></aff><aff id="aff3"><institution>Department of Clinical Pharmacology and Pharmacovigilance Center, Medical Centre, Universit&#x00E9; C&#x00F4;te d'Azur</institution><addr-line>Nice</addr-line><country>France</country></aff><aff id="aff4"><institution>Maasai Team, Laboratoire J.A. 
Dieudonn&#x00E9;, Universit&#x00E9; C&#x00F4;te d'Azur, Inria, CNRS</institution><addr-line>Nice</addr-line><country>France</country></aff><aff id="aff5"><institution>Critical Care Unit, Archet 1 University Hospital</institution><addr-line>Nice</addr-line><country>France</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Pellegrino</surname><given-names>Raffaele</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Bai</surname><given-names>Enze</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Cracowski</surname><given-names>Jean-Luc</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Haudebourg</surname><given-names>Luc</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Romain Lombardi, MSc, MD, Critical Care Unit, Pasteur 2 University Hospital, 30 Voie Romaine, Nice, 06100, France, 33 0669032616; <email>lombardi.r@chu-nice.fr</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>26</day><month>5</month><year>2026</year></pub-date><volume>12</volume><elocation-id>e82702</elocation-id><history><date date-type="received"><day>20</day><month>08</month><year>2025</year></date><date date-type="rev-recd"><day>12</day><month>01</month><year>2026</year></date><date date-type="accepted"><day>20</day><month>04</month><year>2026</year></date></history><copyright-statement>&#x00A9; Romain Lombardi, Alexandre Destere, Jean Dellamonica, Alexandre O G&#x00E9;rard, Mathieu Jozwiak. Originally published in JMIR Medical Education (<ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org">https://mededu.jmir.org</ext-link>), 26.5.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org/">https://mededu.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mededu.jmir.org/2026/1/e82702"/><abstract><sec><title>Background</title><p>Large language models (LLMs) have emerged as promising tools in medical education due to their ability to understand, generate, and reason with natural language. Their ability to simulate expert reasoning suggests a potential for supporting quality control in assessment design. In this study, the use of LLMs in identifying ambiguous or poorly constructed exam items in critical care academic assessments was evaluated.</p></sec><sec><title>Objective</title><p>The study aimed to develop automated ambiguity and quality scores to objectively assess individual questions and entire exam components.</p></sec><sec sec-type="methods"><title>Methods</title><p>We analyzed 264 questions from academic exams conducted over 3 academic years (2023-2025) at the Medical School of Universit&#x00E9; C&#x00F4;te d&#x2019;Azur. Questions were drawn from 4 docimological formats: progressive clinical cases (PCC), mini-PCC, key feature problems, and isolated question sequences (IQS). 
Each element was submitted to 4 LLMs (ChatGPT, Gemini Pro, Le Chat, and DeepSeek) without prompt engineering. Performance was evaluated using the official correction key. We applied 4 binary diagnostic tags based on model agreement and self-reported ambiguity: ambiguity, low performance, incoherence, and subjective ambiguity. These tags generated a composite ambiguity score and contributed to a weighted quality score for each exam component.</p></sec><sec sec-type="results"><title>Results</title><p>LLMs achieved mean scores in the same range as students, with no significant differences across academic years and significantly higher performance on the mini-PCC and IQS formats (<italic>P</italic>=.049 and <italic>P</italic>=.04, respectively). IQS items had the highest ambiguity scores (54 items received a score of 2 in both 2023 and 2024, and 53 items retained the same score). Tag patterns revealed frequent issues with ambiguity and inconsistency. Quality scores varied across academic years. IQS predominantly showed moderate ambiguity (score 2), with occasional instances of strong signals. There was no significant difference in quality based on author specialty or seniority (<italic>P</italic>=.08 and <italic>P</italic>=.44, respectively).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>In this pilot study, LLMs may offer a preliminary framework to proactively detect ambiguous exam questions and estimate the overall quality of an exam. 
Integrating these tools into the assessment design process could potentially reduce the need for postexam corrections and may help improve fairness and clarity in medical evaluations.</p></sec></abstract><kwd-group><kwd>large language model</kwd><kwd>LLM</kwd><kwd>medical education</kwd><kwd>exams</kwd><kwd>ambiguity detection</kwd><kwd>automated scoring</kwd><kwd>quality assessment</kwd><kwd>docimology</kwd><kwd>artificial intelligence</kwd><kwd>critical care</kwd><kwd>emergency</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Large language models (LLMs) are a class of artificial intelligence (AI) systems based on neural network architectures designed for natural language understanding and generation [<xref ref-type="bibr" rid="ref1">1</xref>]. To function efficiently and deliver robust performance, LLMs are trained on massive amounts of textual data. They are designed to identify and learn recurrent patterns embedded in natural language [<xref ref-type="bibr" rid="ref2">2</xref>]. LLMs have become an integral part of daily applications. Chatbot platforms such as ChatGPT, Google Gemini, DeepSeek, and Mistral are leading examples of the widespread use of natural language interactions [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref7">7</xref>]. LLMs are also extensively applied in machine translation, with systems like DeepL and Google Translate providing accurate and context-aware translations [<xref ref-type="bibr" rid="ref8">8</xref>]. 
Furthermore, these models play a growing role in education by generating automated exercises (eg, Socratic) and in software development by providing intelligent code assistance (eg, GitHub Copilot) [<xref ref-type="bibr" rid="ref9">9</xref>].</p><p>LLMs have already been studied in a wide range of medical applications across various disciplines, and they appear to have a promising future [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Numerous studies in the fields of intensive care and perioperative medicine have investigated the various applications of LLMs. For example, Chung et al [<xref ref-type="bibr" rid="ref10">10</xref>] demonstrated how these models can be used for perioperative risk stratification. These models can also be used to generate databases from the daily unstructured medical data in the medical sheets of critically ill patients [<xref ref-type="bibr" rid="ref11">11</xref>]. LLMs can also serve as valuable tools for triaging patients and prioritizing those requiring emergency department care based on severity [<xref ref-type="bibr" rid="ref12">12</xref>]. In more unconventional applications, their use has been explored for generating recommendations on the most relevant clinical topics. However, the results remain mixed and require expert oversight [<xref ref-type="bibr" rid="ref13">13</xref>]. Their increasing use in clinical reasoning necessitates both evaluation and rigorous testing to ensure their validity.</p><p>LLMs are increasingly being explored as important tools in the fields of pedagogy and medical education [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. These models can be used to train anesthesia residents by generating relevant information and synthesizing content from multiple sources [<xref ref-type="bibr" rid="ref16">16</xref>]. 
They can assist with clinical diagnosis, which is particularly valuable for students in the early stages of their medical education [<xref ref-type="bibr" rid="ref17">17</xref>]. One of the major limitations of clinical examination simulation is the limited availability of standardized patients. ChatGPT may offer a potential solution by serving as a substitute for human actors [<xref ref-type="bibr" rid="ref18">18</xref>]. AI could enhance not only medical students&#x2019; learning but also the evaluation and quality assurance of medical assessments. Maitland et al [<xref ref-type="bibr" rid="ref19">19</xref>] attempted to have LLMs sit the Membership of the Royal Colleges of Physicians written examinations in the United Kingdom. The results showed that LLMs outperformed human candidates. The fine-tuned models also enabled the identification of 8 types of errors, such as factual errors, context errors, and omission errors. This offers a potential way to improve the quality of exam questions. In a preliminary study, G&#x00E9;rard et al [<xref ref-type="bibr" rid="ref20">20</xref>] evaluated local and European pharmacology exams, highlighting how LLMs can help in identifying ambiguous questions.</p><p>The primary objective of this study was to develop a methodology for detecting ambiguous or low-quality questions in academic exams using various LLMs.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Objectives</title><p>We used LLMs as expert reasoning agents to simulate medical students&#x2019; behavior and identify exam questions that might be poorly constructed or ambiguous. 
The secondary objectives were to develop an ambiguity scoring system and a quality score at the case level (ie, aggregating multiple questions), and to analyze quality variations according to academic year, assessment format, and the author&#x2019;s academic title.</p></sec><sec id="s2-2"><title>Data Sources and Study Population</title><p>We analyzed academic exams taken by medical students at the Universit&#x00E9; C&#x00F4;te d&#x2019;Azur. Only questions from the &#x201C;Emergency and Intensive Care&#x201D; course unit were included in this study. The questions were written either by an intensivist or an emergency physician, and the author was either a clinical fellow, an associate professor, or a full professor. A total of 3 academic exam cohorts were analyzed: 2023, 2024, and 2025. An exam was composed of several docimological components, and each component consisted of multiple questions, such as multiple-choice questions (MCQs), short-answer questions (SAQs), extended matching questions, or single-best-answer questions. The docimological components consist of four types: (1) progressive clinical cases (PCC; extended clinical scenarios with sequenced MCQs), (2) key feature problem (KFP; short critical decision-making vignettes), (3) mini-PCC (mPCC; condensed clinical cases), and (4) isolated question sequence (IQS; standalone items [MCQs or SAQs]) [<xref ref-type="bibr" rid="ref21">21</xref>].</p><p>The year 2025 corresponded to the evaluation of master&#x2019;s level 2 (M2) students, while the years 2024 and 2023 corresponded to evaluations of both master&#x2019;s level 1 (M1) and M2 students. Additionally, the 2023 cohort also included evaluations of master&#x2019;s level 3 (M3) students. Each docimological component varied across the academic years.</p><p>Each exam consisted of between 3 and 8 different docimological components. A total of 264 questions were analyzed. All the exam scores were normalized to a 20-point scale. 
Partial credit was awarded based on the number of discrepancies between the participant&#x2019;s response and the official answer key, specifically false positives (incorrect options selected) or false negatives (correct options omitted). Participants received a score of 0.5 for 1 discrepancy, 0.2 for 2 discrepancies, and no points if more than 2 discrepancies were identified. A minimum score of 10 out of 20 was required to pass the exam.</p><p>The official correction and scoring rubric, the global performance scores of students per question and per docimological component, the responses and performance scores from the LLMs, and manual annotations from a human investigator were available for all questions.</p><p>To ensure consistency and minimize bias, all questions were carefully standardized in their format and presentation. No alterations or prompt engineering were applied to either the questions or answer options, allowing for an impartial assessment of the LLMs&#x2019; inherent capabilities. The exam questions, extracted from PDF files, were submitted to each LLM, which was asked to choose the appropriate answer relying solely on its prior knowledge and training.</p></sec><sec id="s2-3"><title>LLMs Evaluation Procedure and Ambiguity Detection</title><p>We analyzed the results of the 4 most prominent LLMs: ChatGPT-4, Gemini 2.5 Pro, Mistral Le Chat, and DeepSeek R1. Each exam question was submitted to the LLMs in two formats: (1) full-context input, simulating access to the complete clinical case; and (2) sequential input, where questions were revealed one by one, mimicking an exam scenario. We deliberately adopted a zero-shot prompting strategy without task-specific parameter tuning or extensive prompt engineering. 
The rationale was to simulate a real-world usage scenario where educators or examiners interact with LLMs as &#x201C;out-of-the-box&#x201D; tools without requiring specialized expertise in AI engineering, and to assess how the intrinsic phrasing of each test item naturally triggers consistency or divergence across different model architectures, minimizing the risk of over-tuning the models.</p><p>LLM responses were graded manually by the investigator according to the official correction, using the same scale applied to student answers. We also asked each LLM to identify any ambiguity in the question based on its own defined criteria. From these scores, three metrics were computed per question: the mean score of the LLMs, the SD of the LLMs&#x2019; scores, and the number of LLMs that failed to answer correctly. Of note, no significant differences were observed between the 2 input formats.</p><p>We implemented a tagging system to assess the quality and clarity of each question. This system assigned binary diagnostic tags based on LLM performance patterns. These tags were designed to identify specific aspects of flawed question design and were applied automatically to each question to minimize the impact of subjective human bias. The tags used are as follows:</p><list list-type="bullet"><list-item><p>Ambiguity tag: assigned when multiple LLMs provide incorrect answers with a diversity of distinct wrong responses, indicating disagreement in correctness, reasoning, or interpretation of the question. The tag was set to 1 if at least 2 LLMs answered incorrectly with at least 2 different incorrect answers among them.</p></list-item><list-item><p>Low performance tag: this tag indicates that the question is challenging or potentially flawed due to universally poor performance by the LLMs. The tag is assigned if the mean normalized score of all LLMs is below 0.5, combined with low variance among their scores (SD&#x003C;0.3). 
The low performance tag aims to identify items that are potentially &#x201C;unanswerable&#x201D; or structurally flawed. While difficulty and ambiguity are distinct, a universal failure of multiple expert-level LLMs with low variance often points toward a lack of clarity or a misleading structure rather than intended pedagogical selectivity.</p></list-item><list-item><p>Incoherence tag: captured inconsistency in LLMs&#x2019; understanding of the question. The tag was assigned when the measure of dispersion in LLM scores exceeded 0.3 (SD&#x003E;0.3).</p></list-item><list-item><p>Subjective ambiguity detection tag: assigned when the LLMs themselves identify ambiguity in the question based on evaluation criteria that they have autonomously established. The tag was set to 1 if at least 2 LLMs reported the presence of ambiguity according to their self-defined standards.</p></list-item></list><p><xref ref-type="table" rid="table1">Table 1</xref> summarizes the different tags and their assignment criteria.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Summary of definitions and criteria for tag attribution.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Tag names</td><td align="left" valign="bottom">Definition</td><td align="left" valign="bottom">Operational criteria</td><td align="left" valign="bottom">Rationale</td></tr></thead><tbody><tr><td align="left" valign="top">Ambiguity tag</td><td align="left" valign="top">Multiple LLMs<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> provide incorrect answers with a diversity of wrong responses.</td><td align="left" valign="top">Tag=1 if at least 2 LLMs answer incorrectly with at least 2 different incorrect answers among them</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Reduces false positives: multiple LLMs converging on the same wrong answer reflects a shared misconception rather than true 
ambiguity.</p></list-item><list-item><p>The diversity of wrong answers suggests multiple plausible interpretations or poorly defined answer keys.</p></list-item></list></td></tr><tr><td align="left" valign="top">Low performance tag</td><td align="left" valign="top">The question is challenging or potentially flawed due to universally poor performance by the LLMs</td><td align="left" valign="top">Tag=1 if the mean normalized score of all LLMs is below 0.5 and SD&#x003C;0.3</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Combined criterion distinguished truly difficult but valid questions from ambiguous or flawed questions (displaying greater variability or discrepancy between LLMs and student results).</p></list-item></list></td></tr><tr><td align="left" valign="top">Incoherence tag</td><td align="left" valign="top">Captures inconsistency in LLMs&#x2019; understanding of the question</td><td align="left" valign="top">Tag=1 if SD&#x003E;0.3</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Prevents outliers from skewing variability measures and ensures that incoherence reflects genuine divergence in interpretation rather than isolated anomalies.</p></list-item></list></td></tr><tr><td align="left" valign="top">Subjective ambiguity detection tag</td><td align="left" valign="top">Assigned when LLMs identify ambiguity based on their own evaluation criteria</td><td align="left" valign="top">Tag=1 if at least 2 LLMs report the presence of ambiguity</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>This approach leverages the LLMs&#x2019; capacity for self-assessment. 
Requiring agreement from multiple models helps ensure that the detected ambiguity is not idiosyncratic to a single LLM.</p></list-item></list></td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>LLM: large language model.</p></fn></table-wrap-foot></table-wrap><p>Each question was automatically tagged with the 4 binary flags described earlier. The sum of these tags formed a composite ambiguity score ranging from 0 to 3 (ambiguity score = ambiguity tag + low performance tag + incoherence tag + subjective ambiguity detection tag).</p><p>A score of 0 indicated &#x201C;No issues detected,&#x201D; a score of 1 indicated &#x201C;Minor concern,&#x201D; a score of 2 indicated &#x201C;Moderate ambiguity,&#x201D; and a score of 3 indicated &#x201C;Strong signal of item flaw or misleading structure.&#x201D; The low performance and incoherence tags are mutually exclusive, as they depend on opposing SD thresholds, so the maximum attainable score is 3.</p><p>The thresholds for flagging potentially problematic questions were systematically determined using a combination of visual inspection and data-driven analysis, specifically elbow-point detection, to ensure that tag assignments were objectively grounded in the performance distribution. The elbow-point detection technique identifies the inflection point in the sorted distribution of each metric, representing the transition from typical to anomalous values.</p></sec><sec id="s2-4"><title>Quality Assessment for Docimological Component</title><p>To calculate the quality score for each docimological component, we first determined the average score obtained by the LLMs for each question. 
We then applied a weighting based on the ambiguity score of each question according to the following formula:</p><disp-formula id="E1"><label>(1)</label><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mi>Q</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mtext>=</mml:mtext><mml:mn>20</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <italic>Q<sub>i</sub></italic> is the quality score for question <italic>i</italic> (on a 20-point scale), and <italic>S<sub>i</sub></italic>, ranging from 0 to 1, is the average score of LLMs for question <italic>i</italic>.</p><disp-formula id="E2"><label>(2)</label><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mstyle displaystyle="true" 
scriptlevel="0"><mml:mi>S</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:msub><mml:mi>e</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>m</mml:mi><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mtext>=</mml:mtext><mml:mfrac><mml:mrow><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mtext>=</mml:mtext><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover><mml:msub><mml:mi>Q</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x00D7;</mml:mo><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mtext>=</mml:mtext><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mstyle></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where N is the total number of questions, <italic>Q<sub>i</sub></italic> is the quality score for question <italic>i</italic> (on a 20-point scale), and <italic>w<sub>i</sub></italic> is the weight assigned to question <italic>i</italic> with:</p><disp-formula id="E3"><label>(3)</label><mml:math id="eqn3"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mtext>=</mml:mtext><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mn>1</mml:mn><mml:mtext>+</mml:mtext><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <italic>w<sub>i</sub></italic> is the weight assigned to question <italic>i</italic>, and <italic>A<sub>i</sub></italic> is the ambiguity 
score for the question <italic>i</italic>.</p><p>Using this system, we were able to assign a qualitative description to each docimological component based on the score obtained, as follows:</p><list list-type="bullet"><list-item><p>If score &#x2208; [18-20]: excellent, clear, precise, and unambiguous writing.</p></list-item><list-item><p>If score &#x2208; [15-18]: very good, well-structured writing with minor ambiguities.</p></list-item><list-item><p>If score &#x2208; [12-15]: moderate, understandable writing, but with recurring ambiguities.</p></list-item><list-item><p>If score &#x2208; [10-12]: poor, confused writing with repeated ambiguities.</p></list-item><list-item><p>If score &#x2208; [0-10]: insufficient, vague, and ambiguous writing that impacts reliability.</p></list-item></list><p>Components with a quality score below 15 were re-evaluated by the authors to clarify the identified ambiguities.</p></sec><sec id="s2-5"><title>Statistical Analysis</title><p>The results obtained by the students were expressed in terms of median (IQR), mean (SD), and minimum-maximum scores, whereas the LLMs&#x2019; results were presented as mean only. We used the Mann-Whitney <italic>U</italic> test to compare the performance of LLMs and students, with the results expressed in terms of <italic>P</italic> values. We compared the quality score results according to the writer&#x2019;s medical specialty, professional career level (clinical fellow, associate professor, or full professor), and the type of docimological component. A Student 2-tailed <italic>t</italic> test was used when the distribution was normal, as assessed by the Shapiro-Wilk test; otherwise, a Mann-Whitney <italic>U</italic> test was applied. We considered a <italic>P</italic> value less than .05 to be statistically significant.</p></sec><sec id="s2-6"><title>Ethical Considerations</title><p>All materials analyzed were anonymized. No individual student responses were accessed. 
LLMs were used only to simulate reasoning without personalization. According to French law (Articles L1121-1 et seq. of the French Public Health Code) [<xref ref-type="bibr" rid="ref22">22</xref>], this study did not qualify as biomedical research involving human participants. Since it involved neither interventions on individuals nor the processing of identifiable personal data, it was exempt from ethics committee review. Consequently, under French regulations [<xref ref-type="bibr" rid="ref23">23</xref>], this type of research did not require approval from an institutional ethics board.</p></sec><sec id="s2-7"><title>Development Environment</title><p>All the analyses and development were performed in Python (Python Software Foundation) version 3.13.3, along with the following libraries and their respective versions: <italic>pandas</italic> 2.2.3, <italic>NumPy</italic> 2.2.6, <italic>seaborn</italic> 0.13.2, <italic>matplotlib</italic> 3.10.3, <italic>scikit-learn</italic> 1.6.1, <italic>statsmodels</italic> 0.14.4, <italic>SciPy</italic> 1.15.3, and <italic>UpSetPlot</italic> 0.9.0.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Performance Comparison</title><p>A total of 264 exam questions were analyzed and submitted to the 4 LLM models (ChatGPT, Gemini Pro, Le Chat, and DeepSeek). Students achieved relatively consistent results over the years, with average scores ranging from 11.59 (SD 2.18) to 14.0 (SD 1.80) (<xref ref-type="table" rid="table2">Table 2</xref>). Performance was higher on KFPs for students. The results obtained by LLMs across academic years were heterogeneous. Superior performance was achieved by ChatGPT and Gemini Pro. No significant difference was observed between the LLMs&#x2019; results and the students&#x2019; results when comparing across academic years. 
However, when analyzing performance by docimological component, the LLMs outperformed students in the mPCC and IQS formats (<italic>P</italic>=.049 and <italic>P</italic>=.04; respectively).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Performance comparison between large language models (LLMs) and students&#x2019; scores. For the LLMs, only the mean score is reported.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" rowspan="2"/><td align="left" valign="bottom" colspan="3">Students</td><td align="left" valign="bottom" colspan="4">LLM model</td><td align="left" valign="bottom" rowspan="2"><italic>P</italic> value</td></tr><tr><td align="left" valign="bottom">Median (IQR)</td><td align="left" valign="bottom">Mean (SD)</td><td align="left" valign="bottom">Min-max</td><td align="left" valign="bottom">ChatGPT</td><td align="left" valign="bottom">Gemini Pro</td><td align="left" valign="bottom">Le Chat</td><td align="left" valign="bottom">DeepSeek</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="9">Academic level</td></tr><tr><td align="left" valign="top" colspan="9"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2023</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M1<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">11.85 (10.09-13.15)</td><td align="left" valign="top">11.59 (2.18)</td><td align="left" valign="top">5.35-16.55</td><td align="left" valign="top">15.28</td><td align="left" valign="top">18.44</td><td align="left" valign="top">12.28</td><td align="left" valign="top">11.8</td><td align="left" valign="top">.05</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M2<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">12.80 (11.68-13.71)</td><td align="left" valign="top">12.64 (1.48)</td><td align="left" valign="top">7.39-18.42</td><td align="left" valign="top">14.11</td><td align="left" valign="top">16.77</td><td align="left" valign="top">12.32</td><td align="left" valign="top">12.27</td><td align="left" valign="top">.39</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M3<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">12.53 (11.62-14.03)</td><td align="left" valign="top">11.77 (4.19)</td><td align="left" valign="top">0-20</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="top">.40</td></tr><tr><td align="left" valign="top" colspan="9"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2024</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M1</td><td align="left" valign="top">14.14 (12.87-15.25)</td><td align="left" valign="top">14.0 (1.80)</td><td align="left" valign="top">7.54-15.82</td><td align="left" valign="top">15.91</td><td 
align="left" valign="top">14.36</td><td align="left" valign="top">15.75</td><td align="left" valign="top">14.30</td><td align="left" valign="top">.17</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M2</td><td align="left" valign="top">13.07 (11.96-14.15)</td><td align="left" valign="top">13.08 (1.63)</td><td align="left" valign="top">8.40-17.36</td><td align="left" valign="top">12.88</td><td align="left" valign="top">16.30</td><td align="left" valign="top">13.21</td><td align="left" valign="top">12.52</td><td align="left" valign="top">.61</td></tr><tr><td align="left" valign="top" colspan="9"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2025</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M2</td><td align="left" valign="top">11.68 (10.61-12.74)</td><td align="left" valign="top">11.67 (1.56)</td><td align="left" valign="top">7.35-16.46</td><td align="left" valign="top">11.32</td><td align="left" valign="top">12.38</td><td align="left" valign="top">11.79</td><td align="left" valign="top">11.96</td><td align="left" valign="top">.74</td></tr><tr><td align="left" valign="top" colspan="9">Docimological component</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>PCC<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="top">12.67 (10.18-14.86)</td><td align="left" valign="top">12.07 (4.49)</td><td align="left" valign="top">0.0-20</td><td align="left" valign="top">13.32</td><td align="left" valign="top">16.40</td><td align="left" valign="top">13.10</td><td align="left" valign="top">11.81</td><td 
align="left" valign="top">.16</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>mPCC<sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup></td><td align="left" valign="top">11.85 (8.35-15.0)</td><td align="left" valign="top">11.35 (5.26)</td><td align="left" valign="top">0.0-20</td><td align="left" valign="top">13.16</td><td align="left" valign="top">15.14</td><td align="left" valign="top">13.00</td><td align="left" valign="top">12.78</td><td align="left" valign="top">.049</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>KFP<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></td><td align="left" valign="top">13.33 (8.0-15.33)</td><td align="left" valign="top">11.91 (5.71)</td><td align="left" valign="top">0.0-20</td><td align="left" valign="top">14.95</td><td align="left" valign="top">14.19</td><td align="left" valign="top">14.95</td><td align="left" valign="top">14.19</td><td align="left" valign="top">.16</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>IQS<sup><xref ref-type="table-fn" rid="table2fn8">h</xref></sup></td><td align="left" valign="top">10.14 (9.3-13.40)</td><td align="left" valign="top">10.63 (4.15)</td><td align="left" valign="top">0.0-18.4</td><td align="left" valign="top">14.14</td><td align="left" valign="top">16.22</td><td align="left" valign="top">12.71</td><td align="left" valign="top">12.49</td><td align="left" valign="top">.04</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>M1: master&#x2019;s level 1.</p></fn><fn id="table2fn2"><p><sup>b</sup>M2: master&#x2019;s level 2.</p></fn><fn id="table2fn3"><p><sup>c</sup>M3: master&#x2019;s level 3.</p></fn><fn id="table2fn4"><p><sup>d</sup>The exam cases provided to M2 and M3 students during faculty examinations are 
strictly identical.</p></fn><fn id="table2fn5"><p><sup>e</sup>PCC: progressive clinical cases.</p></fn><fn id="table2fn6"><p><sup>f</sup>mPCC: mini&#x2013;progressive clinical cases.</p></fn><fn id="table2fn7"><p><sup>g</sup>KFP: key feature problem.</p></fn><fn id="table2fn8"><p><sup>h</sup>IQS: isolated question sequence.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Ambiguity Score and Tag Repartition</title><p><xref ref-type="fig" rid="figure1">Figure 1</xref> shows the distribution of ambiguity scores by item according to academic year and docimological component. An ambiguity score of 2 was the most frequently observed, regardless of academic year or type of docimological component. Specifically, 54 items received a score of 2 in both 2023 and 2024, while 53 items had the same score in the IQS category, and 37 and 34 items in the PCC and mPCC categories, respectively. The 2024 academic year had the highest number of ambiguity scores at 0, and there was no score greater than 2 in 2023 and 2025. IQSs exhibited the highest number of scores above 2, with a total of 5 items scoring 3, but also the highest number of scores at 0, with 36 questions in total.</p><p>For the academic year 2023, the low-performance and ambiguity tags were over-represented compared to the tags inconsistency and subjective ambiguity detection, regardless of the docimological component studied (<xref ref-type="fig" rid="figure2">Figure 2</xref>). The total ambiguity score was higher for IQS and PCC, with scores of 35 and 32, respectively. An increase in the number of subjective ambiguity detection tags was observed in 2024, while the distribution of the other tags remained unchanged. During that year, the total ambiguity score for mPCC increased to 27, compared to 10 in 2023. No PCC case was included that year. KFPs had the lowest total ambiguity score during the same period. In 2025, the various tags appeared less frequently than in previous years. 
The subjective ambiguity detection tag was not used, and the maximum total ambiguity score for the PCC cases was 18.</p><p>Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> shows the most common tag combinations. The most frequent tag combinations were ambiguity and incoherence (78 items, 29.5%), low performance and ambiguity (52 items, 19.7%), and incoherence, subjective ambiguity detection, and ambiguity (5 items, 1.9%). The ambiguity tag alone appeared in 36 items, corresponding to 13.6% of cases, and the low performance tag was never found alone.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Distribution of ambiguity scores by item according to (A) academic year and (B) docimological component. No scores greater than 3 are shown, as the tagging criteria make this situation impossible. IQS: isolated question sequence<italic>;</italic> KFP: key feature problem<italic>;</italic> mPCC: mini&#x2013;progressive clinical cases; PCC: progressive clinical cases.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v12i1e82702_fig01.png"/></fig><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Tag repartition by docimological component according to (A) academic year 2023, (B) academic year 2024, and (C) academic year 2025. To increase the visibility of the differences between the distributions of the different tags, the axes of the radar plots were transformed using log1p. The total ambiguity score results from the sum of all the ambiguity scores. 
IQS: isolated question sequence<italic>;</italic> KFP: key feature problem<italic>;</italic> mPCC: mini&#x2013;progressive clinical cases; PCC: progressive clinical cases.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v12i1e82702_fig02.png"/></fig></sec><sec id="s3-3"><title>Quality Score</title><p>All the docimological elements for the faculty exams in 2023 exceeded the validity threshold, with a qualitative score of 15 out of 20, and all of them received a &#x201C;Very good&#x201D; rating (<xref ref-type="fig" rid="figure3">Figure 3</xref>). For the 2024 M1 cohort, only 1 component, KFP1, fell below the validity threshold with a qualitative rating of &#x201C;Poor,&#x201D; whereas for the M2 cohort, 2 components were below this threshold with ratings of &#x201C;Moderate.&#x201D; Four of 6 items did not reach the 15 out of 20 threshold for the 2025 M2 cohort, with the lowest scores observed for PCC2 and mPCC1, both receiving a qualitative rating of &#x201C;Insufficient.&#x201D; <xref ref-type="fig" rid="figure3">Figure 3</xref> also shows an expansion in the diversity of docimological element types over the years, with all formats (PCC, mPCC, KFP, and IQS) represented in 2025. In 2023, and for the M1 academic exam, only the IQS and mPCC formats were included. IQSs were represented in each academic exam. Elements with a score of 15 were reviewed by the authors, who confirmed the presence of ambiguities in the wording, structure, or context.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Qualitative score distribution by academic year and cohort. (A) 2023&#x2014;M1 cohort, (B) 2023&#x2014;M2 cohort, (C) 2024&#x2014;M1 cohort, (D) 2024&#x2014;M2 cohort, and (E) 2025&#x2014;M2 cohort. 
IQS: isolated question sequence<italic>;</italic> KFP: key feature problem<italic>;</italic> mPCC: mini&#x2013;progressive clinical cases; PCC: progressive clinical cases.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v12i1e82702_fig03.png"/></fig><p>No significant differences were found between the groups when comparing results based on quality scores according to the writer&#x2019;s specialty and career level, nor when comparing results by career level and the docimological component (Table S1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). No significant difference in career level was found between clinical fellows and associate professors (<italic>P</italic>=.44). The difference between emergency writers and intensive care unit or critical care unit writers appeared more pronounced, approaching statistical significance (<italic>P</italic>=.08). After adjustment for career level, the quality gap between mPCC and KFPs remained more marked than for the remaining docimological types (<italic>P</italic>=.12 and <italic>P</italic>=.16, respectively). The number of contributors involved in writing the exams increased during 2025, particularly with more clinical fellows participating.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>In our comparison of LLMs with average student scores, no significant differences were found across academic years. However, mean scores varied between LLMs, and additional differences appeared when analyzing performance by type of docimological component. One reason for this variation may be the specific training approach and intended purpose of each model. ChatGPT and Gemini Pro are the oldest and most widely known models. 
A study focusing on the responses of various LLMs to medical questions related to tuberculosis highlighted the absence of overall performance differences between the models but revealed disparities depending on the type of content submitted [<xref ref-type="bibr" rid="ref24">24</xref>]. It is important to emphasize that this study was not designed to establish the superiority of 1 specific LLM over others. Rather, building upon the methodological framework previously described by our team in G&#x00E9;rard et al [<xref ref-type="bibr" rid="ref20">20</xref>], our approach focuses on the synergy of an ensemble of models. Using multiple LLMs allows for a convergence of evidence, which increases the reliability of ambiguity detection by filtering out idiosyncratic errors or &#x201C;hallucinations&#x201D; from individual models.</p><p>There was an unequal distribution of tag types and ambiguity scores depending on the academic year and the type of assessment elements. The highest ambiguity score was observed for IQSs. One potential explanation for the disparity in distribution across academic years, though not statistically confirmed, might be differences in authorship. In 2024 and 2025, a greater number of contributors, particularly clinical fellows, were involved in writing the exams. While these younger authors are often tasked with drafting clinical case scenarios, our analysis did not reach statistical significance to definitively confirm a difference in quality based on academic rank. This lack of significance is likely due to a lack of statistical power when subdividing the 264 items by year, format, and seniority. This idea is supported by several studies. For example, Oyibo et al [<xref ref-type="bibr" rid="ref25">25</xref>] examined a cohort of 65 junior doctors and found that young professionals produced case scenarios that were rated as effective and clear in 98.5% of cases. Additionally, they were enthusiastic about the writing process. 
Furthermore, case writing is an effective way to develop advanced writing skills, which are particularly useful for creating clinical case scenarios [<xref ref-type="bibr" rid="ref26">26</xref>]. However, in our study, we did not find a statistically significant difference in the quality of clinical cases according to professional grade or specialty. Another explanation can be found in the type of docimological component. The construction of decontextualized single-question items is challenging and leads to a lack of validity and low rigor. Several authors describe IQSs as poor evaluators that encourage memorization rather than critical thinking, making them less discriminative [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>].</p><p>Our results suggest that quality scores could provide a valuable framework for evaluating the ambiguity and overall quality of academic exam cases <italic>a priori</italic>. Unlike current methods that assess items <italic>a posteriori</italic> based on student performance, our approach aims to identify &#x201C;structural fragility&#x201D; before the exam is administered. Most evaluations of these components are currently conducted retrospectively, after the exam has been administered [<xref ref-type="bibr" rid="ref29">29</xref>]. These items may be removed or may affect the exam&#x2019;s discriminatory power, leading to student frustration. Their removal is costly in terms of time and budget and undermines educators&#x2019; credibility and trust among students. Defining a score before submission could provide valuable feedback and may facilitate proactive efforts to improve clarity and revise ambiguous items [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>].</p><p>This work highlights the growing importance of LLMs in the evaluation and validation of academic exams. 
This study extends the work initiated by G&#x00E9;rard et al [<xref ref-type="bibr" rid="ref20">20</xref>] in the domain of pharmacological exams by incorporating a rigorous methodology for question evaluation, enabling real-time improvement. In contrast to their findings, our results revealed fewer statistically significant differences between the performance of LLMs and medical students. Our study encompassed a wider array of question formats, including radiological interpretation tasks and docimological components, which were not addressed in the original study. Recent studies have shown that LLMs can do more than just detect ambiguity; they can also analyze complex medical questions, identify nuances in item clarity, and provide consistent evaluative feedback [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref33">33</xref>]. Furthermore, LLMs have been explored as valuable tools for designing and revising educational assessments. They improve item quality and reduce ambiguity through iterative review processes [<xref ref-type="bibr" rid="ref19">19</xref>]. Our study presented a practical application of LLMs in an understudied field of medical education. LLMs were not used to replace students in taking exams but to offer a promising approach to addressing ambiguities and docimological errors. Our methodology adopts a &#x201C;human-in-the-loop&#x201D; approach, where LLMs serve as automated screening tools rather than final judges of construct validity. By flagging questions with low performance or high ambiguity, the system is intended to assist pedagogical experts in concentrating their review on items most likely to contain docimological errors. 
This synergy between automated detection and human expertise ensures that the distinction between a &#x201C;challenging but fair&#x201D; question and an &#x201C;ambiguous&#x201D; one is ultimately determined by the educator.</p><p>Given the pilot nature of this study and the specific context of critical care assessments, these results should be interpreted with caution. While the trends observed are encouraging, they represent preliminary insights that require validation through larger, multicentric studies across diverse medical disciplines.</p></sec><sec id="s4-2"><title>Strengths</title><p>This was an original research study in terms of both its thematic focus and its methodological design. We introduced a fully objective tagging methodology aimed at reducing the risk of bias associated with human interpretation. This approach has the potential to be applied beyond the fields of critical care and emergency medicine. The academic exams at the Medical School of Universit&#x00E9; C&#x00F4;te d&#x2019;Azur incorporated a variety of docimological formats, including KFP, PCC, mPCC, IQS, MCQ, single-best-answer questions, and SAQ. This study enabled an evaluation of this broad spectrum and an analysis of the various types of errors that may occur during the design of such items. Including this diversity enhances external validity and supports its applicability in real-world educational settings. Comparative analyses with students are often lacking in AI-related studies. Similar to the approach taken by G&#x00E9;rard et al [<xref ref-type="bibr" rid="ref20">20</xref>], conducting such an analysis provides additional insight into the difficulty level of the assessed questions. This approach allows for a type of <italic>a priori</italic> evaluation, in which the exams are assessed before students take them. This is different from <italic>a posteriori</italic> methods, which rely on student performance data. 
Thus, this <italic>a priori</italic> evaluation could significantly reduce the logistical and human resources required for postexamination modifications and reviews while supporting fairness among students. To ensure maximum objectivity, we selected the 4 most widely used LLMs: ChatGPT, Gemini Pro, Le Chat, and DeepSeek. This choice reduced the risk of systematic errors inherent to a single model and was intended to enhance the robustness of the results by converging ambiguity detection across multiple models. Furthermore, no prompt engineering was used to maintain the most realistic and standardized conditions possible.</p></sec><sec id="s4-3"><title>Limitations</title><p>One of the main limitations of this study was the absence of a systematic human gold standard for the entire dataset. Consequently, we must acknowledge that some ambiguities identified by the LLMs may reflect intrinsic model limitations or reasoning issues rather than actual flaws in the questions. To mitigate this risk of &#x201C;model noise,&#x201D; we used an ensemble strategy where ambiguity is only flagged upon the disagreement of multiple independent architectures. Rather than definitive proofs of error, these scores should be viewed as objective indicators of an item&#x2019;s structural fragility, designed to guide educators toward the questions most in need of manual refinement. Nevertheless, we used 4 different LLMs to minimize this potential bias. We only used general-purpose models for the analysis of exam questions, although it is known that domain-specific LLMs, such as Med-PaLM 2, exist [<xref ref-type="bibr" rid="ref34">34</xref>]. These specialized models may demonstrate superior reasoning abilities in complex clinical scenarios and a greater capacity to detect ambiguities in medical questions. However, this would reduce the generalizability of the methodology to domains outside of medicine. 
Furthermore, the absence of task-specific tuning and the reliance on the generative intuition of LLMs may introduce certain biases. As this study did not use few-shot examples or chain-of-thought prompting to standardize the reasoning process, the &#x201C;ambiguity scores&#x201D; should be interpreted as indicators of potential fragility rather than definitive proofs of item flaws. Future research should investigate whether advanced prompting techniques or fine-tuning on human-validated &#x201C;gold standards&#x201D; can further isolate model noise from item-specific ambiguity. The monocentric nature of the study population, which is limited to the field of intensive care and emergency medicine, may limit the generalizability of the results. However, the methodology is transferable and can be applied to other specialties. To minimize this bias and avoid focusing on a single student cohort, we included multiple academic years, thereby introducing variation among authors. The use of the &#x201C;subjective ambiguity&#x201D; tag may have led to confusion and introduced interpretation bias, as its definition relies on LLMs&#x2019; ability to detect ambiguity. This undermines transparency, as the criteria were defined by the models themselves and were not externally verifiable, thereby increasing the risk of a &#x201C;black box&#x201D; effect. While our thresholds were derived from data-driven inflection points, we acknowledge that sensitivity analyses across different medical specialties would further validate the stability and generalizability of these assignments. Furthermore, our analysis of quality based on author seniority and specialty may have been limited by a lack of statistical power. 
Although the total sample of 264 questions is substantial, the fragmentation into multiple categories (academic years, docimological formats, and authors) resulted in small subgroup sizes that were insufficient to reach statistical significance.</p></sec><sec id="s4-4"><title>Future Work</title><p>To ensure the external validity of this methodology, our next step involves a large-scale, multicenter validation study spanning multiple medical specialties. Future research will focus on the external validation of these scores by correlating LLM-generated fragility signals with student psychometric data, such as bimodal response distributions or item discrimination indices. This will allow us to establish the predictive validity of our <italic>a priori</italic> tool against the <italic>a posteriori</italic> gold standard of student performance. Both qualitative and quantitative assessments of examinations will be implemented before and after the validation tool is deployed. Once validated, the system will be adopted for the <italic>a priori</italic> evaluation of faculty examination quality at the local level using LLMs. An online platform will provide easy access to this solution, facilitating its integration into academic routines.</p></sec><sec id="s4-5"><title>Conclusions</title><p>Our findings suggest a potential role that LLMs can play in medical education. We analyzed 264 exam questions across 3 academic years using 4 different LLMs. A standardized evaluation process was applied, combining automated scoring with 4 binary diagnostic tags to generate composite ambiguity and weighted quality scores. This methodology enabled a more standardized assessment of both individual questions and entire exam components. Overall, the models achieved performance levels comparable to students, with significantly higher scores on mPCC and IQS formats. IQS items exhibited the highest ambiguity scores, frequently associated with incoherence and low-performance tags. 
Ambiguity scores varied by year, with a predominance of moderate ambiguity (score 2) and occasional strong signals in IQS. Across all cohorts, IQS items were consistently represented, while the 2025 cohort included the full range of docimological formats. No significant differences in exam quality were observed based on the authors&#x2019; specialty or academic rank. Given the exploratory and monocentric nature of this pilot study, these results represent hypothesis-generating observations that necessitate further prospective validation.</p><p>LLMs act as intelligent assistants in a human-in-the-loop framework, flagging potential issues for expert review to improve the overall clarity and fairness of medical assessments. As these technologies evolve, more research is needed to explore their applications across disciplines and institutions.</p></sec></sec></body><back><ack><p>The authors would like to thank the medical students of Universit&#x00E9; C&#x00F4;te d&#x2019;Azur, whose examination results were analyzed in this study. 
The authors are also grateful to the professors and instructors who generously provided their examinations and official answer keys, which were essential to our analyses.</p></ack><notes><sec><title>Funding</title><p>No external financial support or grants were received from any public, commercial, or not-for-profit entities for the research, authorship, or publication of this article.</p></sec><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are not publicly available due to legal restrictions prohibiting the dissemination of faculty examination materials; however, they are available from the corresponding author upon reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: RL, MJ, AOG, AD, JD</p><p>Data curation: RL, MJ, AOG, AD, JD</p><p>Formal analysis: RL</p><p>Investigation: RL, MJ</p><p>Methodology: RL, MJ, AOG</p><p>Supervision: MJ, AOG, AD, JD</p><p>Validation: MJ, AOG, AD, JD</p><p>Writing &#x2013; original draft: RL</p><p>Writing &#x2013; review &#x0026; editing: MJ, AOG, AD, JD</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">IQS</term><def><p>isolated question sequence</p></def></def-item><def-item><term id="abb3">KFP</term><def><p>key feature problem</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">MCQ</term><def><p>multiple-choice question</p></def></def-item><def-item><term id="abb6">mPCC</term><def><p>mini&#x2013;progressive clinical cases</p></def></def-item><def-item><term id="abb7">PCC</term><def><p>progressive clinical cases</p></def></def-item><def-item><term id="abb8">SAQ</term><def><p>short-answer question</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref 
id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Omiye</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Gui</surname><given-names>H</given-names> </name><name name-style="western"><surname>Rezaei</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Zou</surname><given-names>J</given-names> </name><name name-style="western"><surname>Daneshjou</surname><given-names>R</given-names> </name></person-group><article-title>Large language models in medicine: the potentials and pitfalls: a narrative review</article-title><source>Ann Intern Med</source><year>2024</year><month>02</month><volume>177</volume><issue>2</issue><fpage>210</fpage><lpage>220</lpage><pub-id pub-id-type="doi">10.7326/M23-2772</pub-id><pub-id pub-id-type="medline">38285984</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Aleta</surname><given-names>A</given-names> </name><name name-style="western"><surname>Du</surname><given-names>C</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>L</given-names> </name><name name-style="western"><surname>Moreno</surname><given-names>Y</given-names> </name></person-group><article-title>LLMs and generative agent-based models for complex systems research</article-title><source>Phys Life Rev</source><year>2024</year><month>12</month><volume>51</volume><fpage>283</fpage><lpage>293</lpage><pub-id pub-id-type="doi">10.1016/j.plrev.2024.10.013</pub-id><pub-id pub-id-type="medline">39486377</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name 
name-style="western"><surname>Ouyang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Training language models to follow instructions with human feedback</article-title><conf-name>36th Conference on Neural Information Processing Systems (NeurIPS 2022)</conf-name><conf-date>Nov 28 to Dec 9, 2022</conf-date><pub-id pub-id-type="doi">10.52202/068431-2011</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>Gemini Team</collab><name name-style="western"><surname>Anil</surname><given-names>R</given-names> </name><name name-style="western"><surname>Borgeaud</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Gemini: a family of highly capable multimodal models</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 19, 2023</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2312.11805</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>DeepSeek-AI</collab><name name-style="western"><surname>Liu</surname><given-names>A</given-names> </name><name name-style="western"><surname>Feng</surname><given-names>B</given-names> </name><etal/></person-group><article-title>DeepSeek-V2: a strong, economical, and efficient mixture-of-experts language model</article-title><source>arXiv</source><comment>Preprint posted online on  May 7, 2024</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2405.04434</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Jiang</surname><given-names>AQ</given-names> </name><name 
name-style="western"><surname>Sablayrolles</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mensch</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Mistral 7B</article-title><source>arXiv</source><comment>Preprint posted online on Oct 10, 2023</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2310.06825</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Briganti</surname><given-names>G</given-names> </name></person-group><article-title>How ChatGPT works: a mini review</article-title><source>Eur Arch Otorhinolaryngol</source><year>2024</year><month>03</month><volume>281</volume><issue>3</issue><fpage>1565</fpage><lpage>1569</lpage><pub-id pub-id-type="doi">10.1007/s00405-023-08337-7</pub-id><pub-id pub-id-type="medline">37991499</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Schuster</surname><given-names>M</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Google&#x2019;s neural machine translation system: bridging the gap between human and machine translation</article-title><source>arXiv</source><comment>Preprint posted online on Sep 26, 2016</comment><pub-id pub-id-type="doi">10.48550/ARXIV.1609.08144</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tworek</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Jun</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Evaluating large language models trained on code</article-title><source>arXiv</source><comment>Preprint posted online on Jul 7, 2021</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2107.03374</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chung</surname><given-names>P</given-names> </name><name name-style="western"><surname>Fong</surname><given-names>CT</given-names> </name><name name-style="western"><surname>Walters</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Aghaeepour</surname><given-names>N</given-names> </name><name name-style="western"><surname>Yetisgen</surname><given-names>M</given-names> </name><name name-style="western"><surname>O&#x2019;Reilly-Shah</surname><given-names>VN</given-names> </name></person-group><article-title>Large language model capabilities in perioperative risk prediction and prognostication</article-title><source>JAMA Surg</source><year>2024</year><month>08</month><day>1</day><volume>159</volume><issue>8</issue><fpage>928</fpage><lpage>937</lpage><pub-id pub-id-type="doi">10.1001/jamasurg.2024.1621</pub-id><pub-id pub-id-type="medline">38837145</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Madden</surname><given-names>MG</given-names> </name><name name-style="western"><surname>McNicholas</surname><given-names>BA</given-names> </name><name name-style="western"><surname>Laffey</surname><given-names>JG</given-names> </name></person-group><article-title>Assessing the usefulness of a large language model to query and summarize unstructured medical notes in intensive care</article-title><source>Intensive Care 
Med</source><year>2023</year><month>08</month><volume>49</volume><issue>8</issue><fpage>1018</fpage><lpage>1020</lpage><pub-id pub-id-type="doi">10.1007/s00134-023-07128-2</pub-id><pub-id pub-id-type="medline">37338549</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shekhar</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Kimbrell</surname><given-names>J</given-names> </name><name name-style="western"><surname>Saharan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Stebel</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ashley</surname><given-names>E</given-names> </name><name name-style="western"><surname>Abbott</surname><given-names>EE</given-names> </name></person-group><article-title>Use of a large language model (LLM) for ambulance dispatch and triage</article-title><source>Am J Emerg Med</source><year>2025</year><month>03</month><volume>89</volume><fpage>27</fpage><lpage>29</lpage><pub-id pub-id-type="doi">10.1016/j.ajem.2024.12.032</pub-id><pub-id pub-id-type="medline">39675178</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Balta</surname><given-names>KY</given-names> </name><name name-style="western"><surname>Javidan</surname><given-names>AP</given-names> </name><name name-style="western"><surname>Walser</surname><given-names>E</given-names> </name><name name-style="western"><surname>Arntfield</surname><given-names>R</given-names> </name><name name-style="western"><surname>Prager</surname><given-names>R</given-names> </name></person-group><article-title>Evaluating the appropriateness, consistency, and readability of ChatGPT in critical care recommendations</article-title><source>J Intensive Care 
Med</source><year>2025</year><month>02</month><volume>40</volume><issue>2</issue><fpage>184</fpage><lpage>190</lpage><pub-id pub-id-type="doi">10.1177/08850666241267871</pub-id><pub-id pub-id-type="medline">39118320</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ben&#x00ED;tez</surname><given-names>TM</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Boudreau</surname><given-names>JD</given-names> </name><etal/></person-group><article-title>Harnessing the potential of large language models in medical education: promise and pitfalls</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>02</month><day>16</day><volume>31</volume><issue>3</issue><fpage>776</fpage><lpage>783</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocad252</pub-id><pub-id pub-id-type="medline">38269644</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shool</surname><given-names>S</given-names> </name><name name-style="western"><surname>Adimi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Saboori Amleshi</surname><given-names>R</given-names> </name><name name-style="western"><surname>Bitaraf</surname><given-names>E</given-names> </name><name name-style="western"><surname>Golpira</surname><given-names>R</given-names> </name><name name-style="western"><surname>Tara</surname><given-names>M</given-names> </name></person-group><article-title>A systematic review of large language model (LLM) evaluations in clinical medicine</article-title><source>BMC Med Inform Decis Mak</source><year>2025</year><month>03</month><day>7</day><volume>25</volume><issue>1</issue><fpage>117</fpage><pub-id 
pub-id-type="doi">10.1186/s12911-025-02954-4</pub-id><pub-id pub-id-type="medline">40055694</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guthrie</surname><given-names>E</given-names> </name><name name-style="western"><surname>Levy</surname><given-names>D</given-names> </name><name name-style="western"><surname>Del Carmen</surname><given-names>G</given-names> </name></person-group><article-title>The operating and anesthetic reference assistant (OARA): a fine-tuned large language model for resident teaching</article-title><source>Am J Surg</source><year>2024</year><month>08</month><volume>234</volume><fpage>28</fpage><lpage>34</lpage><pub-id pub-id-type="doi">10.1016/j.amjsurg.2024.02.016</pub-id><pub-id pub-id-type="medline">38365551</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hadi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Tran</surname><given-names>E</given-names> </name><name name-style="western"><surname>Nagarajan</surname><given-names>B</given-names> </name><name name-style="western"><surname>Kirpalani</surname><given-names>A</given-names> </name></person-group><article-title>Evaluation of ChatGPT as a diagnostic tool for medical learners and clinicians</article-title><source>PLoS One</source><year>2024</year><volume>19</volume><issue>7</issue><fpage>e0307383</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0307383</pub-id><pub-id pub-id-type="medline">39083523</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Li</surname><given-names>S</given-names> 
</name><name name-style="western"><surname>Lin</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Application of large language models in medical training evaluation-using ChatGPT as a standardized patient: multimetric assessment</article-title><source>J Med Internet Res</source><year>2025</year><month>01</month><day>1</day><volume>27</volume><fpage>e59435</fpage><pub-id pub-id-type="doi">10.2196/59435</pub-id><pub-id pub-id-type="medline">39742453</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Maitland</surname><given-names>A</given-names> </name><name name-style="western"><surname>Fowkes</surname><given-names>R</given-names> </name><name name-style="western"><surname>Maitland</surname><given-names>S</given-names> </name></person-group><article-title>Can ChatGPT pass the MRCP (UK) written examinations? Analysis of performance and errors using a clinical decision-reasoning framework</article-title><source>BMJ Open</source><year>2024</year><month>03</month><day>15</day><volume>14</volume><issue>3</issue><fpage>e080558</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2023-080558</pub-id><pub-id pub-id-type="medline">38490655</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>G&#x00E9;rard</surname><given-names>AO</given-names> </name><name name-style="western"><surname>Merino</surname><given-names>D</given-names> </name><name name-style="western"><surname>Labriffe</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Evaluating and leveraging large language models in clinical pharmacology and therapeutics assessment: from exam takers to exam shapers</article-title><source>Br J Clin 
Pharmacol</source><year>2025</year><month>06</month><day>10</day><volume>10</volume><pub-id pub-id-type="doi">10.1002/bcp.70137</pub-id><pub-id pub-id-type="medline">40495266</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cohen Aubart</surname><given-names>F</given-names> </name><name name-style="western"><surname>Lhote</surname><given-names>R</given-names> </name><name name-style="western"><surname>Hertig</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Progressive clinical case-based multiple-choice questions: an innovative way to evaluate and rank undergraduate medical students</article-title><source>Rev Med Interne</source><year>2021</year><month>05</month><volume>42</volume><issue>5</issue><fpage>302</fpage><lpage>309</lpage><pub-id pub-id-type="doi">10.1016/j.revmed.2020.11.006</pub-id><pub-id pub-id-type="medline">33518414</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="web"><article-title>Code de la sant&#x00E9; publique [Article in French]</article-title><source>L&#x00E9;gifrance (R&#x00C9;PUBLIQUE FRAN&#x00C7;AISE Libert&#x00E9;, &#x00C9;galit&#x00E9;, Fraternit&#x00E9;)</source><access-date>2026-05-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.legifrance.gouv.fr/loda/id/LEGISCTA000006190263">https://www.legifrance.gouv.fr/loda/id/LEGISCTA000006190263</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="report"><article-title>Regulation (EU) 2016/679 of the European Parliament and of the Council of 27 April 2016 on the protection of natural persons with regard to the processing of personal data and on the free movement of such data, and repealing directive 95/46/EC (General Data Protection Regulation) (text with EEA 
relevance)</article-title><year>2016</year><access-date>2026-05-15</access-date><publisher-name>European Union</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://eur-lex.europa.eu/eli/reg/2016/679/oj/eng">https://eur-lex.europa.eu/eli/reg/2016/679/oj/eng</ext-link></comment></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dastani</surname><given-names>M</given-names> </name><name name-style="western"><surname>Mardaneh</surname><given-names>J</given-names> </name><name name-style="western"><surname>Rostamian</surname><given-names>M</given-names> </name></person-group><article-title>Large language models&#x2019; capabilities in responding to tuberculosis medical questions: testing ChatGPT, Gemini, and Copilot</article-title><source>Sci Rep</source><year>2025</year><month>05</month><day>23</day><volume>15</volume><issue>1</issue><fpage>18004</fpage><pub-id pub-id-type="doi">10.1038/s41598-025-03074-9</pub-id><pub-id pub-id-type="medline">40410343</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oyibo</surname><given-names>SO</given-names> </name></person-group><article-title>Developing a beginner&#x2019;s guide to writing a clinical case report: a pilot evaluation by junior doctors</article-title><source>Cureus</source><year>2019</year><month>12</month><day>13</day><volume>11</volume><issue>12</issue><fpage>e6370</fpage><pub-id pub-id-type="doi">10.7759/cureus.6370</pub-id><pub-id pub-id-type="medline">31886095</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oyibo</surname><given-names>SO</given-names> </name><name 
name-style="western"><surname>Brij</surname><given-names>SO</given-names> </name></person-group><article-title>A medical article publishing club for junior doctors: a quantitative and qualitative analysis</article-title><source>Cureus</source><year>2018</year><month>12</month><day>7</day><volume>10</volume><issue>12</issue><fpage>e3701</fpage><pub-id pub-id-type="doi">10.7759/cureus.3701</pub-id><pub-id pub-id-type="medline">30788190</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schuwirth</surname><given-names>LWT</given-names> </name><name name-style="western"><surname>van der Vleuten</surname><given-names>CPM</given-names> </name></person-group><article-title>Different written assessment methods: What can be said about their strengths and weaknesses?</article-title><source>Med Educ</source><year>2004</year><month>09</month><volume>38</volume><issue>9</issue><fpage>974</fpage><lpage>979</lpage><pub-id pub-id-type="doi">10.1111/j.1365-2929.2004.01916.x</pub-id><pub-id pub-id-type="medline">15327679</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Walsh</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Harris</surname><given-names>BHL</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>PE</given-names> </name></person-group><article-title>Single best answer question&#x2013;writing tips for clinicians</article-title><source>Postgrad Med J</source><year>2017</year><month>02</month><volume>93</volume><issue>1096</issue><fpage>76</fpage><lpage>81</lpage><pub-id pub-id-type="doi">10.1136/postgradmedj-2015-133893</pub-id><pub-id pub-id-type="medline">27371033</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation 
citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Rezigalla</surname><given-names>AA</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Firstenberg</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Stawicki</surname><given-names>SP</given-names> </name></person-group><article-title>Item analysis: concept and application</article-title><source>Medical Education for the 21st Century</source><year>2022</year><publisher-name>IntechOpen</publisher-name><fpage>1</fpage><lpage>17</lpage><pub-id pub-id-type="doi">10.5772/intechopen.100138</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>I&#x00F1;arrairaegui</surname><given-names>M</given-names> </name><name name-style="western"><surname>Fern&#x00E1;ndez-Ros</surname><given-names>N</given-names> </name><name name-style="western"><surname>Lucena</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Evaluation of the quality of multiple-choice questions according to the students&#x2019; academic level</article-title><source>BMC Med Educ</source><year>2022</year><month>11</month><day>11</day><volume>22</volume><issue>1</issue><fpage>779</fpage><pub-id pub-id-type="doi">10.1186/s12909-022-03844-3</pub-id><pub-id pub-id-type="medline">36369070</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Boateng</surname><given-names>GO</given-names> </name><name name-style="western"><surname>Neilands</surname><given-names>TB</given-names> </name><name name-style="western"><surname>Frongillo</surname><given-names>EA</given-names> </name><name 
name-style="western"><surname>Melgar-Qui&#x00F1;onez</surname><given-names>HR</given-names> </name><name name-style="western"><surname>Young</surname><given-names>SL</given-names> </name></person-group><article-title>Best practices for developing and validating scales for health, social, and behavioral research: a primer</article-title><source>Front Public Health</source><year>2018</year><volume>6</volume><fpage>149</fpage><pub-id pub-id-type="doi">10.3389/fpubh.2018.00149</pub-id><pub-id pub-id-type="medline">29942800</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Nori</surname><given-names>H</given-names> </name><name name-style="western"><surname>King</surname><given-names>N</given-names> </name><name name-style="western"><surname>McKinney</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Carignan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Horvitz</surname><given-names>E</given-names> </name></person-group><article-title>Capabilities of GPT-4 on medical challenge problems</article-title><source>arXiv</source><comment>Preprint posted online on Mar 20, 2023</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2303.13375</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Gottweis</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Toward expert-level medical question answering with large language models</article-title><source>Nat 
Med</source><year>2025</year><month>03</month><volume>31</volume><issue>3</issue><fpage>943</fpage><lpage>950</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03423-7</pub-id><pub-id pub-id-type="medline">39779926</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Upset plot of tag combinations.</p><media xlink:href="mededu_v12i1e82702_app1.png" xlink:title="PNG File, 279 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Comparison of quality scores between career levels, specialties, and docimological components.</p><media xlink:href="mededu_v12i1e82702_app2.xlsx" xlink:title="XLSX File, 10 KB"/></supplementary-material></app-group></back></article>