<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id><journal-id journal-id-type="publisher-id">mededu</journal-id><journal-id journal-id-type="index">20</journal-id><journal-title>JMIR Medical Education</journal-title><abbrev-journal-title>JMIR Med Educ</abbrev-journal-title><issn pub-type="epub">2369-3762</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v12i1e86208</article-id><article-id pub-id-type="doi">10.2196/86208</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>AI-Generated Multiple Mini Interview (MMI) Stations for Medical School Admissions: Psychometric Evaluation</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Hamila</surname><given-names>Sabryn</given-names></name><degrees>BSc (Hons), PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Birchill</surname><given-names>Kyle</given-names></name><degrees>BCom</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Cao</surname><given-names>Khoa</given-names></name><degrees>MBBS (Hons)</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hossain</surname><given-names>Md Nassif</given-names></name><degrees>BMedSc, MPH, 
GCTE</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Bullock</surname><given-names>Shane</given-names></name><degrees>BSc (Hons), PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hodgson</surname><given-names>Wayne</given-names></name><degrees>BSc, Graduate Certificate in Higher Education, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Harrison</surname><given-names>Julia</given-names></name><degrees>BMedSc, MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Leech</surname><given-names>Michelle</given-names></name><degrees>MBBS (Hons), PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>School of Medicine, Faculty of Medicine, Nursing and Health Sciences, Monash University</institution><addr-line>27 Rainforest Walk</addr-line><addr-line>Clayton</addr-line><country>Australia</country></aff><aff id="aff2"><institution>School of Public Health and Preventive Medicine, Faculty of Medicine, Nursing and Health Sciences, Monash University</institution><addr-line>Clayton</addr-line><country>Australia</country></aff><aff id="aff3"><institution>School of Rural Health, Faculty of Medicine, Nursing and Health Sciences, Monash University</institution><addr-line>Warragul</addr-line><addr-line>Victoria</addr-line><country>Australia</country></aff><aff id="aff4"><institution>Sub-Faculty of Health Sciences, Faculty of Medicine, Nursing and Health Sciences, Monash 
University</institution><addr-line>Clayton</addr-line><addr-line>Victoria</addr-line><country>Australia</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Montagna</surname><given-names>Marco</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Su</surname><given-names>Chen-Yang</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Batacan</surname><given-names>Romeo</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Mohanadas</surname><given-names>Sadhasivam</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Sabryn Hamila, BSc (Hons), PhD, School of Medicine, Faculty of Medicine, Nursing and Health Sciences, Monash University, 27 Rainforest Walk, Clayton, 3800, Australia, 61 404065979; <email>sabryn.hamila1@monash.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>11</day><month>6</month><year>2026</year></pub-date><volume>12</volume><elocation-id>e86208</elocation-id><history><date date-type="received"><day>20</day><month>10</month><year>2025</year></date><date date-type="rev-recd"><day>12</day><month>03</month><year>2026</year></date><date date-type="accepted"><day>07</day><month>04</month><year>2026</year></date></history><copyright-statement>&#x00A9; Sabryn Hamila, Kyle Birchill, Khoa Cao, Md Nassif Hossain, Shane Bullock, Wayne Hodgson, Julia Harrison, Michelle Leech. Originally published in JMIR Medical Education (<ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org">https://mededu.jmir.org</ext-link>), 11.6.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org/">https://mededu.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mededu.jmir.org/2026/1/e86208"/><abstract><sec><title>Background</title><p>Multiple mini interviews (MMIs) are widely used in medical school admissions to assess applicants&#x2019; nonacademic attributes in a structured and reliable manner. However, the development of high-quality MMI stations is resource intensive and dependent on expert input.</p></sec><sec><title>Objective</title><p>This study explored the utility of artificial intelligence (AI) in the generation of MMI stations for the Direct and Graduate Entry Medicine Program admissions process for domestic applicants at Monash Medical School. To our knowledge, this study represents the first empirical evaluation of AI-generated MMI stations deployed in a real-world medical school admissions context.</p></sec><sec sec-type="methods"><title>Methods</title><p>A total of 56 MMI stations from the 2025 admissions cycle were evaluated, including 17 (30.4%) AI-generated and 39 (69.6%) traditionally developed stations, administered across 824 domestic applicants for a total of 4897 applicant-station interactions. 
We assessed station quality through both reliability (using Cronbach &#x03B1; to examine internal consistency) and discrimination capability (using SD and range of scores) at the station level.</p></sec><sec sec-type="results"><title>Results</title><p>AI-generated stations exhibited slightly higher reliability (&#x03B1;=0.82) compared with traditional stations (&#x03B1;=0.81), though this difference was not statistically significant (<italic>P</italic>=.91). Both AI-generated and traditionally developed stations demonstrated variable discrimination capability, with some stations from each development method showing excellent combinations of high reliability and strong discriminatory power, while others exhibited ceiling effects that limited their discriminatory power. Of note, a greater proportion of AI-generated stations were classified as optimal (&#x03B1;&#x2265;0.85), and a smaller proportion were classified in the review category (&#x03B1;&#x003C;0.75), compared with traditional stations. These results suggest that AI-generated stations can achieve psychometric performance comparable to traditionally developed stations.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Our findings highlight the utility of AI as a tool for MMI station generation, offering a scalable approach that may reduce the resource burden on faculty while maintaining or enhancing psychometric quality for applicants. 
Ongoing quality assurance and evaluation remain essential to ensure fairness and validity across the admissions process.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>AI</kwd><kwd>medical school admissions</kwd><kwd>medical education</kwd><kwd>multiple mini interviews</kwd><kwd>MMI</kwd><kwd>assessment reliability</kwd><kwd>educational technology</kwd><kwd>generative artificial intelligence</kwd><kwd>generative AI</kwd><kwd>psychometric evaluation</kwd><kwd>medical education innovation</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Medical school admissions are evolving to meet the demands of a changing health care landscape. With increasing competition and a growing emphasis on nonacademic traits that predict professional competence and patient care quality, new tools are reshaping how candidates are selected. The multiple mini interview (MMI) format has been widely adopted in medical school admissions and can assess a range of nonacademic skills, such as communication, ethical reasoning, teamwork, and empathy [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref5">5</xref>]. MMI stations typically consist of a stem or prompt that outlines the scenario, followed by several structured questions that candidates must respond to in real time and that are designed to probe different aspects of the candidate&#x2019;s reasoning or behavior, testing their abilities to think critically, demonstrate perspective and emotional intelligence, and navigate complex interpersonal dynamics [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>Despite their widespread adoption, the development of high-quality MMI stations remains challenging. 
The intense competition for limited places requires a large and continually refreshed pool of MMI stations to maintain test security and ensure fairness for all applicants. Developing and renewing this pool through traditional methods is resource intensive and places a considerable burden on faculty, while rushed or inconsistent development across multiple contributors can compromise station quality and reliability [<xref ref-type="bibr" rid="ref6">6</xref>]. Artificial intelligence (AI) offers a potential solution to these challenges by streamlining station development and supporting the creation of a more reliable and scalable pool of assessment tools [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. Prior work has shown that AI can generate assessment items or scenarios with acceptable quality or psychometric properties [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. However, to date, no published studies have systematically evaluated the effectiveness of AI-generated MMI stations in real-world admissions settings.</p><p>In this study, we performed a comprehensive reliability analysis on 17 new MMI stations generated with the assistance of AI (AI-generated stations) and 39 existing MMI stations developed from non-AI sources, all of which were used in the selection process for the 2025 intake of domestic students into the Direct Entry and Graduate Entry Medicine Program at Monash University. Using Cronbach &#x03B1; to assess internal consistency and SD as well as range of scores to evaluate discrimination capability, our analysis aimed to explore the reliability and overall quality of individual stations, uncover differences in performance between AI-generated and traditional stations, and identify stations that warrant modification or replacement or that could serve as models for future development. 
These insights can be used to guide evidence-based improvements in MMI design and the integration of AI in medical admissions.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>This study analyzed 56 MMI stations used in the selection process for the 2025 admissions cycle of domestic applicants into the Direct Entry and Graduate Entry Medicine Program at Monash University. The 56 stations were divided into two categories: (1) AI-generated stations (n=17, 30.4%), and (2) existing or traditional stations (n=39, 69.6%). Each station was mapped to one of 7 MMI domains: advocacy, collaboration, critical thinking, empathy, ethical reasoning, motivation, and resilience. These domains map to desirable qualities of successful applicants and are used to guide the focus of the stations. They are not statistically determined discrete constructs within the MMI.</p><p>Traditional MMI stations were existing scenarios previously used in admissions and developed through established faculty-led processes. AI-generated MMI stations were developed using Claude 3.5 Sonnet (Anthropic), a proprietary, closed-source AI platform. Claude was selected for this study primarily because of its data privacy policy at the time of the project, which specified that user inputs were not retained or used for model training. Given that examination materials were included in prompts, this provided additional assurance regarding the protection of assessment content and institutional data.</p><p>The AI platform was provided with the following user-generated prompt:</p><disp-quote><p>Please generate an interview based on the provided document. The interviews are for high-school students who are applying for medical school and test the student&#x2019;s capacity for critical thinking. 
The interview structure should have a single paragraph for the applicant scenario, a single paragraph describing key components of the scenario for interviewers and 5 questions, with dot points on what an excellent response should look like.</p></disp-quote><p>We designed this prompt to elicit realistic, contextually appropriate, and assessment-relevant content for medical school admissions, and guided the AI in producing full scenarios, associated interview questions, and example candidate responses.</p><p>To ensure content validity, relevance, and alignment with assessment objectives, all AI-generated scenarios underwent a structured quality assurance process. A working group of academic staff involved in admissions across both the Australian and Malaysian campuses reviewed the AI-generated stations as part of this process. Each scenario was independently reviewed by 2 faculty members from this working group with expertise in medical education and admissions. Reviewers evaluated alignment with the intended competency domain, clarity of the scenario, and potential sources of bias or ambiguity. Feedback from these reviews was incorporated through iterative refinement of prompts and station content. Final approval was contingent upon a consensus decision made jointly by 2 senior academic leads, providing an additional layer of oversight and quality control. While this process mitigates some risks inherent to AI-generated content, it does not eliminate potential biases associated with proprietary large language models, which lack transparency regarding training data and internal parameters [<xref ref-type="bibr" rid="ref10">10</xref>]. 
Example traditional and AI-generated MMI stations from the collaboration domain, including scenario prompts and structured questions, are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-2"><title>Data Collection</title><p>Data were obtained from MMIs conducted between September 2024 and January 2025 on 824 domestic applicants, yielding a total of 4897 applicant-station interactions. Applicants were randomly assigned to a subset of the 56 available stations (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>), which were distributed across multiple interview days. Each MMI station was scored on a 20-point scale, consisting of 5 questions with each question scored from 0 to 4 points.</p><p>A total of 252 interviewers participated in the interview process across both direct and graduate entry admissions. All interviewers completed standardized training prior to the MMI cycle. Interviewers were drawn from a range of backgrounds including health professionals, academics, community representatives, and occasionally senior students or recent graduates. Interviewers were recruited through an expression-of-interest process and screened for eligibility and potential conflicts of interest. Training included orientation to the MMI process, a scoring framework, and interviewer responsibilities through preparatory materials and live workshops. On each interview day, interviewers also participated in station-specific calibration sessions with other assessors assigned to the same station to promote consistent interpretation of scoring criteria.</p><p>Each candidate was assessed by a single trained interviewer at each MMI station. Multiple interviewers were assigned to each station across interview sessions, with the number of interviewers per station ranging from 2 to 17 depending on candidate volume per station. 
As each performance was rated by a single interviewer, interinterviewer reliability and interviewer variance were not examined in this analysis. To mitigate the risk of content dissemination between applicants, different stations were scheduled on different days.</p></sec><sec id="s2-3"><title>Reliability Analysis</title><p>To assess the reliability of individual MMI stations, we calculated Cronbach &#x03B1; for each station using applicant responses from the 2025 admissions cycle. Cronbach &#x03B1; ranges from 0 to 1 and is a measure of internal consistency; it reflects the extent to which items within an assessment measure the same underlying construct or set of related competencies [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]. In the context of a 5-item MMI station, higher &#x03B1; values indicate greater coherence among the scored questions, whereas lower values may suggest item misalignment, scoring inconsistency, or other sources of measurement error. Because &#x03B1; was calculated within each station in this study, it reflects the reliability of that station&#x2019;s scoring rubric rather than the reliability of the overall MMI decision.</p><p>The categorization of Cronbach &#x03B1; scores for evaluating reliability remains a subject of debate, with many experts agreeing that thresholds should be determined by the purpose and stakes of the assessment. As Downing [<xref ref-type="bibr" rid="ref13">13</xref>] noted, high-stakes assessments, such as licensure or certification examinations in medicine (which carry significant consequences for both candidates and the public), are generally expected to demonstrate reliability coefficients of 0.90 or above [<xref ref-type="bibr" rid="ref13">13</xref>]. 
For assessments of moderate stakes, such as major summative examinations in medical school, reliability in the range of 0.80 to 0.89 is typically considered acceptable [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. Lower-stakes assessments, including formative evaluations or internally developed tests used in teaching, may warrant reliability in the range of 0.70 to 0.79 [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. Medical school admissions processes would generally be considered moderate- to high-stakes assessments, given their significant impact on applicants&#x2019; future careers and the responsibility of selecting candidates suited for a profession with substantial societal consequences. However, station-level reliability estimates in MMI formats must also be interpreted in light of the short rating scales used to score each station (5 items) and the multidimensional nature of competencies assessed within individual stations.</p><p>In line with this framework, we classified stations in this study into 3 conservative reliability groups as outlined in <xref ref-type="table" rid="table1">Table 1</xref>. These categories were used to support descriptive comparison and structured quality assurance rather than as binary standards of station usability. 
In this context, stations classified as optimal or acceptable meet expected reliability, whereas a classification of review indicates that a station requires further examination.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Reliability classification of multiple mini interview stations based on Cronbach &#x03B1; scores.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Reliability category<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="bottom">Cronbach &#x03B1;</td></tr></thead><tbody><tr><td align="left" valign="top">Optimal</td><td align="left" valign="top">&#x2265;0.85</td></tr><tr><td align="left" valign="top">Acceptable</td><td align="left" valign="top">0.75&#x2010;0.84</td></tr><tr><td align="left" valign="top">Review</td><td align="left" valign="top">&#x003C;0.75</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Reliability categories were used for descriptive comparison and structured quality assurance review and do not represent binary standards of station usability.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-4"><title>Discrimination Ability Analysis</title><p>In addition to reliability, we evaluated station quality through discrimination capability, which was measured using the mean score, SD, and observed score range for each station. Discrimination refers to a station&#x2019;s ability to distinguish or differentiate between candidates of varying competency levels across the scoring spectrum. Stations with adequate discrimination capability demonstrate higher SDs and broader score ranges, indicating they can effectively separate high-performing candidates from those with lower performance. Conversely, stations with poor discrimination capability exhibit low SDs or restricted score ranges, often due to ceiling effects (where most candidates score near the maximum) or floor effects (where most candidates score near the minimum). 
Such restricted score distributions limit a station&#x2019;s utility for selection decisions, as they fail to provide meaningful discrimination between candidates regardless of their actual competency levels. While reliability ensures measurement consistency, discrimination capability determines whether that consistent measurement provides useful information for distinguishing candidate performance.</p><p>Reliability and discrimination metrics were analyzed as distinct psychometric properties but are reported together where appropriate to support integrated evaluation of station quality.</p></sec><sec id="s2-5"><title>Statistical Analysis</title><p>All statistical analyses were performed using R (version 4.5.1; R Foundation for Statistical Computing), primarily using the psych package for reliability analysis (Revelle, 2025; version 2.5.6) [<xref ref-type="bibr" rid="ref14">14</xref>]. Figures were generated using GraphPad Prism (version 10.4.1; Dotmatics).</p></sec><sec id="s2-6"><title>Ethical Considerations</title><p>This study was conducted within the context of the standard medical school admissions process and aligned with routine quality assurance and program evaluation activities. Ethics approval was obtained from the Monash University Human Research Ethics Committee (49569). All applicant data were fully deidentified prior to analysis to protect participant confidentiality. As the study involved secondary use of deidentified administrative data, no additional consent from applicants was required.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>An overview of the internal consistency reliability assessed using Cronbach &#x03B1; and average scores, including the SD and range, for all 56 MMI stations is presented in <xref ref-type="table" rid="table2">Table 2</xref>. 
The following subsections outline key patterns and comparisons observed across station types and domains.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Applicant scores and Cronbach &#x03B1; reliability rating for each of the 56 multiple mini interview stations used in the 2025 intake of domestic students at Monash University<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Station code</td><td align="left" valign="top">Station type</td><td align="left" valign="top">Score, mean (SD)</td><td align="left" valign="top">Range</td><td align="left" valign="top">Cronbach &#x03B1;</td><td align="left" valign="top">Category</td></tr></thead><tbody><tr><td align="left" valign="top">ADV_1</td><td align="left" valign="top">Existing</td><td align="left" valign="top">16.1 (3.0)</td><td align="left" valign="bottom">9-20</td><td align="left" valign="top">0.83</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">ADV_7</td><td align="left" valign="top">Existing</td><td align="left" valign="top">14.8 (3.2)</td><td align="left" valign="bottom">9-20</td><td align="left" valign="top">0.82</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">ADV_11</td><td align="left" valign="top">Existing</td><td align="left" valign="top">15.7 (2.8)</td><td align="left" valign="bottom">10-20</td><td align="left" valign="top">0.78</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">ADV_12</td><td align="left" valign="top">Existing</td><td align="left" valign="top">15.8 (2.8)</td><td align="left" valign="bottom">9-20</td><td align="left" valign="top">0.77</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">ADV_13<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">Existing</td><td 
align="left" valign="top">14.2 (3.2)</td><td align="left" valign="bottom">6-20</td><td align="left" valign="top">0.85</td><td align="left" valign="top">Optimal</td></tr><tr><td align="left" valign="top">ADV_15</td><td align="left" valign="top">Existing</td><td align="left" valign="top">14.7 (2.9)</td><td align="left" valign="bottom">7-20</td><td align="left" valign="top">0.79</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">ADV_17</td><td align="left" valign="top">Existing</td><td align="left" valign="top">15.8 (3.3)</td><td align="left" valign="bottom">5-20</td><td align="left" valign="top">0.88</td><td align="left" valign="top">Optimal</td></tr><tr><td align="left" valign="top">ADV_18</td><td align="left" valign="top">Existing</td><td align="left" valign="top">12.6 (4.1)</td><td align="left" valign="bottom">6-20</td><td align="left" valign="top">0.89</td><td align="left" valign="top">Optimal</td></tr><tr><td align="left" valign="top">ADV_19</td><td align="left" valign="top">AI<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup> generated</td><td align="left" valign="top">11.2 (4.0)</td><td align="left" valign="bottom">0-19</td><td align="left" valign="top">0.85</td><td align="left" valign="top">Optimal</td></tr><tr><td align="left" valign="top">ADV_20</td><td align="left" valign="top">AI generated</td><td align="left" valign="top">14.5 (3.2)</td><td align="left" valign="bottom">7-20</td><td align="left" valign="top">0.78</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">ADV_21<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">AI generated</td><td align="left" valign="top">17.3 (2.3)</td><td align="left" valign="bottom">14-20</td><td align="left" valign="top">0.75</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">COLL_1</td><td align="left" valign="top">Existing</td><td align="left" 
valign="top">15.8 (2.9)</td><td align="left" valign="bottom">9-20</td><td align="left" valign="top">0.83</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">COLL_2</td><td align="left" valign="top">Existing</td><td align="left" valign="top">15.6 (3.0)</td><td align="left" valign="bottom">8-20</td><td align="left" valign="top">0.76</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">COLL_3<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">Existing</td><td align="left" valign="top">16.0 (3.1)</td><td align="left" valign="bottom">8-20</td><td align="left" valign="top">0.84</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">COLL_4<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">Existing</td><td align="left" valign="top">15.6 (3.2)</td><td align="left" valign="bottom">8-20</td><td align="left" valign="top">0.82</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">COLL_9</td><td align="left" valign="top">Existing</td><td align="left" valign="top">16.0 (2.7)</td><td align="left" valign="bottom">10-20</td><td align="left" valign="top">0.77</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">COLL_10</td><td align="left" valign="top">Existing</td><td align="left" valign="top">16.0 (3.3)</td><td align="left" valign="bottom">0-20</td><td align="left" valign="top">0.87</td><td align="left" valign="top">Optimal</td></tr><tr><td align="left" valign="top">COLL_11</td><td align="left" valign="top">Existing</td><td align="left" valign="top">15.6 (3.2)</td><td align="left" valign="bottom">4-20</td><td align="left" valign="top">0.83</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">COLL_14<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" 
valign="top">Existing</td><td align="left" valign="top">17.3 (1.6)</td><td align="left" valign="bottom">15-20</td><td align="left" valign="top">0.39</td><td align="left" valign="top">Review</td></tr><tr><td align="left" valign="top">COLL_16</td><td align="left" valign="top">AI generated</td><td align="left" valign="top">14.7 (2.6)</td><td align="left" valign="bottom">8-20</td><td align="left" valign="top">0.76</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">CT_8.1</td><td align="left" valign="top">Existing</td><td align="left" valign="top">16.0 (3.2)</td><td align="left" valign="bottom">9-20</td><td align="left" valign="top">0.84</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">CT_9<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">Existing</td><td align="left" valign="top">14.1 (3.9)</td><td align="left" valign="bottom">5-19</td><td align="left" valign="top">0.85</td><td align="left" valign="top">Optimal</td></tr><tr><td align="left" valign="top">CT_10</td><td align="left" valign="top">Existing</td><td align="left" valign="top">14.0 (3.5)</td><td align="left" valign="bottom">3-20</td><td align="left" valign="top">0.83</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">CT_13</td><td align="left" valign="top">Existing</td><td align="left" valign="top">14.9 (3.7)</td><td align="left" valign="bottom">4-20</td><td align="left" valign="top">0.88</td><td align="left" valign="top">Optimal</td></tr><tr><td align="left" valign="top">CT_16</td><td align="left" valign="top">Existing</td><td align="left" valign="top">15.1 (3.5)</td><td align="left" valign="bottom">7-20</td><td align="left" valign="top">0.88</td><td align="left" valign="top">Optimal</td></tr><tr><td align="left" valign="top">CT_27</td><td align="left" valign="top">AI generated</td><td align="left" valign="top">14.4 (3.4)</td><td align="left" 
valign="bottom">8-20</td><td align="left" valign="top">0.85</td><td align="left" valign="top">Optimal</td></tr><tr><td align="left" valign="top">CT_28</td><td align="left" valign="top">AI generated</td><td align="left" valign="top">14.2 (3.7)</td><td align="left" valign="bottom">1-20</td><td align="left" valign="top">0.88</td><td align="left" valign="top">Optimal</td></tr><tr><td align="left" valign="top">CT_29</td><td align="left" valign="top">AI generated</td><td align="left" valign="top">12.4 (3.4)</td><td align="left" valign="bottom">0-19</td><td align="left" valign="top">0.84</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">EMP_3<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">Existing</td><td align="left" valign="top">15.3 (3.0)</td><td align="left" valign="bottom">8-20</td><td align="left" valign="top">0.77</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">EMP_6</td><td align="left" valign="top">Existing</td><td align="left" valign="top">14.2 (4.5)</td><td align="left" valign="bottom">3-20</td><td align="left" valign="top">0.93</td><td align="left" valign="top">Optimal</td></tr><tr><td align="left" valign="top">EMP_10</td><td align="left" valign="top">Existing</td><td align="left" valign="top">15.0 (3.4)</td><td align="left" valign="bottom">6-20</td><td align="left" valign="top">0.80</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">EMP_14</td><td align="left" valign="top">Existing</td><td align="left" valign="top">12.3 (4.0)</td><td align="left" valign="bottom">4-20</td><td align="left" valign="top">0.87</td><td align="left" valign="top">Optimal</td></tr><tr><td align="left" valign="top">EMP_15</td><td align="left" valign="top">Existing</td><td align="left" valign="top">16.0 (2.5)</td><td align="left" valign="bottom">9-20</td><td align="left" valign="top">0.73</td><td align="left" 
valign="top">Review</td></tr><tr><td align="left" valign="top">EMP_17</td><td align="left" valign="top">AI generated</td><td align="left" valign="top">13.8 (3.9)</td><td align="left" valign="bottom">2-20</td><td align="left" valign="top">0.90</td><td align="left" valign="top">Optimal</td></tr><tr><td align="left" valign="top">EMP_18</td><td align="left" valign="top">AI generated</td><td align="left" valign="top">15.7 (2.9)</td><td align="left" valign="bottom">6-20</td><td align="left" valign="top">0.82</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">EMP_19<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">AI generated</td><td align="left" valign="top">13.2 (3.1)</td><td align="left" valign="bottom">6-20</td><td align="left" valign="top">0.82</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">ER_2</td><td align="left" valign="top">Existing</td><td align="left" valign="top">15.4 (3.2)</td><td align="left" valign="bottom">8-20</td><td align="left" valign="top">0.86</td><td align="left" valign="top">Optimal</td></tr><tr><td align="left" valign="top">ER_3</td><td align="left" valign="top">Existing</td><td align="left" valign="top">15.7 (2.8)</td><td align="left" valign="bottom">7-20</td><td align="left" valign="top">0.77</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">ER_6.1</td><td align="left" valign="top">Existing</td><td align="left" valign="top">14.2 (4.0)</td><td align="left" valign="bottom">5-20</td><td align="left" valign="top">0.88</td><td align="left" valign="top">Optimal</td></tr><tr><td align="left" valign="top">ER_16<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">Existing</td><td align="left" valign="top">14.7 (3.6)</td><td align="left" valign="bottom">7-20</td><td align="left" valign="top">0.81</td><td align="left" 
valign="top">Acceptable</td></tr><tr><td align="left" valign="top">ER_24<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">Existing</td><td align="left" valign="top">15.4 (2.6)</td><td align="left" valign="bottom">9-18</td><td align="left" valign="top">0.74</td><td align="left" valign="top">Review</td></tr><tr><td align="left" valign="top">ER_26</td><td align="left" valign="top">AI generated</td><td align="left" valign="top">15.9 (2.7)</td><td align="left" valign="bottom">8-20</td><td align="left" valign="top">0.69</td><td align="left" valign="top">Review</td></tr><tr><td align="left" valign="top">ER_27</td><td align="left" valign="top">AI generated</td><td align="left" valign="top">16.7 (2.5)</td><td align="left" valign="bottom">10-20</td><td align="left" valign="top">0.76</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">ER_28<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">AI generated</td><td align="left" valign="top">13.9 (3.5)</td><td align="left" valign="bottom">6-20</td><td align="left" valign="top">0.86</td><td align="left" valign="top">Optimal</td></tr><tr><td align="left" valign="top">ER_29<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">AI generated</td><td align="left" valign="top">14.4 (3.5)</td><td align="left" valign="bottom">7-19</td><td align="left" valign="top">0.86</td><td align="left" valign="top">Optimal</td></tr><tr><td align="left" valign="top">ER_30</td><td align="left" valign="top">AI generated</td><td align="left" valign="top">14.4 (3.5)</td><td align="left" valign="bottom">6-20</td><td align="left" valign="top">0.89</td><td align="left" valign="top">Optimal</td></tr><tr><td align="left" valign="top">MOT_2</td><td align="left" valign="top">Existing</td><td align="left" valign="top">14.8 (2.5)</td><td align="left" valign="bottom">6-19</td><td align="left" 
valign="top">0.66</td><td align="left" valign="top">Review</td></tr><tr><td align="left" valign="top">MOT_3</td><td align="left" valign="top">Existing</td><td align="left" valign="top">16.3 (3.5)</td><td align="left" valign="bottom">6-20</td><td align="left" valign="top">0.88</td><td align="left" valign="top">Optimal</td></tr><tr><td align="left" valign="top">MOT_4</td><td align="left" valign="top">Existing</td><td align="left" valign="top">15.5 (3.2)</td><td align="left" valign="bottom">5-20</td><td align="left" valign="top">0.84</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">MOT_6</td><td align="left" valign="top">Existing</td><td align="left" valign="top">17.0 (2.4)</td><td align="left" valign="bottom">11-20</td><td align="left" valign="top">0.76</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">MOT_9</td><td align="left" valign="top">Existing</td><td align="left" valign="top">15.2 (3.5)</td><td align="left" valign="bottom">6-20</td><td align="left" valign="top">0.87</td><td align="left" valign="top">Optimal</td></tr><tr><td align="left" valign="top">MOT_11<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">Existing</td><td align="left" valign="top">14.6 (2.7)</td><td align="left" valign="bottom">7-20</td><td align="left" valign="top">0.69</td><td align="left" valign="top">Review</td></tr><tr><td align="left" valign="top">MOT_12<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">AI generated</td><td align="left" valign="top">17.6 (2.1)</td><td align="left" valign="bottom">13-20</td><td align="left" valign="top">0.79</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">RES_1</td><td align="left" valign="top">Existing</td><td align="left" valign="top">15.4 (2.9)</td><td align="left" valign="bottom">9-20</td><td align="left" valign="top">0.84</td><td align="left" 
valign="top">Acceptable</td></tr><tr><td align="left" valign="top">RES_5</td><td align="left" valign="top">Existing</td><td align="left" valign="top">16.2 (2.9)</td><td align="left" valign="bottom">7-20</td><td align="left" valign="top">0.81</td><td align="left" valign="top">Acceptable</td></tr><tr><td align="left" valign="top">RES_6</td><td align="left" valign="top">AI generated</td><td align="left" valign="top">16.0 (3.0)</td><td align="left" valign="bottom">8-20</td><td align="left" valign="top">0.81</td><td align="left" valign="top">Acceptable</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Multiple mini interview stations were categorized into the following reliability categories, as outlined in <xref ref-type="table" rid="table1">Table 1</xref>: optimal (&#x03B1;&#x2265;0.85), acceptable (&#x03B1;=0.75-0.84), and review (&#x03B1;&#x003C;0.75).</p></fn><fn id="table2fn2"><p><sup>b</sup>Indicates stations with fewer than 50 candidate responses. Cronbach &#x03B1; estimates are generally less stable in small samples, and results for these stations should therefore be interpreted with caution.</p></fn><fn id="table2fn3"><p><sup>c</sup>AI: artificial intelligence.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Overall Reliability of AI-Generated and Traditional Stations</title><p>We conducted a Mann-Whitney <italic>U</italic> test to compare the internal consistency reliability of AI-generated and traditionally developed MMI stations. On average, the AI-generated stations demonstrated slightly higher Cronbach &#x03B1; values (mean &#x03B1;=0.82, SD 0.06) than the existing stations (mean &#x03B1;=0.81, SD 0.09). However, this difference was neither statistically significant (<italic>U</italic>=325; <italic>z</italic>=&#x2212;0.116; <italic>r</italic>=&#x2212;0.015; <italic>P</italic>=.91) nor large enough to be practically relevant (<xref ref-type="fig" rid="figure1">Figure 1</xref>). 
These findings suggest that while AI-generated stations may show marginally higher reliability, the overall level of internal consistency is comparable between the 2 station types.</p><p>In <xref ref-type="fig" rid="figure1">Figure 1</xref>, the mean Cronbach &#x03B1; values are shown for AI-generated (17/56, 30%; mean &#x03B1;=0.82, SD 0.06) and traditional (39/56, 70%; mean &#x03B1;=0.81, SD 0.09) stations used in the 2025 Monash University medical school domestic admissions cycle. Bars represent mean (SD). The <italic>P</italic> value was determined using a Mann-Whitney <italic>U</italic> test (<italic>U</italic>=325; <italic>z</italic>=&#x2212;0.116; <italic>r</italic>=&#x2212;0.015; <italic>P</italic>=.91).</p><p>When categorizing stations into review, acceptable, or optimal reliability groups (<xref ref-type="table" rid="table1">Table 1</xref>), AI-generated stations showed a slightly greater proportion achieving optimal reliability, with 41.2% (7/17; 95% CI 22%&#x2010;64%) of AI-generated stations meeting this threshold compared with 33.3% (13/39; 95% CI 21%&#x2010;49%) of the traditionally developed stations (<xref ref-type="fig" rid="figure2">Figure 2</xref>). Additionally, the proportion of AI-generated stations in the review category was 5.9% (1/17; 95% CI 1%&#x2010;27%), which was lower than that of traditionally developed stations, of which 12.8% (5/39; 95% CI 6%&#x2010;27%) were in the review category (<xref ref-type="fig" rid="figure2">Figure 2</xref>). Most existing and AI-generated stations exhibited acceptable reliability, comprising 53.8% (21/39; 95% CI 39%&#x2010;68%) of traditionally developed stations and 52.9% (9/17; 95% CI 31%&#x2010;74%) of AI-generated stations (<xref ref-type="fig" rid="figure2">Figure 2</xref>). 
However, CIs were wide and substantially overlapping across categories, indicating that these proportional differences should be interpreted with caution.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Comparison of internal consistency reliability (Cronbach &#x03B1;) between artificial intelligence (AI)&#x2013;generated and traditionally developed multiple mini interview stations.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v12i1e86208_fig01.png"/></fig><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Distribution of reliability categories for artificial intelligence (AI)&#x2013;generated and existing multiple mini interview stations.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v12i1e86208_fig02.png"/></fig><p>To assess the influence of small sample sizes on station categorization, we conducted a sensitivity analysis excluding stations with fewer than 50 participants. When restricted to stations with n&#x2265;50, there remained no significant difference in internal consistency between AI-generated and traditionally developed stations (<italic>P</italic>=.89). The distribution of reliability categories was similar between groups, with 41% (7/17) of AI-generated and 33% (13/39) of traditionally developed stations classified as optimal, 53% (9/17) and 54% (21/39) classified as acceptable, and 6% (1/17) and 13% (5/39) classified as review, respectively. These findings indicate that category-based differences observed in the full dataset are sensitive to the inclusion of small-N stations. 
All subsequent analyses were conducted using the full dataset unless otherwise specified.</p><p>In <xref ref-type="fig" rid="figure2">Figure 2</xref>, the proportion of stations classified as review, acceptable, and optimal based on Cronbach &#x03B1; is shown for AI-generated (17/56, 30%) and existing or traditionally developed stations (39/56, 70%). Percentages were calculated within each station type, relative to the total number of stations of that type. Error bars represent 95% Wilson CIs for the proportion of stations in each reliability category.</p><p>To address potential confounding factors and ensure that apparent differences were not due to unequal distribution of AI-generated stations across MMI domains, we conducted a linear regression analysis. This analysis evaluated the association between station type and Cronbach &#x03B1; while controlling for MMI domain, essentially asking whether AI-generated stations would still show different reliability if both station types were equally represented across all domains. Although Cronbach &#x03B1; is bounded between 0 and 1, observed values in this dataset were not concentrated near the boundaries (range 0.39&#x2010;0.93). Linear regression was therefore considered appropriate for estimating adjusted mean differences in &#x03B1;. The analysis showed that station type was not a significant predictor of reliability (&#x03B2;=&#x2212;0.003; <italic>P</italic>=.90), indicating that the slight advantage observed for AI-generated stations was not attributable to systematic differences in station development approaches. 
A beta regression model was also fitted as a sensitivity analysis; this yielded the same substantive inference (<italic>P</italic>=.77).</p><p>The regression analysis also examined whether some domains were inherently more challenging for developing reliable stations, which could have explained apparent station type differences if AI-generated stations were concentrated in naturally high-reliability domains. However, no significant differences in reliability were observed between domains (all <italic>P</italic>&#x003E;.10), indicating that critical thinking, empathy, advocacy, and other domains showed similar reliability levels regardless of station type.</p><p>Finally, we performed an interaction model to test whether AI-generated stations might have domain-specific strengths or weaknesses compared with traditional stations; for example, whether AI might excel at generating critical thinking stations but struggle with empathy scenarios. This analysis found no significant interactions (all <italic>P</italic>&#x003E;.48), demonstrating that the relationship between station type and reliability remained consistent across all MMI domains.</p><p>Collectively, these analyses demonstrate that while AI-generated stations showed slightly higher descriptive reliability metrics across multiple measures, these differences were not statistically significant. The findings suggest that AI-generated station development produces reliability outcomes equivalent to traditional faculty-led development methods, with performance remaining consistent across different MMI domains.</p></sec><sec id="s3-3"><title>Station Quality Analysis: Reliability and Discrimination Ability</title><p>Analysis of individual station performance revealed notable variation in both reliability and discrimination characteristics across AI-generated and traditionally developed stations. 
Several stations demonstrated exceptional reliability, with Cronbach &#x03B1; values of 0.90 or higher (<xref ref-type="table" rid="table2">Table 2</xref>). Notably, EMP_17 (&#x03B1;=0.90), an AI-generated station, and EMP_6 (&#x03B1;=0.93), a traditionally developed station, exhibited the highest internal consistency. Furthermore, both EMP_17 (mean 13.8, SD 3.9; range 2&#x2010;20) and EMP_6 (mean 14.2, SD 4.5; range 3&#x2010;20) stations demonstrated excellent discrimination capabilities. Other stations showing this desirable combination of high reliability and strong discrimination included ADV_18 (&#x03B1;=0.8925; mean 12.6, SD 4.1) and ER_6.1 (&#x03B1;=0.88; mean 14.2, SD 4.0). These stations may serve as exemplars for future MMI station development. Their structure was characterized by clear and focused prompts, as well as well-defined evaluation criteria, contributing to consistent assessment across different evaluators. The high reliability and discriminatory ability of these stations suggested they were effective in eliciting and measuring the targeted competencies in a robust and reproducible manner.</p><p>Conversely, some stations with acceptable reliability showed limited discrimination capability. For example, ADV_21 (&#x03B1;=0.75; mean score 17.3, SD 2.3) and MOT_12 (&#x03B1;=0.79; mean score 17.6, SD 2.1) exhibited ceiling effects, with most candidates scoring in the upper range of the scale, limiting their ability to distinguish between high-performing candidates (<xref ref-type="table" rid="table2">Table 2</xref>). 
While these stations demonstrated reasonable consistency, their restricted score distributions suggested they may be too easy or lack sufficient challenge to effectively discriminate between candidates of differing ability levels.</p><p>Notably, the traditionally developed station COLL_14 demonstrated poor performance across both metrics, with critically low reliability at a Cronbach &#x03B1; of 0.3925, well below the acceptable threshold, and poor discrimination (mean 17.3, SD 1.6; range 15&#x2010;20), with scores clustering near the maximum possible score (<xref ref-type="table" rid="table2">Table 2</xref>). Although this station had a limited sample size (18/824, 2%), the combination of poor reliability and severely restricted score range suggested fundamental issues with station design or implementation that warranted complete revision or replacement.</p><p>In comparison, the lowest-performing AI-generated station, ER_26, showed suboptimal reliability with a Cronbach &#x03B1; of 0.6876, but maintained reasonable discrimination (mean 15.9, SD 2.7; range 8&#x2010;20), suggesting that while this station requires refinement for internal consistency, it retains adequate discriminative function across the scoring range.</p><p>It should be noted that stations ADV_21, MOT_12, and COLL_14 had fewer than 50 candidate responses (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>), which can result in less stable reliability estimates. Thus, station-level Cronbach &#x03B1; values for these stations should be interpreted with caution. Nevertheless, taken together, these results demonstrate that both AI-generated and traditional development methods are capable of producing stations across the full spectrum of psychometric quality, from exemplary stations that excel in both reliability and discrimination to problematic stations requiring revision or replacement. 
The findings suggest that station effectiveness depends primarily on individual design characteristics and implementation rather than the development methodology used.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study compared the psychometric performance of AI-generated and traditionally developed MMI stations used in a high-stakes medical school admissions process. AI-generated stations demonstrated internal consistency reliability comparable to traditional stations, with no statistically significant differences in Cronbach &#x03B1;. A greater proportion of AI-generated stations were classified as optimal for internal consistency, while fewer fell within the review category. Both station types showed variable discrimination ability, indicating that overall station quality depended on individual design characteristics rather than development method. Taken together, these findings support psychometric equivalence between AI-generated and traditionally developed MMI stations rather than evidence of superiority.</p></sec><sec id="s4-2"><title>Implications for the Use of AI in Medical School Admissions</title><p>AI applications in medical education have shown promising results across various domains, including automated scoring of clinical examinations, natural language processing for medical curriculum development, and generation of multiple-choice questions for medical licensing examinations [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. Similarly, AI has been increasingly integrated into admissions processes across professional programs, with applications ranging from automated essay scoring in standardized tests to predictive modeling of student success [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. 
This study extends this emerging body of work by demonstrating that AI can produce psychometrically sound assessment content for high-stakes medical school selection processes. Unlike much of the existing literature, which focuses on AI for prediction or automated scoring of applicant performance, this study examines the use of AI for content generation, specifically the development of MMI stations themselves.</p><p>Our results showed that AI-generated stations demonstrated comparable internal consistency to traditionally developed stations (mean &#x03B1;=0.82, SD 0.06 vs mean &#x03B1;=0.81, SD 0.09; <italic>P</italic>=.91). Of note, a greater proportion of AI-generated stations were classified as optimal (&#x03B1;&#x2265;0.85), while fewer were classified in the review category (&#x03B1;&#x003C;0.75) compared with traditional stations. However, this result is merely descriptive and should be interpreted with caution. The CIs around these proportions were wide and substantially overlapping, reflecting the small number of stations and indicating that the apparent differences are not statistically or practically robust. Nevertheless, these findings highlight the potential value of incorporating AI into admissions processes and provide a basis for examining how such tools might be further optimized and integrated into future selection frameworks.</p><p>Our findings align with broader research on AI-generated assessment content, which has consistently shown that the quality of AI-produced items depends more on human input in aspects including prompt engineering, content validation processes, and postgeneration refinement than on the AI system itself [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. 
Studies comparing AI-generated and human-developed test items in educational contexts have similarly found equivalent psychometric properties when appropriate quality assurance measures are implemented [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. This suggests that the observed variability in both AI-generated and traditional MMI stations reflects fundamental challenges in assessment development rather than limitations specific to either approach.</p><p>Analysis of discrimination capability revealed that both AI-generated and traditionally developed stations demonstrated variable performance in their ability to distinguish between candidates of different competency levels. Some stations from each development method exhibited excellent combinations of high reliability and strong discrimination, including AI-generated station EMP_17 and traditionally developed station EMP_6. Conversely, other stations from both approaches showed acceptable reliability but limited discrimination due to ceiling effects, where most candidates scored in the upper range of the scale. This variability suggests that station effectiveness depended primarily on individual design characteristics and implementation quality rather than the development methodology used.</p><p>Taken together, these findings underscore that while the AI-generated MMI development process shows great promise, it should be viewed as a complementary tool within a broader, evidence-based framework for admissions assessment rather than a replacement for expert oversight and continual quality assurance. 
Our observation that both development methods can produce stations across the full spectrum of psychometric quality emphasizes that ongoing review, calibration, and refinement remain essential for ensuring that all stations, irrespective of their origin, meet the rigorous standards necessary for fair and effective candidate evaluation.</p><p>A notable limitation of this analysis was the small sample size for certain stations (n&#x003C;50), particularly for some of the new AI-generated stations. Smaller sample sizes can result in less stable reliability estimates, necessitating cautious interpretation of individual station-level Cronbach &#x03B1; values when analyzing the performance of specific stations. Although sensitivity analyses excluding small sample size (n&#x003C;50) stations attenuated category-based differences, future reliability assessments should aim for minimum sample sizes of n&#x2265;50 per station to ensure more robust and stable estimates.</p><p>Nevertheless, the implications of these findings are particularly significant when considered in the context of the overall MMI station development process, which has traditionally been time-consuming, resource intensive, and reliant on substantial faculty input. Crafting high-quality stations requires not only content expertise but also careful attention to fairness, clarity, and alignment with assessment objectives; factors that can introduce variability and potential bias. The fact that AI-generated stations performed comparably to, and in some cases more reliably and with better discrimination ability than, traditionally developed stations suggests that AI may offer a scalable and efficient alternative for generating high-quality assessment scenarios. By streamlining the development process and reducing reliance on individual judgment, AI has the potential to support both the efficiency and equity of MMI design, while preserving the psychometric robustness of the assessment. 
Future research should formally evaluate whether AI-generated stations are truly noninferior to traditional stations regarding reliability, validity, and fairness, ideally through prospective, multi-institutional studies with larger sample sizes.</p></sec><sec id="s4-3"><title>Recommendations for Using AI in Future MMI-Based Admissions Processes</title><p>In this study, we have identified several actionable strategies to use AI to achieve quality and consistency in MMI station development. Drawing on the observed reliability data from both AI-generated and traditionally developed stations, the following recommendations are intended to guide future improvements in MMI design and implementation.</p><p>The continued use of AI-generated station development is supported by the data. AI-generated stations demonstrated excellent internal consistency and discrimination capability, indicating that this approach can produce high-quality assessment tools. Sustaining and expanding the use of AI in MMI design may enhance the efficiency and reliability of future station development while maintaining psychometric quality.</p><p>To preserve the psychometric integrity of the MMI system over time, a formal process for routine monitoring of both reliability and discrimination ability should be established. This process should involve the regular calculation of Cronbach &#x03B1; alongside assessment of score distributions and discrimination metrics, particularly for newly introduced or modified stations. Systematic monitoring will enable the timely identification of underperforming stations and facilitate data-informed decisions regarding their revision or replacement.</p><p>We further recommend standardization in station design. 
High-performing stations that demonstrate excellent reliability combined with strong discrimination capability (eg, those with &#x03B1;&#x2265;0.85 and high SDs, such as EMP_17 and EMP_6 in this study) should be used as templates to guide the development of future scenarios. Consistency in station structure, response criteria, and intended competencies is likely to improve fairness and reduce variability across MMI stations.</p><p>Stations that demonstrate below acceptable reliability (&#x03B1;&#x003C;0.75) or limited discrimination capability should be prioritized for immediate review. This includes stations with low Cronbach &#x03B1; values (such as COLL_14) as well as those exhibiting ceiling effects that limit their discriminatory power (such as ADV_21 and MOT_12). Where appropriate, these stations should be revised or removed from the selection process. Additionally, for stations with limited response data (n&#x003C;50), further data collection is necessary to obtain more stable reliability estimates and support evidence-based decision-making.</p><p>Finally, as the role of AI in MMI development expands, continual refinement with contemporary AI models will be essential. Improving the quality and contextual alignment of AI-generated content will require sustained collaboration between AI researchers and medical education professionals. This interdisciplinary approach will be critical to ensuring that AI tools not only support psychometric reliability but also align with the pedagogical and professional goals of medical school admissions.</p></sec><sec id="s4-4"><title>Conclusions</title><p>In this paper, we provide, to our knowledge, the first empirical evaluation of AI-generated MMI stations implemented in a real-world medical school admissions process, offering evidence supporting the use of AI-assisted methods in MMI station development. 
AI-generated stations demonstrated comparable reliability and discrimination capability relative to traditional, entirely human-developed stations, with a higher proportion achieving optimal internal consistency and fewer falling below acceptable thresholds. Given the time- and resource-intensive nature of traditional station development, our findings suggest that AI may offer a scalable, efficient, and psychometrically sound alternative.</p><p>However, the observed variability in reliability and discrimination ability across both AI-generated and traditional stations underscores the necessity of ongoing quality assurance. Regular psychometric evaluation and iterative refinement remain essential to maintaining the validity and fairness of the MMI process. Looking ahead, the integration of AI into medical school admissions may extend beyond station generation to include broader applications such as predictive modeling of applicant success. Nonetheless, any expansion of AI use must be guided by careful consideration of ethical concerns, including transparency, algorithmic bias, and equity, to ensure that admissions systems remain fair and inclusive.</p></sec></sec></body><back><ack><p>The authors thank the admissions and administrative teams involved in the delivery of the 2025 multiple mini interview process.</p></ack><notes><sec><title>Funding</title><p>No external funding was received for this study.</p></sec><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>ML, WH, and SB conceived and designed the study. KC provided advice on artificial intelligence (AI) tools and prompt engineering used in the development of AI-generated stations. KB collected and curated the dataset. KB, SH, and MNH conducted the data analyses. SH drafted the manuscript with input from all authors. 
All authors contributed to the interpretation of the findings, critically revised the manuscript, and approved the final version.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">MMI</term><def><p>multiple mini interview</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ayub</surname><given-names>R</given-names> </name><name name-style="western"><surname>Yousuf</surname><given-names>N</given-names> </name><name name-style="western"><surname>Asad</surname><given-names>M</given-names> </name><name name-style="western"><surname>Khan</surname><given-names>UA</given-names> </name></person-group><article-title>Multiple mini interviews as a measure of non-cognitive skills for admissions into undergraduate medical education programme in Pakistan: a validity study</article-title><source>J Pak Med Assoc</source><year>2017</year><month>12</month><volume>67</volume><issue>12</issue><fpage>1905</fpage><lpage>1909</lpage><pub-id pub-id-type="medline">29256539</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Eva</surname><given-names>KW</given-names> </name><name name-style="western"><surname>Reiter</surname><given-names>HI</given-names> </name><name name-style="western"><surname>Rosenfeld</surname><given-names>J</given-names> </name><name name-style="western"><surname>Norman</surname><given-names>GR</given-names> </name></person-group><article-title>The ability of the multiple mini-interview to predict preclerkship performance in medical school</article-title><source>Acad 
Med</source><year>2004</year><month>10</month><volume>79</volume><issue>10 Suppl</issue><fpage>S40</fpage><lpage>S42</lpage><pub-id pub-id-type="doi">10.1097/00001888-200410001-00012</pub-id><pub-id pub-id-type="medline">15383385</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Eva</surname><given-names>KW</given-names> </name><name name-style="western"><surname>Rosenfeld</surname><given-names>J</given-names> </name><name name-style="western"><surname>Reiter</surname><given-names>HI</given-names> </name><name name-style="western"><surname>Norman</surname><given-names>GR</given-names> </name></person-group><article-title>An admissions OSCE: the multiple mini-interview</article-title><source>Med Educ</source><year>2004</year><month>03</month><volume>38</volume><issue>3</issue><fpage>314</fpage><lpage>326</lpage><pub-id pub-id-type="doi">10.1046/j.1365-2923.2004.01776.x</pub-id><pub-id pub-id-type="medline">14996341</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rees</surname><given-names>EL</given-names> </name><name name-style="western"><surname>Hawarden</surname><given-names>AW</given-names> </name><name name-style="western"><surname>Dent</surname><given-names>G</given-names> </name><name name-style="western"><surname>Hays</surname><given-names>R</given-names> </name><name name-style="western"><surname>Bates</surname><given-names>J</given-names> </name><name name-style="western"><surname>Hassell</surname><given-names>AB</given-names> </name></person-group><article-title>Evidence regarding the utility of multiple mini-interview (MMI) for selection to undergraduate health programs: a BEME systematic review: BEME guide no. 
37</article-title><source>Med Teach</source><year>2016</year><month>05</month><volume>38</volume><issue>5</issue><fpage>443</fpage><lpage>455</lpage><pub-id pub-id-type="doi">10.3109/0142159X.2016.1158799</pub-id><pub-id pub-id-type="medline">27050026</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yusoff</surname><given-names>MS</given-names> </name></person-group><article-title>Multiple mini interview as an admission tool in higher education: insights from a systematic review</article-title><source>J Taibah Univ Med Sci</source><year>2019</year><month>06</month><volume>14</volume><issue>3</issue><fpage>203</fpage><lpage>240</lpage><pub-id pub-id-type="doi">10.1016/j.jtumed.2019.03.006</pub-id><pub-id pub-id-type="medline">31435411</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ali</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sadiq Hashmi</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Umair</surname><given-names>M</given-names> </name><name name-style="western"><surname>Beg</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Huda</surname><given-names>N</given-names> </name></person-group><article-title>Multiple mini-interviews: current perspectives on utility and limitations</article-title><source>Adv Med Educ Pract</source><year>2019</year><volume>10</volume><fpage>1031</fpage><lpage>1038</lpage><pub-id pub-id-type="doi">10.2147/AMEP.S181332</pub-id><pub-id pub-id-type="medline">31849557</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gordon</surname><given-names>M</given-names> 
</name><name name-style="western"><surname>Daniel</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ajiboye</surname><given-names>A</given-names> </name><etal/></person-group><article-title>A scoping review of artificial intelligence in medical education: BEME guide no. 84</article-title><source>Med Teach</source><year>2024</year><month>04</month><volume>46</volume><issue>4</issue><fpage>446</fpage><lpage>470</lpage><pub-id pub-id-type="doi">10.1080/0142159X.2024.2314198</pub-id><pub-id pub-id-type="medline">38423127</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Keir</surname><given-names>G</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Filippi</surname><given-names>CG</given-names> </name><name name-style="western"><surname>Ellenbogen</surname><given-names>L</given-names> </name><name name-style="western"><surname>Woldenberg</surname><given-names>R</given-names> </name></person-group><article-title>Using artificial intelligence in medical school admissions screening to decrease inter- and intra-observer variability</article-title><source>JAMIA Open</source><year>2023</year><volume>6</volume><issue>1</issue><fpage>ooad011</fpage><pub-id pub-id-type="doi">10.1093/jamiaopen/ooad011</pub-id><pub-id pub-id-type="medline">36819893</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>R</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>T</given-names> </name></person-group><article-title>Application of artificial intelligence generated content in medical examinations</article-title><source>Adv Med Educ 
Pract</source><year>2025</year><volume>16</volume><fpage>331</fpage><lpage>339</lpage><pub-id pub-id-type="doi">10.2147/AMEP.S492895</pub-id><pub-id pub-id-type="medline">40026780</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gallegos</surname><given-names>IO</given-names> </name><name name-style="western"><surname>Rossi</surname><given-names>RA</given-names> </name><name name-style="western"><surname>Barrow</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Bias and fairness in large language models: a survey</article-title><source>Comput Linguist</source><year>2024</year><month>09</month><volume>50</volume><issue>3</issue><fpage>1097</fpage><lpage>1179</lpage><pub-id pub-id-type="doi">10.1162/coli_a_00524</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cronbach</surname><given-names>LJ</given-names> </name><name name-style="western"><surname>Warrington</surname><given-names>WG</given-names> </name></person-group><article-title>Time-limit tests: estimating their reliability and degree of speeding</article-title><source>Psychometrika</source><year>1951</year><month>06</month><volume>16</volume><issue>2</issue><fpage>167</fpage><lpage>188</lpage><pub-id pub-id-type="doi">10.1007/BF02289113</pub-id><pub-id pub-id-type="medline">14844557</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tavakol</surname><given-names>M</given-names> </name><name name-style="western"><surname>Dennick</surname><given-names>R</given-names> </name></person-group><article-title>Making sense of Cronbach&#x2019;s alpha</article-title><source>Int J Med 
Educ</source><year>2011</year><month>06</month><day>27</day><volume>2</volume><fpage>53</fpage><lpage>55</lpage><pub-id pub-id-type="doi">10.5116/ijme.4dfb.8dfd</pub-id><pub-id pub-id-type="medline">28029643</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Downing</surname><given-names>SM</given-names> </name></person-group><article-title>Reliability: on the reproducibility of assessment data</article-title><source>Med Educ</source><year>2004</year><month>09</month><volume>38</volume><issue>9</issue><fpage>1006</fpage><lpage>1012</lpage><pub-id pub-id-type="doi">10.1111/j.1365-2929.2004.01932.x</pub-id><pub-id pub-id-type="medline">15327684</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Revelle</surname><given-names>W</given-names> </name></person-group><article-title>Psych: procedures for psychological, psychometric, and personality research</article-title><source>The Comprehensive R Archive Network</source><year>2025</year><access-date>2026-05-28</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://CRAN.R-project.org/package=psych">https://CRAN.R-project.org/package=psych</ext-link></comment></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Narayanan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ramakrishnan</surname><given-names>R</given-names> </name><name name-style="western"><surname>Durairaj</surname><given-names>E</given-names> </name><name name-style="western"><surname>Das</surname><given-names>A</given-names> </name></person-group><article-title>Artificial intelligence revolutionizing the field of medical 
education</article-title><source>Cureus</source><year>2023</year><month>11</month><volume>15</volume><issue>11</issue><fpage>e49604</fpage><pub-id pub-id-type="doi">10.7759/cureus.49604</pub-id><pub-id pub-id-type="medline">38161821</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rinc&#x00F3;n</surname><given-names>EH</given-names> </name><name name-style="western"><surname>Jimenez</surname><given-names>D</given-names> </name><name name-style="western"><surname>Aguilar</surname><given-names>LA</given-names> </name><name name-style="western"><surname>Fl&#x00F3;rez</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Tapia</surname><given-names>&#x00C1;E</given-names> </name><name name-style="western"><surname>Pe&#x00F1;uela</surname><given-names>CL</given-names> </name></person-group><article-title>Mapping the use of artificial intelligence in medical education: a scoping review</article-title><source>BMC Med Educ</source><year>2025</year><month>04</month><day>12</day><volume>25</volume><issue>1</issue><fpage>526</fpage><pub-id pub-id-type="doi">10.1186/s12909-025-07089-8</pub-id><pub-id pub-id-type="medline">40221725</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lira</surname><given-names>B</given-names> </name><name name-style="western"><surname>Gardner</surname><given-names>M</given-names> </name><name name-style="western"><surname>Quirk</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Using artificial intelligence to assess personal qualities in college admissions</article-title><source>Sci Adv</source><year>2023</year><month>10</month><day>13</day><volume>9</volume><issue>41</issue><fpage>eadg9405</fpage><pub-id 
pub-id-type="doi">10.1126/sciadv.adg9405</pub-id><pub-id pub-id-type="medline">37824610</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Van Busum</surname><given-names>K</given-names> </name><name name-style="western"><surname>Fang</surname><given-names>S</given-names> </name></person-group><article-title>Analysis of AI models for student admissions: a case study</article-title><source>SAC &#x2019;23: Proceedings of the 38th ACM/SIGAPP Symposium on Applied Computing</source><year>2023</year><publisher-name>Association for Computing Machinery</publisher-name><fpage>17</fpage><lpage>22</lpage></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nikolovski</surname><given-names>V</given-names> </name><name name-style="western"><surname>Trajanov</surname><given-names>D</given-names> </name><name name-style="western"><surname>Chorbev</surname><given-names>I</given-names> </name></person-group><article-title>Advancing AI in higher education: a comparative study of large language model-based agents for exam question generation, improvement, and evaluation</article-title><source>Algorithms</source><year>2025</year><volume>18</volume><issue>3</issue><fpage>144</fpage><pub-id pub-id-type="doi">10.3390/a18030144</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bhandari</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kwak</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Pardos</surname><given-names>ZA</given-names> </name></person-group><article-title>Evaluating the 
psychometric properties of ChatGPT-generated questions</article-title><source>Comput Educ Artif Intell</source><year>2024</year><month>12</month><volume>7</volume><fpage>100284</fpage><pub-id pub-id-type="doi">10.1016/j.caeai.2024.100284</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>O</surname><given-names>KM</given-names> </name></person-group><article-title>A comparative study of AI-human-made and human-made test forms for a university TESOL theory course</article-title><source>Lang Test Asia</source><year>2024</year><volume>14</volume><fpage>19</fpage><pub-id pub-id-type="doi">10.1186/s40468-024-00291-3</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Westacott</surname><given-names>R</given-names> </name><name name-style="western"><surname>Badger</surname><given-names>K</given-names> </name><name name-style="western"><surname>Kluth</surname><given-names>D</given-names> </name><name name-style="western"><surname>Gurnell</surname><given-names>M</given-names> </name><name name-style="western"><surname>Reed</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Sam</surname><given-names>AH</given-names> </name></person-group><article-title>Automated Item Generation: impact of item variants on performance and standard setting</article-title><source>BMC Med Educ</source><year>2023</year><month>09</month><day>11</day><volume>23</volume><issue>1</issue><fpage>659</fpage><pub-id pub-id-type="doi">10.1186/s12909-023-04457-0</pub-id><pub-id pub-id-type="medline">37697275</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Example traditional and artificial intelligence&#x2013;generated multiple mini interview stations 
from the collaboration domain, including scenario prompts and structured questions.</p><media xlink:href="mededu_v12i1e86208_app1.docx" xlink:title="DOCX File, 160 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Detailed summary of multiple mini interview stations used in the 2025 Direct and Graduate Entry Medicine Program admissions cycle.</p><media xlink:href="mededu_v12i1e86208_app2.docx" xlink:title="DOCX File, 1680 KB"/></supplementary-material></app-group></back></article>