<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id><journal-id journal-id-type="publisher-id">mededu</journal-id><journal-id journal-id-type="index">20</journal-id><journal-title>JMIR Medical Education</journal-title><abbrev-journal-title>JMIR Med Educ</abbrev-journal-title><issn pub-type="epub">2369-3762</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v12i1e75516</article-id><article-id pub-id-type="doi">10.2196/75516</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Comparing the Weighted Gain Score and a Rasch-Based Approach for Estimating Learning Outcomes in Medical Education: Quantitative Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Aliyev</surname><given-names>Rauf</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Backhaus</surname><given-names>Joy</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hammer</surname><given-names>Silke</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>K&#x00F6;nig</surname><given-names>Sarah</given-names></name><degrees>MME, 
MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Institute of Medical Teaching and Medical Education Research, University Hospital W&#x00FC;rzburg</institution><addr-line>Josef-Schneider-Str. 2/D6</addr-line><addr-line>W&#x00FC;rzburg</addr-line><country>Germany</country></aff><aff id="aff2"><institution>Institute of Diagnostic and Interventional Radiology, University Hospital W&#x00FC;rzburg</institution><addr-line>W&#x00FC;rzburg</addr-line><country>Germany</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Bahattab</surname><given-names>Awsan</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Shafi</surname><given-names>Muhammad Saeed</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Alzaabi</surname><given-names>Shaikha</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Valencia-Perez</surname><given-names>T A</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Sarah K&#x00F6;nig, MME, MD, Institute of Medical Teaching and Medical Education Research, University Hospital W&#x00FC;rzburg, Josef-Schneider-Str. 
2/D6, W&#x00FC;rzburg, 97080, Germany, +49 931 201 55210, +49 931 201 655213; <email>Koenig_Sarah@ukw.de</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>16</day><month>6</month><year>2026</year></pub-date><volume>12</volume><elocation-id>e75516</elocation-id><history><date date-type="received"><day>18</day><month>05</month><year>2025</year></date><date date-type="rev-recd"><day>19</day><month>03</month><year>2026</year></date><date date-type="accepted"><day>22</day><month>04</month><year>2026</year></date></history><copyright-statement>&#x00A9; Rauf Aliyev, Joy Backhaus, Silke Hammer, Sarah K&#x00F6;nig. Originally published in JMIR Medical Education (<ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org">https://mededu.jmir.org</ext-link>), 16.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org/">https://mededu.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mededu.jmir.org/2026/1/e75516"/><abstract><sec><title>Background</title><p>Pretest-posttest designs are widely used to estimate learning gain in studies evaluating educational interventions in medical education. 
The Weighted Gain Score (WGS) was proposed to reduce bias associated with differences in baseline performance.</p></sec><sec><title>Objective</title><p>This study evaluated the statistical and inferential properties of the WGS by comparing it to Rasch Learning Gain (RLG) across 3 datasets.</p></sec><sec sec-type="methods"><title>Methods</title><p>The WGS implements a weighting coefficient that includes the parameter &#x00B5;, which linearly rescales the difference between pretest and posttest percentage scores. We examined the effect of varying &#x00B5; (30, 50, and 70) on learning gain calculations and compared the results with those obtained using RLG. The following three datasets were analyzed: (1) a small illustrative dataset demonstrating the mathematical behavior of the WGS, (2) an empirical dataset from a previous educational evaluation study, and (3) a randomly generated binomial dataset designed to examine the metric under larger sample conditions.</p></sec><sec sec-type="results"><title>Results</title><p>Changing the parameter &#x00B5; in the WGS affected the magnitude of the calculated learning gains: lower &#x00B5;-values produced larger gain estimates, whereas higher &#x00B5;-values produced smaller estimates. Despite these differences in scale, the WGS and RLG correlated strongly in both the empirical dataset (<italic>r</italic>=0.93; <italic>P</italic>&#x003C;.001) and the simulated dataset (<italic>r</italic>=0.92; <italic>P</italic>&#x003C;.001); variation in &#x00B5; did not alter the inferential results. Both methods identified the same interaction effect in the empirical dataset.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The WGS produced results highly consistent with those of RLG while requiring substantially lower computational complexity. 
The metric can be applied to both small and large datasets and allows &#x00B5; to function as an adjustment coefficient for calibrating learning gain estimates across cohorts without altering inferential conclusions.</p></sec></abstract><kwd-group><kwd>medical education</kwd><kwd>teaching quality</kwd><kwd>curriculum evaluation</kwd><kwd>learning gain</kwd><kwd>pretest-posttest design</kwd><kwd>Rasch model</kwd><kwd>Weighted Gain Score</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Teaching quality in medical education is a complex construct encompassing curriculum design, instructional methods, teaching expertise, learner engagement, and assessment practices [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref4">4</xref>]. High-quality teaching in this context contributes to the development of competent physicians and thereby influences the quality of patient care [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>Among the various aspects of teaching quality in medical education, student learning outcomes represent one measurable indicator frequently used in program evaluation and educational research [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref11">11</xref>]. However, interpreting learning outcomes as indicators of teaching effectiveness requires caution, as they are influenced by multiple factors beyond instructional quality. These include student motivation, prior knowledge, learning strategies, teacher enthusiasm, and learning activities occurring outside the formal curriculum [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. To account for these influences, educational research often focuses not only on absolute performance but also on changes in performance over time. 
The concept of learning gain represents a widely used approach to capturing students&#x2019; learning progress. In educational research, learning gain is commonly operationalized by assessing students before (pretest) and after (posttest) an educational intervention. The difference between pretest and posttest scores is then interpreted as an indicator of learning gain attributable, at least in part, to the educational intervention [<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref16">16</xref>]. However, calculating learning gain is not trivial, as simple difference scores may lead to biased estimates depending on students&#x2019; baseline knowledge. One simple approach is raw gain, which is calculated as the arithmetic difference between posttest and pretest scores. However, raw gain scores exhibit a negative correlation with baseline performance (ie, pretest scores) and are also affected by ceiling effects, meaning that students with lower pretest scores may appear to exhibit larger gains simply because they have more room for improvement [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref19">19</xref>].</p><p>To address these limitations, several modified gain metrics have been proposed. One widely used approach is the normalized gain introduced by Hake [<xref ref-type="bibr" rid="ref20">20</xref>], which expresses the observed pretest-posttest gain relative to the maximum possible gain. Although this metric has been applied extensively in educational research [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>], it also has important methodological limitations. 
It remains dependent on baseline performance, may inflate gains for students with high pretest scores, and behaves inconsistently when posttest scores fall below pretest scores or when pretest scores approach the maximum value [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref23">23</xref>].</p><p>Taken together, existing gain metrics may distort estimates of learning gain, particularly in cohorts with heterogeneous baseline knowledge. Many of these metrics either remain strongly dependent on baseline performance or require complex psychometric modeling. This highlights the need for approaches that provide statistically robust yet practically applicable estimates of learning gain in educational evaluation.</p><p>A recently proposed metric developed by our workgroup, the &#x201C;Weighted Gain Score&#x201D; (WGS), aims to address these limitations by applying a weighting coefficient that adjusts gain calculations according to students&#x2019; baseline performance [<xref ref-type="bibr" rid="ref16">16</xref>]. However, the statistical and inferential properties of this metric have not yet been systematically investigated. To address this gap, we evaluated the WGS by comparing it with Rasch Learning Gain (RLG), a Rasch model&#x2013;based approach for estimating learning gain that served as the benchmark in our study [<xref ref-type="bibr" rid="ref24">24</xref>]. 
Specifically, we addressed the following research questions:</p><list list-type="bullet"><list-item><p>Does the WGS produce inferential results comparable to those produced by RLG?</p></list-item><list-item><p>Can the parameter &#x00B5; in the WGS be adjusted for different cohorts to calibrate learning gain calculations without altering inferential conclusions?</p></list-item></list><p>Through this analysis, we aimed to clarify the statistical behavior of the WGS and explore its potential applicability for the evaluation of educational interventions.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Metric WGS</title><p>The mathematical foundation of the WGS lies in the use of the weighting coefficient &#x201C;pre/&#x00B5;,&#x201D; which linearly transforms the difference between pretest and posttest percentage scores (denoted as &#x201C;pre&#x201D; and &#x201C;post&#x201D; in equation 1), thereby adjusting for pretest variability [<xref ref-type="bibr" rid="ref16">16</xref>]. 
Formally, the WGS is defined as:</p><disp-formula id="E1"><label>(1)</label><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>W</mml:mi><mml:mi>G</mml:mi><mml:mi>S</mml:mi><mml:mo>=</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x00D7;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mi>&#x03BC;</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>To illustrate the computation, consider a hypothetical student with a pretest score of 40% and a posttest score of 70%.</p><p>For &#x00B5;=50, the WGS is calculated as: WGS = (70 &#x2212; 40) &#x00D7; (40/50) = 30 &#x00D7; 0.8 = 24.</p><p>If &#x00B5; is increased to 70, the same performance yields: WGS = (70 &#x2212; 40) &#x00D7; (40/70) = 30 &#x00D7; 0.5714 &#x2248; 17.14.</p><p>This example illustrates that increasing &#x00B5; reduces the magnitude of the calculated gain while preserving the relative ordering of observations. When posttest scores fall below pretest scores, the WGS assumes negative values, indicating a decrease in performance.</p><p>Originally, the parameter &#x00B5; used in the weighting coefficient was defined as the average pretest score of a cohort. It was constrained to integer values between 1 and 100, consistent with the percentage format of &#x201C;pre&#x201D; and &#x201C;post.&#x201D; In the original formulation, its value was set at 50 as a default reference value [<xref ref-type="bibr" rid="ref16">16</xref>]. In this study, &#x00B5; is interpreted as an adjustment coefficient that functions as a scaling parameter for learning gain calculations. 
Changing its value proportionally rescales the calculated gain scores: higher values of &#x00B5; lead to smaller gain estimates, whereas lower values produce larger gain estimates. Importantly, this modification represents a linear transformation of the calculated values and therefore does not alter the underlying statistical relationships among observations.</p><p>To examine the influence of this parameter on the stability of the WGS, we tested 3 calibration levels in our datasets: &#x00B5;=30, &#x00B5;=50, and &#x00B5;=70. These values represent 3 nonextreme points within the possible range of 1 to 100, allowing us to evaluate the behavior of the WGS across low, moderate, and high scaling conditions.</p></sec><sec id="s2-2"><title>Rasch Model and RLG</title><p>The Rasch model is a fundamental concept in modern psychometric measurement. The probability that a student answers a specific item correctly depends on 2 key factors: the student&#x2019;s ability and the difficulty of the item. In the Rasch framework, a student&#x2019;s latent ability is denoted by &#x03B8;, whereas item difficulty is represented by &#x03B2;. When a student&#x2019;s ability exceeds the difficulty of an item, the probability of answering correctly increases, and vice versa [<xref ref-type="bibr" rid="ref25">25</xref>]. Because the Rasch model allows the estimation of individual students&#x2019; abilities independently of the specific test items used, it is widely applied in educational measurement and medical education research [<xref ref-type="bibr" rid="ref26">26</xref>]. With this in mind, we selected RLG as a reference method for evaluating the WGS.</p><p>We applied the dichotomous 1-parameter logistic Rasch model. Item parameters were estimated using conditional maximum likelihood estimation. 
On the basis of the fitted model, person abilities were subsequently calculated using maximum likelihood estimation separately for the pretest (&#x03B8;<sub>pre</sub>) and posttest (&#x03B8;<sub>post</sub>) data [<xref ref-type="bibr" rid="ref27">27</xref>].</p><p>As indicated in equation 2, RLG was defined as the difference between the estimated posttest and pretest abilities. This difference represents the change in latent ability on the Rasch measurement scale and serves as an estimate of individual learning gain across the instructional intervention [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref28">28</xref>].</p><disp-formula id="E2"><label>(2)</label><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>R</mml:mi><mml:mi>L</mml:mi><mml:mi>G</mml:mi><mml:mo>=</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>&#x03B8;</mml:mi><mml:mrow><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>&#x03B8;</mml:mi><mml:mrow><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>To ensure the validity of Rasch-based ability estimates, we examined global model fit indicators. Item infit and outfit statistics ranged between 0.7 and 1.3, which is generally considered acceptable for the Rasch model. 
In addition, person reliability exceeded 0.8, and separation indices were &#x003E;2, indicating satisfactory measurement precision.</p></sec><sec id="s2-3"><title>Datasets</title><p>Three datasets were used to examine the behavior of the WGS under different analytical conditions:</p><list list-type="order"><list-item><p>The illustrative dataset (n=10): a small artificial dataset designed to illustrate the mathematical behavior of the parameter &#x00B5; within the WGS metric</p></list-item><list-item><p>The empirical dataset (n=170): a dataset consisting of real-world data derived from a previously published educational evaluation study [<xref ref-type="bibr" rid="ref29">29</xref>], used to examine the behavior of the WGS under authentic educational conditions and to perform inferential statistical analyses</p></list-item><list-item><p>The simulated dataset (n=1000): a randomly generated binomial dataset designed to mirror the structure of the empirical dataset while providing a larger sample size, allowing the behavior of the parameter &#x00B5; to be examined independently of the empirical data</p></list-item></list></sec><sec id="s2-4"><title>The Illustrative Dataset</title><p>Following the design of the simulated dataset in our previous study [<xref ref-type="bibr" rid="ref16">16</xref>], we created an artificial dataset by combining different pretest scores with varying levels of raw gain in test performance, defined as the absolute difference between posttest and pretest scores. Pretest scores ranged from 1 to 10 points, and the gain in performance was simulated by increasing test scores by 1 to 4 points. To avoid potential ceiling effects, the analysis included only combinations in which the sum of pretest scores and the simulated gains did not exceed the maximum of 10 points. The sample size of the illustrative dataset was set at 10. 
RLG was not applicable here, as Rasch model&#x2013;based estimation requires larger sample sizes to obtain stable parameter estimates [<xref ref-type="bibr" rid="ref30">30</xref>].</p></sec><sec id="s2-5"><title>The Empirical Dataset</title><p>The empirical dataset originated from a prospective educational study conducted at the University Medical Center G&#x00F6;ttingen in G&#x00F6;ttingen, Germany [<xref ref-type="bibr" rid="ref29">29</xref>]. The study compared the learning gain of students attending a traditional lecture on goiter with that of students using a corresponding video podcast (vodcast) within the teaching module &#x201C;Operative Medicine.&#x201D; The study was conducted over 2 consecutive semesters using a pretest-posttest design based on 9 multiple-choice test items. A total of 170 students participated. Students were additionally surveyed regarding their learning dispositions, which resulted in the classification of participants into 2 groups: &#x201C;traditional learners&#x201D; and &#x201C;digital natives.&#x201D; A total of 35 students (20.59%) could not be clearly assigned to either group and were therefore excluded from group-based analyses. 
Consequently, 135 (79.41%) students were included in the 2-way ANOVA examining the interaction between teaching format and learning disposition (<xref ref-type="table" rid="table1">Table 1</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Distribution of students according to teaching format and learning disposition in the empirical and simulated datasets.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Datasets and teaching formats</td><td align="left" valign="bottom">Traditional learners, n (%)</td><td align="left" valign="bottom">Digital natives, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="3">Empirical dataset (N=135)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Lecture</td><td align="left" valign="top">38 (28.15)</td><td align="left" valign="top">34 (25.19)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Vodcast</td><td align="left" valign="top">28 (20.74)</td><td align="left" valign="top">35 (25.93)</td></tr><tr><td align="left" valign="top" colspan="3">Simulated dataset (N=1000)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Lecture</td><td align="left" valign="top">259 (25.9)</td><td align="left" valign="top">210 (21)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Vodcast</td><td align="left" valign="top">250 (25)</td><td align="left" valign="top">281 (28.1)</td></tr></tbody></table></table-wrap></sec><sec id="s2-6"><title>The Simulated Dataset</title><p>The simulated dataset was generated using a random binomial distribution, assuming a 50% probability of correctly answering a hypothetical examination item. 
This probability was applied to each of the 9 multiple-choice items in both the pretest and the posttest, reflecting the structure of the &#x201C;empirical dataset.&#x201D; Apart from the larger sample size, the primary difference between the empirical and simulated datasets was the random allocation of group variables. Two variables were simulated: teaching format and learning disposition. Both variables were coded dichotomously. For consistency in labeling, the simulated variables were named analogously to those in the empirical dataset, although they represent random group assignments rather than actual instructional formats or learning characteristics. Each simulated student had a 50% probability of being assigned to each category (<xref ref-type="table" rid="table1">Table 1</xref>). The sample size for the simulated dataset was 1000.</p></sec><sec id="s2-7"><title>Statistical Analysis</title><p>All simulations and statistical analyses were conducted using the R software suite (version 4.1.2; R Foundation for Statistical Computing) [<xref ref-type="bibr" rid="ref31">31</xref>]. Rasch modeling was performed using the <italic>eRm</italic> package [<xref ref-type="bibr" rid="ref32">32</xref>].</p><p>To examine the relationship between the 2 learning gain metrics, Pearson correlation coefficients were calculated between the WGS and RLG scores.</p><p>To investigate potential interaction effects between teaching format and learning disposition, we conducted a 2-way ANOVA. Post hoc comparisons were performed using Bonferroni-adjusted contrasts. Effect sizes were reported as partial &#x03B7;&#x00B2;, and 95% CIs were calculated where appropriate.</p><p>Normality of the dependent variables was assessed using the Shapiro-Wilk test and visual inspection of Q-Q plots. Minor deviations from normality were observed, which are common in bounded percentage scores (0% to 100%) frequently used in educational assessments. 
Given the present sample sizes and the absence of influential outliers, ANOVA was considered sufficiently robust to moderate violations of the normality assumption.</p><p>Homogeneity of variances across groups was evaluated using the Levene test and the Brown-Forsythe test, both of which indicated no statistically significant differences in variance between groups. All statistical tests were 2-sided, and a significance level of <italic>P</italic>&#x003C;.05 was applied.</p></sec><sec id="s2-8"><title>Ethical Considerations</title><p>The empirical data analyzed within this work were reviewed and judged by the local institutional review and ethics board (Medical Ethics Committee, University Medical Center G&#x00F6;ttingen) as not representing medical or epidemiological research on human participants and, therefore, were assessed using a simplified assessment protocol. The project was approved without any reservation under proposal number 1-11-14.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Effect of the Parameter &#x00B5; on WGS Learning Gain Estimates</title><p>The illustrative dataset demonstrates the mathematical effect of varying &#x00B5; (30, 50, and 70) on the WGS. Changes in &#x00B5; systematically altered the slope of the WGS learning gain plots (<xref ref-type="fig" rid="figure1">Figure 1</xref>). Each subplot represents a different raw gain scenario, ranging from 1 to 4 points. As the &#x00B5;-value increased, the slope of the learning gain curve decreased, resulting in smaller WGS values for the same pretest score. For example, with a gain of 1 point and a pretest score of 6, the WGS was approximately 20% for &#x00B5;=30 and decreased to &#x003C;10% for &#x00B5;=70. 
This pattern remained consistent across all 4 gain scenarios, illustrating that increasing &#x00B5; reduces the magnitude of the calculated learning gain while preserving the relative ordering of observations.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Effect of varying &#x00B5; (30, 50, and 70) on Weighted Gain Score (WGS) learning gain estimates in the illustrative dataset. (A) Gain of 1 point, (B) gain of 2 points, (C) gain of 3 points, and (D) gain of 4 points.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v12i1e75516_fig01.png"/></fig></sec><sec id="s3-2"><title>Correlation Analysis Between WGS and RLG</title><p>The WGS demonstrated a strong positive correlation with RLG across all tested &#x00B5;-values (<xref ref-type="fig" rid="figure2">Figure 2</xref>). In the empirical dataset, the Pearson correlation coefficient was consistently high (<italic>r</italic>=0.93; <italic>P</italic>&#x003C;.001). A similarly strong relationship was observed in the simulated dataset (<italic>r</italic>=0.92; <italic>P</italic>&#x003C;.001). The correlation coefficients remained identical across the tested &#x00B5;-values (30, 50, and 70) in both datasets.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Correlation between Weighted Gain Score (WGS) and Rasch Learning Gain (RLG) in the empirical and simulated datasets. 
(A) Empirical dataset with &#x00B5;=30, (B) empirical dataset with &#x00B5;=50, (C) empirical dataset with &#x00B5;=70, (D) simulated dataset with &#x00B5;=30, (E) simulated dataset with &#x00B5;=50, and (F) simulated dataset with &#x00B5;=70.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v12i1e75516_fig02.png"/></fig></sec><sec id="s3-3"><title>Analysis of Interaction Effects Using WGS and RLG</title><p>All 3 calibrations of the WGS (&#x00B5;=30, &#x00B5;=50, and &#x00B5;=70) detected a significant interaction effect between teaching format and learning disposition in the empirical dataset (<xref ref-type="fig" rid="figure3">Figure 3</xref>). Traditional learners displayed higher learning gains in the lecture format than digital natives (<italic>F</italic><sub>1,131</sub>=6.51; <italic>P</italic>=.01; partial &#x03B7;&#x00B2;=0.05). For &#x00B5;=50, the mean difference was &#x2212;11.64 (95% Bonferroni-adjusted CI &#x2212;21.46 to &#x2212;1.83; <italic>P</italic>=.01). Corresponding estimates were &#x2212;19.41 for &#x00B5;=30 (95% Bonferroni-adjusted CI &#x2212;35.80 to &#x2212;3.04; <italic>P</italic>=.01) and &#x2212;8.32 for &#x00B5;=70 (95% Bonferroni-adjusted CI &#x2212;15.33 to &#x2212;1.31; <italic>P</italic>=.01).</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Learning gain estimates calculated using Weighted Gain Score (WGS) and Rasch Learning Gain (RLG), depicting the interaction between teaching format and learning disposition in the empirical and simulated datasets. (A) Empirical dataset with WGS (&#x00B5;=30), (B) empirical dataset with WGS (&#x00B5;=50), (C) empirical dataset with WGS (&#x00B5;=70), (D) empirical dataset with RLG, (E) simulated dataset with WGS (&#x00B5;=30), (F) simulated dataset with WGS (&#x00B5;=50), (G) simulated dataset with WGS (&#x00B5;=70), and (H) simulated dataset with RLG. 
**Indicates statistical significance at <italic>P</italic>=.01.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v12i1e75516_fig03.png"/></fig><p>RLG also detected this interaction effect (<italic>F</italic><sub>1,131</sub>=6.75; <italic>P</italic>=.01; partial &#x03B7;&#x00B2;=0.05) with a mean difference of &#x2212;19.91 (95% Bonferroni-adjusted CI &#x2212;36.80 to &#x2212;3.05; <italic>P</italic>=.01), confirming the interaction pattern observed in the original study from which our empirical dataset was derived [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref29">29</xref>].</p><p>In the simulated dataset, no significant interaction between teaching format and learning disposition was observed when learning gains were calculated using the WGS, regardless of the &#x00B5;-value applied (<italic>F</italic><sub>1,996</sub>=0.39; <italic>P</italic>=.53; partial &#x03B7;&#x00B2;&#x003C;0.001; <xref ref-type="fig" rid="figure3">Figure 3</xref>). Similarly, RLG did not reveal any significant difference in performance between the groups (<italic>F</italic><sub>1,996</sub>=1.10; <italic>P</italic>=.29; partial &#x03B7;&#x00B2;=0.001). Because teaching format and learning disposition were randomly assigned in the simulated dataset, we did not expect an interaction effect, and the absence of one is consistent with this expectation.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Inferential Behavior of WGS Compared With RLG</title><p>A robust method for calculating learning gain is essential for capturing students&#x2019; learning progress following an educational intervention and for providing interpretable indicators of educational effectiveness. 
Such a method should be statistically sound, transparent, and practically applicable within evaluation processes.</p><p>This study evaluated the statistical behavior of the WGS, a method designed to estimate learning gain in a way that is both methodologically robust and straightforward to implement. The first research question examined whether the WGS yields inferential results comparable to those obtained with RLG. Our findings demonstrated a strong inferential correspondence between the 2 methods. The WGS produced learning gain estimates that correlated highly with those derived from RLG, while also identifying the same interaction effect in the empirical dataset as the Rasch model&#x2013;based approach. Importantly, these inferential conclusions remained stable across all tested &#x00B5;-values (30, 50, and 70). The identical correlation coefficients between the WGS and RLG and the unchanged ANOVA results indicate that modifying the parameter &#x00B5; linearly rescales learning gain estimates. Consequently, varying &#x00B5; changes the magnitude of WGS values but does not affect statistical inference.</p></sec><sec id="s4-2"><title>Robustness of WGS Under Nonnormally Distributed Data</title><p>Neither the empirical nor the simulated dataset fully satisfied the assumption of normality, although no substantial skewness was observed. In medical education research, deviations from normality are common, particularly in pretest-posttest designs [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref33">33</xref>]. A ceiling effect occurs when pretest scores approach the maximum possible value, limiting the measurable improvement, whereas a floor effect arises when pretest performance is concentrated near the minimum score in a difficult test. Very easy items tend to produce ceiling effects, whereas very difficult items may lead to floor effects. 
Despite deviations from normality, the WGS demonstrated stable inferential behavior across the empirical and simulated datasets, suggesting a degree of robustness. This finding is consistent with previous research indicating that parametric methods such as ANOVA and correlation analyses are generally robust to moderate violations of normality, particularly in samples of the size examined in this study [<xref ref-type="bibr" rid="ref34">34</xref>-<xref ref-type="bibr" rid="ref36">36</xref>]. Nevertheless, future research is needed to examine the behavior of the WGS across a broader range of distributional scenarios to better establish its reliability.</p></sec><sec id="s4-3"><title>Applicability of WGS in Small Samples</title><p>The illustrative dataset demonstrates that the WGS yields interpretable results even with very small sample sizes. In contrast, Rasch model&#x2013;based approaches typically require substantially larger samples to ensure stable estimation of item parameters and person abilities [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. This distinction is particularly relevant in educational settings with small cohorts, such as specialized teaching modules, pilot courses, or resource-intensive instructional interventions. 
In such contexts, the WGS may represent a practical alternative method for estimating learning gain because it does not rely on complex parameter estimation.</p><p>More broadly, transparent feedback on learning outcomes supports the continuous development of teaching practices, as evidence suggests that feedback on educational performance encourages educators to engage in reflective improvement of their teaching [<xref ref-type="bibr" rid="ref37">37</xref>-<xref ref-type="bibr" rid="ref39">39</xref>].</p></sec><sec id="s4-4"><title>The Role of &#x00B5; as an Adjustment Coefficient</title><p>The second research question examined whether the parameter &#x00B5; in the WGS can be adjusted across different cohorts to calibrate learning gain calculations without altering inferential outcomes. In the original study introducing the WGS, &#x00B5; was defined as the average pretest score of a cohort. Our findings suggest that the role of &#x00B5; can be understood more broadly. Rather than representing solely the cohort mean, &#x00B5; functions as a scaling parameter that allows calibration of the learning gain metric. To reflect this role more accurately, we interpret &#x00B5; in this study as an adjustment coefficient that can be modified depending on the analytical purpose of the evaluation. On the basis of the results of this study, 3 conceptual adjustment strategies can be distinguished: absolute adjustment, relative adjustment, and routine evaluation. A decision framework for selecting &#x00B5; is provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s4-5"><title>Absolute Adjustment: Monitoring of Cohort Learning</title><p>Absolute adjustment refers to the use of a fixed &#x00B5;-value to estimate learning gain within a stable scaling framework. 
When &#x00B5; remains constant, differences in learning gain across courses, time points within the curriculum, or different cohorts can be interpreted without recalibration of the metric, thereby ensuring cross-cohort comparability. This approach supports standardized monitoring of educational outcomes, for example, when evaluating curricular developments over time or comparing modules within a program. Observed differences in learning gain may arise from multiple factors, including instructional design, assessment characteristics, or cohort composition. Maintaining a fixed &#x00B5; ensures that such differences remain visible and can be attributed to substantive factors.</p></sec><sec id="s4-6"><title>Relative Adjustment: Evaluation of Teaching Interventions</title><p>Relative adjustment enables comparison of teaching interventions across cohorts with heterogeneous characteristics. In educational practice, cohorts often differ in characteristics such as demographics, motivation, workload, or external contextual influences [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref40">40</xref>]. When learning gain is used to compare instructional formats, such heterogeneity may affect the interpretation of outcomes. Under a relative adjustment strategy, a &#x00B5;-value may be calibrated separately for each cohort, allowing the scaling of learning gain calculations to reflect cohort-specific baseline conditions. Although this approach does not eliminate potential confounding factors, it may reduce systematic bias associated with heterogeneous starting conditions. 
This strategy is particularly useful when learning gain is evaluated without strict requirements for cross-cohort comparability, but with a focus on fair comparison of teaching interventions within specific cohorts or instructional contexts.</p></sec><sec id="s4-7"><title>Routine Evaluation: Selecting &#x00B5; in Practice</title><p>In routine applications, when learning gain is estimated without strict requirements for cross-cohort comparability or cohort-specific calibration, &#x00B5; may be selected pragmatically based on the cohort&#x2019;s mean pretest performance. For example, cohorts with mean pretest scores of approximately 50% of the maximum achievable score may be assigned &#x00B5;=50, whereas cohorts with substantially higher or lower baseline knowledge may be assigned correspondingly higher (eg, &#x2265;70) or lower (eg, &#x2264;30) &#x00B5;-values. This pragmatic approach enables straightforward estimation of learning gain while preserving a transparent and easily interpretable scaling of the WGS metric.</p></sec><sec id="s4-8"><title>Limitations</title><p>One limitation of the WGS arises when a student obtains a pretest score of zero, which results in a calculated learning gain of zero regardless of posttest performance. In practice, such cases are unlikely in multiple-choice assessments because guessing and prior knowledge increase the probability of obtaining at least 1 correct response [<xref ref-type="bibr" rid="ref41">41</xref>]. In the empirical dataset, no student recorded a pretest score of zero, and in the simulated dataset, a negligible number (3 out of 1000 students) scored zero points on the pretest. One possible strategy is to exclude such observations from the analysis. However, this may reduce statistical power and introduce bias if students with low baseline scores are systematically underrepresented. 
Alternatively, a small positive offset (pseudocount) could be added so that the calculated learning gain does not collapse to zero, analogous to continuity corrections used in categorical data analysis [<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]. The implications of such adjustments should be examined in future methodological studies, for example, through sensitivity analyses comparing different handling strategies for zero-baseline observations [<xref ref-type="bibr" rid="ref44">44</xref>].</p><p>A further limitation concerns the sample size of the empirical dataset (n=170). Although cohort sizes of this magnitude are common in single-semester cohorts at German medical faculties, they are slightly below commonly cited recommendations for stable Rasch parameter estimation, which often suggest sample sizes of approximately 150 to 200 participants or more [<xref ref-type="bibr" rid="ref45">45</xref>]. Nevertheless, global model fit indicators in the empirical dataset (infit and outfit statistics, person reliability, and separation indices) were within acceptable ranges, supporting the interpretability of the RLG-based estimates despite the moderate sample size.</p><p>Another limitation relates to the test length used in the simulated dataset, which consisted of 9 multiple-choice items to mirror the empirical dataset. Because measurement reliability generally increases with test length [<xref ref-type="bibr" rid="ref46">46</xref>-<xref ref-type="bibr" rid="ref50">50</xref>], the limited number of items may reduce measurement precision and restrict the generalizability of the findings. 
Therefore, future research should examine the performance of the WGS in assessments with larger item sets that more closely reflect the scope of medical examinations.</p><p>Finally, both datasets exhibited deviations from normality, although homogeneity of variances across groups was supported by the Levene and Brown-Forsythe tests, and no influential outliers were observed. Previous methodological research indicates that ANOVA and Pearson correlation are generally robust to moderate violations of normality, particularly in samples of the present size [<xref ref-type="bibr" rid="ref34">34</xref>-<xref ref-type="bibr" rid="ref36">36</xref>]. Therefore, we consider the impact of nonnormality on the inferential conclusions to be limited.</p></sec><sec id="s4-9"><title>Conclusions and Future Research</title><p>This study evaluated the WGS as a method for estimating learning gain in pretest-posttest educational designs. Our findings indicate that the WGS provides robust and easily interpretable estimates while remaining computationally simple. Rather than replacing established psychometric models, the WGS may complement existing approaches, particularly in routine educational evaluations.</p><p>Future research should further develop the WGS as a broadly applicable evaluation instrument. In particular, establishing a methodologically sound calibration framework for &#x00B5; will be essential, including empirically grounded decision models that guide &#x00B5;-selection according to the evaluation purpose, such as cohort monitoring or comparative evaluation of teaching interventions. In addition, integrating the WGS into structured program evaluations, including longitudinal monitoring across courses, will be important for assessing its generalizability across educational contexts.</p><p>Future work may also explore the integration of the WGS within Bayesian test-theoretical frameworks [<xref ref-type="bibr" rid="ref51">51</xref>]. 
By incorporating prior information and updating gain estimates as new data become available, Bayesian approaches could further improve the precision and contextual sensitivity of WGS-based learning gain estimates. Further studies should also examine the behavior of the WGS under different distributional conditions to better establish its robustness.</p></sec></sec></body><back><ack><p>The authors sincerely thank Simone Kann and Michael Schuler for their valuable insights and thoughtful suggestions, as well as Andrew Entwistle for his contribution to the revision of this manuscript.</p></ack><notes><sec><title>Funding</title><p>This research did not receive funding from any specific grant provided by public, commercial, or not-for-profit agencies.</p></sec><sec><title>Data Availability</title><p>The data supporting the findings of this study are provided as a multimedia appendix to facilitate full reproducibility.</p></sec></notes><fn-group><fn fn-type="con"><p>All authors were involved in the conception and/or design of the study and contributed critically to the final preparation of this study, including approving the final version of the manuscript. In particular, SK conceived and designed the study, wrote the final study protocol, and drafted the manuscript. RA conducted the study, collected the results, and analyzed the data. 
SH and JB analyzed the data and performed and verified the statistical analyses.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">WGS</term><def><p>Weighted Gain Score</p></def></def-item><def-item><term id="abb2">RLG</term><def><p>Rasch Learning Gain</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Charalambous</surname><given-names>CY</given-names> </name><name name-style="western"><surname>Praetorius</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Sammons</surname><given-names>P</given-names> </name><name name-style="western"><surname>Walkowiak</surname><given-names>T</given-names> </name><name name-style="western"><surname>Jentsch</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kyriakides</surname><given-names>L</given-names> </name></person-group><article-title>Working more collaboratively to better understand teaching and its quality: challenges faced and possible solutions</article-title><source>Stud Educ Eval</source><year>2021</year><month>12</month><volume>71</volume><issue>1</issue><fpage>101092</fpage><pub-id pub-id-type="doi">10.1016/j.stueduc.2021.101092</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gibson</surname><given-names>KA</given-names> </name><name name-style="western"><surname>Boyle</surname><given-names>P</given-names> </name><name name-style="western"><surname>Black</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Cunningham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Grimm</surname><given-names>MC</given-names> 
</name><name name-style="western"><surname>McNeil</surname><given-names>HP</given-names> </name></person-group><article-title>Enhancing evaluation in an undergraduate medical education program</article-title><source>Acad Med</source><year>2008</year><month>08</month><volume>83</volume><issue>8</issue><fpage>787</fpage><lpage>793</lpage><pub-id pub-id-type="doi">10.1097/ACM.0b013e31817eb8ab</pub-id><pub-id pub-id-type="medline">18667897</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Litzelman</surname><given-names>DK</given-names> </name><name name-style="western"><surname>Stratos</surname><given-names>GA</given-names> </name><name name-style="western"><surname>Marriott</surname><given-names>DJ</given-names> </name><name name-style="western"><surname>Skeff</surname><given-names>KM</given-names> </name></person-group><article-title>Factorial validation of a widely disseminated educational framework for evaluating clinical teachers</article-title><source>Acad Med</source><year>1998</year><month>06</month><volume>73</volume><issue>6</issue><fpage>688</fpage><lpage>695</lpage><pub-id pub-id-type="doi">10.1097/00001888-199806000-00016</pub-id><pub-id pub-id-type="medline">9653408</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Noor</surname><given-names>C</given-names> </name><name name-style="western"><surname>Hozan</surname><given-names>CT</given-names> </name><name name-style="western"><surname>V&#x00EE;lceanu</surname><given-names>N</given-names> </name><name name-style="western"><surname>Bon&#x021B;ea</surname><given-names>MG</given-names> </name></person-group><article-title>A review of the effectiveness of the role of various components in medical education</article-title><source>Arch Pharm 
Pract</source><year>2023</year><volume>14</volume><issue>4</issue><fpage>155</fpage><lpage>159</lpage><pub-id pub-id-type="doi">10.51847/LrElkFGJAO</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McGaghie</surname><given-names>WC</given-names> </name><name name-style="western"><surname>Issenberg</surname><given-names>SB</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>ER</given-names> </name><name name-style="western"><surname>Barsuk</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Wayne</surname><given-names>DB</given-names> </name></person-group><article-title>Medical education featuring mastery learning with deliberate practice can lead to better health for individuals and populations</article-title><source>Acad Med</source><year>2011</year><month>11</month><volume>86</volume><issue>11</issue><fpage>e8</fpage><lpage>e9</lpage><pub-id pub-id-type="doi">10.1097/ACM.0b013e3182308d37</pub-id><pub-id pub-id-type="medline">22030671</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gould</surname><given-names>BE</given-names> </name><name name-style="western"><surname>Grey</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Huntington</surname><given-names>CG</given-names> </name><etal/></person-group><article-title>Improving patient care outcomes by teaching quality improvement to medical students in community-based practices</article-title><source>Acad Med</source><year>2002</year><month>10</month><volume>77</volume><issue>10</issue><fpage>1011</fpage><lpage>1018</lpage><pub-id pub-id-type="doi">10.1097/00001888-200210000-00014</pub-id><pub-id pub-id-type="medline">12377677</pub-id></nlm-citation></ref><ref 
id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schiekirka-Schwake</surname><given-names>S</given-names> </name><name name-style="western"><surname>Anders</surname><given-names>S</given-names> </name><name name-style="western"><surname>von Steinb&#x00FC;chel</surname><given-names>N</given-names> </name><name name-style="western"><surname>Becker</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Raupach</surname><given-names>T</given-names> </name></person-group><article-title>Facilitators of high-quality teaching in medical school: findings from a nation-wide survey among clinical teachers</article-title><source>BMC Med Educ</source><year>2017</year><month>09</month><day>29</day><volume>17</volume><issue>1</issue><fpage>178</fpage><pub-id pub-id-type="doi">10.1186/s12909-017-1000-6</pub-id><pub-id pub-id-type="medline">28962568</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schiekirka</surname><given-names>S</given-names> </name><name name-style="western"><surname>Reinhardt</surname><given-names>D</given-names> </name><name name-style="western"><surname>Bei&#x00DF;barth</surname><given-names>T</given-names> </name><name name-style="western"><surname>Anders</surname><given-names>S</given-names> </name><name name-style="western"><surname>Pukrop</surname><given-names>T</given-names> </name><name name-style="western"><surname>Raupach</surname><given-names>T</given-names> </name></person-group><article-title>Estimating learning outcomes from pre- and posttest student self-assessments: a longitudinal study</article-title><source>Acad Med</source><year>2013</year><month>03</month><volume>88</volume><issue>3</issue><fpage>369</fpage><lpage>375</lpage><pub-id pub-id-type="doi">10.1097/ACM.0b013e318280a6f6</pub-id><pub-id 
pub-id-type="medline">23348083</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gruppen</surname><given-names>LD</given-names> </name></person-group><article-title>Outcome-based medical education: implications, opportunities, and challenges</article-title><source>Korean J Med Educ</source><year>2012</year><month>12</month><volume>24</volume><issue>4</issue><fpage>281</fpage><lpage>285</lpage><pub-id pub-id-type="doi">10.3946/kjme.2012.24.4.281</pub-id><pub-id pub-id-type="medline">25813324</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Harden</surname><given-names>RM</given-names> </name></person-group><article-title>AMEE guide no. 14: outcome-based education: part 1-an introduction to outcome-based education</article-title><source>Med Teach</source><year>1999</year><month>01</month><volume>21</volume><issue>1</issue><fpage>7</fpage><lpage>14</lpage><pub-id pub-id-type="doi">10.1080/01421599979969</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Haverkamp</surname><given-names>N</given-names> </name><name name-style="western"><surname>Barth</surname><given-names>J</given-names> </name><name name-style="western"><surname>Schmidt</surname><given-names>D</given-names> </name><name name-style="western"><surname>Dahmen</surname><given-names>U</given-names> </name><name name-style="western"><surname>Keis</surname><given-names>O</given-names> </name><name name-style="western"><surname>Raupach</surname><given-names>T</given-names> </name></person-group><article-title>Position statement of the GMA committee &#x201C;teaching evaluation&#x201D;</article-title><source>GMS J Med 
Educ</source><year>2024</year><volume>41</volume><issue>2</issue><fpage>Doc19</fpage><pub-id pub-id-type="doi">10.3205/zma001674</pub-id><pub-id pub-id-type="medline">38779701</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Fraenkel</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Wallen</surname><given-names>NE</given-names> </name><name name-style="western"><surname>Hyun</surname><given-names>HH</given-names> </name></person-group><source>How to Design and Evaluate Research in Education</source><year>2012</year><edition>8</edition><publisher-name>McGraw-Hill</publisher-name></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cook</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Beckman</surname><given-names>TJ</given-names> </name></person-group><article-title>Reflections on experimental research in medical education</article-title><source>Adv Health Sci Educ Theory Pract</source><year>2010</year><month>08</month><volume>15</volume><issue>3</issue><fpage>455</fpage><lpage>464</lpage><pub-id pub-id-type="doi">10.1007/s10459-008-9117-3</pub-id><pub-id pub-id-type="medline">18427941</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Colt</surname><given-names>HG</given-names> </name><name name-style="western"><surname>Davoudi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Murgu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zamanian Rohani</surname><given-names>N</given-names> </name></person-group><article-title>Measuring learning gain during a one-day introductory 
bronchoscopy course</article-title><source>Surg Endosc</source><year>2011</year><month>01</month><volume>25</volume><issue>1</issue><fpage>207</fpage><lpage>216</lpage><pub-id pub-id-type="doi">10.1007/s00464-010-1161-4</pub-id><pub-id pub-id-type="medline">20585964</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>McGrath</surname><given-names>C</given-names> </name><name name-style="western"><surname>Guerin</surname><given-names>B</given-names> </name><name name-style="western"><surname>Harte</surname><given-names>E</given-names> </name><name name-style="western"><surname>Frearson</surname><given-names>M</given-names> </name><name name-style="western"><surname>Manville</surname><given-names>C</given-names> </name></person-group><article-title>Learning gain in higher education</article-title><source>RAND Corporation</source><year>2015</year><access-date>2026-05-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.rand.org/pubs/research_reports/RR996.html">https://www.rand.org/pubs/research_reports/RR996.html</ext-link></comment></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Westphale</surname><given-names>S</given-names> </name><name name-style="western"><surname>Backhaus</surname><given-names>J</given-names> </name><name name-style="western"><surname>Koenig</surname><given-names>S</given-names> </name></person-group><article-title>Quantifying teaching quality in medical education: the impact of learning gain calculation</article-title><source>Med Educ</source><year>2022</year><month>03</month><volume>56</volume><issue>3</issue><fpage>312</fpage><lpage>320</lpage><pub-id pub-id-type="doi">10.1111/medu.14694</pub-id><pub-id pub-id-type="medline">34767274</pub-id></nlm-citation></ref><ref 
id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>&#x0160;imkovic</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tr&#x00E4;uble</surname><given-names>B</given-names> </name></person-group><article-title>Robustness of statistical methods when measure is affected by ceiling and/or floor effect</article-title><source>PLoS One</source><year>2019</year><volume>14</volume><issue>8</issue><fpage>e0220889</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0220889</pub-id><pub-id pub-id-type="medline">31425561</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Bereiter</surname><given-names>C</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Harris</surname><given-names>CW</given-names> </name></person-group><article-title>Some persisting dilemmas in the measurement of change</article-title><source>Problems in Measuring Change</source><year>1963</year><publisher-name>University of Wisconsin Press</publisher-name><fpage>3</fpage><lpage>20</lpage></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Prieler</surname><given-names>J</given-names> </name><name name-style="western"><surname>Raven</surname><given-names>J</given-names> </name></person-group><article-title>Problems in the measurement of change (with particular reference to individual change [gain] scores) and their potential solution using IRT</article-title><source>Uses and Abuses of Intelligence: Studies Advancing Spearman and Raven&#x2019;s Quest for Non-Arbitrary Metrics</source><year>2008</year><publisher-name>Royal Fireworks 
Press</publisher-name><fpage>173</fpage><lpage>210</lpage></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hake</surname><given-names>RR</given-names> </name></person-group><article-title>Interactive-engagement versus traditional methods: a six-thousand-student survey of mechanics test data for introductory physics courses</article-title><source>Am J Phys</source><year>1998</year><month>01</month><volume>66</volume><issue>1</issue><fpage>64</fpage><lpage>74</lpage><pub-id pub-id-type="doi">10.1119/1.18809</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Coletta</surname><given-names>VP</given-names> </name><name name-style="western"><surname>Phillips</surname><given-names>JA</given-names> </name></person-group><article-title>Interpreting FCI scores: normalized gain, preinstruction scores, and scientific reasoning ability</article-title><source>Am J Phys</source><year>2005</year><month>12</month><volume>73</volume><issue>12</issue><fpage>1172</fpage><lpage>1182</lpage><pub-id pub-id-type="doi">10.1119/1.2117109</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nissen</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Talbot</surname><given-names>RM</given-names> </name><name name-style="western"><surname>Thompson</surname><given-names>AN</given-names> </name><name name-style="western"><surname>Van Dusen</surname><given-names>B</given-names> </name></person-group><article-title>Comparison of normalized gain and Cohen&#x2019;s d for analyzing gains on concept inventories</article-title><source>Phys Rev Phys Educ 
Res</source><year>2018</year><volume>14</volume><fpage>010115</fpage><pub-id pub-id-type="doi">10.1103/PhysRevPhysEducRes.14.010115</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Marx</surname><given-names>JD</given-names> </name><name name-style="western"><surname>Cummings</surname><given-names>K</given-names> </name></person-group><article-title>Normalized change</article-title><source>Am J Phys</source><year>2007</year><volume>75</volume><issue>1</issue><fpage>87</fpage><lpage>91</lpage><pub-id pub-id-type="doi">10.1119/1.2372468</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pentecost</surname><given-names>TC</given-names> </name><name name-style="western"><surname>Barbera</surname><given-names>J</given-names> </name></person-group><article-title>Measuring learning gains in chemical education: a comparison of two methods</article-title><source>J Chem Educ</source><year>2013</year><month>07</month><volume>90</volume><issue>7</issue><fpage>839</fpage><lpage>845</lpage><pub-id pub-id-type="doi">10.1021/ed400018v</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Bond</surname><given-names>TG</given-names> </name><name name-style="western"><surname>Fox</surname><given-names>CM</given-names> </name></person-group><source>Applying the Rasch Model: Fundamental Measurement in the Human Sciences</source><year>2015</year><edition>3</edition><publisher-name>Routledge</publisher-name></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Downing</surname><given-names>SM</given-names> 
</name></person-group><article-title>Item response theory: applications of modern test theory in medical education</article-title><source>Med Educ</source><year>2003</year><month>08</month><volume>37</volume><issue>8</issue><fpage>739</fpage><lpage>745</lpage><pub-id pub-id-type="doi">10.1046/j.1365-2923.2003.01587.x</pub-id><pub-id pub-id-type="medline">12945568</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Embretson</surname><given-names>SE</given-names> </name><name name-style="western"><surname>Reise</surname><given-names>SP</given-names> </name></person-group><source>Item Response Theory</source><year>2013</year><publisher-name>Psychology Press</publisher-name></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wallace</surname><given-names>CS</given-names> </name><name name-style="western"><surname>Bailey</surname><given-names>JM</given-names> </name></person-group><article-title>Do concept inventories actually measure anything?</article-title><source>Astron Educ Rev</source><year>2010</year><volume>9</volume><issue>1</issue><pub-id pub-id-type="doi">10.3847/AER2010024</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Backhaus</surname><given-names>J</given-names> </name><name name-style="western"><surname>Huth</surname><given-names>K</given-names> </name><name name-style="western"><surname>Entwistle</surname><given-names>A</given-names> </name><name name-style="western"><surname>Homayounfar</surname><given-names>K</given-names> </name><name name-style="western"><surname>Koenig</surname><given-names>S</given-names> </name></person-group><article-title>Digital affinity in medical students 
influences learning outcome: a cluster analytical design comparing vodcast with traditional lecture</article-title><source>J Surg Educ</source><year>2019</year><volume>76</volume><issue>3</issue><fpage>711</fpage><lpage>719</lpage><pub-id pub-id-type="doi">10.1016/j.jsurg.2018.12.001</pub-id><pub-id pub-id-type="medline">30833205</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>WH</given-names> </name><name name-style="western"><surname>Lenderking</surname><given-names>W</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wyrwich</surname><given-names>KW</given-names> </name><name name-style="western"><surname>Gelhorn</surname><given-names>H</given-names> </name><name name-style="western"><surname>Revicki</surname><given-names>DA</given-names> </name></person-group><article-title>Is Rasch model analysis applicable in small sample size pilot studies for assessing item characteristics? 
An example using PROMIS pain behavior item bank data</article-title><source>Qual Life Res</source><year>2014</year><month>03</month><volume>23</volume><issue>2</issue><fpage>485</fpage><lpage>493</lpage><pub-id pub-id-type="doi">10.1007/s11136-013-0487-5</pub-id><pub-id pub-id-type="medline">23912855</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>R Core Team</collab></person-group><article-title>The R Project for Statistical Computing</article-title><source>R Foundation for Statistical Computing</source><year>2013</year><access-date>2026-05-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="http://www.R-project.org">http://www.R-project.org</ext-link></comment></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mair</surname><given-names>P</given-names> </name><name name-style="western"><surname>Hatzinger</surname><given-names>R</given-names> </name></person-group><article-title>Extended Rasch modeling: the eRm package for the application of IRT models in R</article-title><source>J Stat Soft</source><year>2007</year><volume>20</volume><issue>9</issue><fpage>1</fpage><lpage>20</lpage><pub-id pub-id-type="doi">10.18637/jss.v020.i09</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Micceri</surname><given-names>T</given-names> </name></person-group><article-title>The unicorn, the normal curve, and other improbable creatures</article-title><source>Psychol Bull</source><year>1989</year><volume>105</volume><issue>1</issue><fpage>156</fpage><lpage>166</lpage><pub-id pub-id-type="doi">10.1037/0033-2909.105.1.156</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bishara</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Hittner</surname><given-names>JB</given-names> </name></person-group><article-title>Testing the significance of a correlation with nonnormal data: comparison of Pearson, Spearman, transformation, and resampling approaches</article-title><source>Psychol Methods</source><year>2012</year><month>09</month><volume>17</volume><issue>3</issue><fpage>399</fpage><lpage>417</lpage><pub-id pub-id-type="doi">10.1037/a0028087</pub-id><pub-id pub-id-type="medline">22563845</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Knief</surname><given-names>U</given-names> </name><name name-style="western"><surname>Forstmeier</surname><given-names>W</given-names> </name></person-group><article-title>Violating the normality assumption may be the lesser of two evils</article-title><source>Behav Res Methods</source><year>2021</year><month>12</month><volume>53</volume><issue>6</issue><fpage>2576</fpage><lpage>2590</lpage><pub-id pub-id-type="doi">10.3758/s13428-021-01587-5</pub-id><pub-id pub-id-type="medline">33963496</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Havlicek</surname><given-names>LL</given-names> </name><name name-style="western"><surname>Peterson</surname><given-names>NL</given-names> </name></person-group><article-title>Robustness of the Pearson correlation against violations of assumptions</article-title><source>Percept Mot Skills</source><year>1976</year><month>12</month><volume>43</volume><issue>3_suppl</issue><fpage>1319</fpage><lpage>1334</lpage><pub-id 
pub-id-type="doi">10.2466/pms.1976.43.3f.1319</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Boerboom</surname><given-names>TBB</given-names> </name><name name-style="western"><surname>Stalmeijer</surname><given-names>RE</given-names> </name><name name-style="western"><surname>Dolmans</surname><given-names>DHJM</given-names> </name><name name-style="western"><surname>Jaarsma</surname><given-names>DADC</given-names> </name></person-group><article-title>How feedback can foster professional growth of teachers in the clinical workplace: a review of the literature</article-title><source>Studies in Educational Evaluation</source><year>2015</year><month>09</month><volume>46</volume><fpage>47</fpage><lpage>52</lpage><pub-id pub-id-type="doi">10.1016/j.stueduc.2015.02.001</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Scheeler</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Ruhl</surname><given-names>KL</given-names> </name><name name-style="western"><surname>McAfee</surname><given-names>JK</given-names> </name></person-group><article-title>Providing performance feedback to teachers: a review</article-title><source>Teach Educ Spec Educ</source><year>2004</year><volume>27</volume><issue>4</issue><fpage>396</fpage><lpage>407</lpage><pub-id pub-id-type="doi">10.1177/088840640402700407</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Evans</surname><given-names>C</given-names> </name><name name-style="western"><surname>Howson</surname><given-names>CK</given-names> </name><name name-style="western"><surname>Forsythe</surname><given-names>A</given-names> 
</name></person-group><article-title>Making sense of learning gain in higher education</article-title><source>Higher Education Pedagogies</source><year>2018</year><volume>3</volume><issue>1</issue><fpage>1</fpage><lpage>45</lpage><pub-id pub-id-type="doi">10.1080/23752696.2018.1508360</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ewert</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sibthorp</surname><given-names>J</given-names> </name></person-group><article-title>Creating outcomes through experiential education: the challenge of confounding variables</article-title><source>Journal of Experiential Education</source><year>2009</year><month>01</month><day>1</day><volume>31</volume><issue>3</issue><fpage>376</fpage><lpage>389</lpage><pub-id pub-id-type="doi">10.5193/JEE.31.3.376</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kubinger</surname><given-names>KD</given-names> </name><name name-style="western"><surname>Gottschall</surname><given-names>CH</given-names> </name></person-group><article-title>Item difficulty of multiple choice tests dependant on different item response formats&#x2014;an experiment in fundamental research on psychological assessment</article-title><source>Psychol Sci</source><year>2007</year><volume>49</volume><issue>4</issue><fpage>361</fpage><lpage>374</lpage></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Weber</surname><given-names>F</given-names> </name><name name-style="western"><surname>Knapp</surname><given-names>G</given-names> </name><name name-style="western"><surname>Ickstadt</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Kundt</surname><given-names>G</given-names> </name><name name-style="western"><surname>Glass</surname><given-names>&#x00C4;</given-names> </name></person-group><article-title>Zero-cell corrections in random-effects meta-analyses</article-title><source>Res Synth Methods</source><year>2020</year><month>11</month><volume>11</volume><issue>6</issue><fpage>913</fpage><lpage>919</lpage><pub-id pub-id-type="doi">10.1002/jrsm.1460</pub-id><pub-id pub-id-type="medline">32991790</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sweeting</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Sutton</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Lambert</surname><given-names>PC</given-names> </name></person-group><article-title>What to add to nothing? Use and avoidance of continuity corrections in meta-analysis of sparse data</article-title><source>Stat Med</source><year>2004</year><month>05</month><day>15</day><volume>23</volume><issue>9</issue><fpage>1351</fpage><lpage>1375</lpage><pub-id pub-id-type="doi">10.1002/sim.1761</pub-id><pub-id pub-id-type="medline">15116347</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Aung</surname><given-names>NM</given-names> </name><name name-style="western"><surname>Jurak</surname><given-names>I</given-names> </name><name name-style="western"><surname>Mehmood</surname><given-names>S</given-names> </name><name name-style="western"><surname>Axon</surname><given-names>E</given-names> </name></person-group><article-title>Sensitivity analysis in meta-analysis: a tutorial</article-title><source>Cochrane Evid Synth 
Methods</source><year>2026</year><month>01</month><volume>4</volume><issue>1</issue><fpage>e70067</fpage><pub-id pub-id-type="doi">10.1002/cesm.70067</pub-id><pub-id pub-id-type="medline">41497796</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>O&#x2019;Neill</surname><given-names>TR</given-names> </name><name name-style="western"><surname>Gregg</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Peabody</surname><given-names>MR</given-names> </name></person-group><article-title>Effect of sample size on common item equating using the dichotomous Rasch model</article-title><source>Appl Meas Educ</source><year>2020</year><month>01</month><volume>33</volume><issue>1</issue><fpage>10</fpage><lpage>23</lpage><pub-id pub-id-type="doi">10.1080/08957347.2019.1674309</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Downing</surname><given-names>SM</given-names> </name></person-group><article-title>Reliability: on the reproducibility of assessment data</article-title><source>Med Educ</source><year>2004</year><month>09</month><volume>38</volume><issue>9</issue><fpage>1006</fpage><lpage>1012</lpage><pub-id pub-id-type="doi">10.1111/j.1365-2929.2004.01932.x</pub-id><pub-id pub-id-type="medline">15327684</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tavakol</surname><given-names>M</given-names> </name><name name-style="western"><surname>Dennick</surname><given-names>R</given-names> </name></person-group><article-title>Making sense of Cronbach&#x2019;s alpha</article-title><source>Int J Med 
Educ</source><year>2011</year><month>06</month><day>27</day><volume>2</volume><fpage>53</fpage><lpage>55</lpage><pub-id pub-id-type="doi">10.5116/ijme.4dfb.8dfd</pub-id><pub-id pub-id-type="medline">28029643</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>de Vet</surname><given-names>HC</given-names> </name><name name-style="western"><surname>Mokkink</surname><given-names>LB</given-names> </name><name name-style="western"><surname>Mosmuller</surname><given-names>DG</given-names> </name><name name-style="western"><surname>Terwee</surname><given-names>CB</given-names> </name></person-group><article-title>Spearman-Brown prophecy formula and Cronbach&#x2019;s alpha: different faces of reliability and opportunities for new applications</article-title><source>J Clin Epidemiol</source><year>2017</year><month>05</month><volume>85</volume><fpage>45</fpage><lpage>49</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2017.01.013</pub-id><pub-id pub-id-type="medline">28342902</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>W</given-names> </name></person-group><article-title>Some experimental results in the correlation of mental abilities</article-title><source>Br J Psychol 1904-1920</source><year>1910</year><month>10</month><volume>3</volume><issue>3</issue><fpage>296</fpage><lpage>322</lpage><pub-id pub-id-type="doi">10.1111/j.2044-8295.1910.tb00207.x</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Spearman</surname><given-names>C</given-names> </name></person-group><article-title>Correlation calculated from faulty data</article-title><source>Br J Psychol 
1904-1920</source><year>1910</year><month>10</month><volume>3</volume><issue>3</issue><fpage>271</fpage><lpage>295</lpage><pub-id pub-id-type="doi">10.1111/j.2044-8295.1910.tb00206.x</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rindskopf</surname><given-names>D</given-names> </name></person-group><article-title>Overview of Bayesian statistics</article-title><source>Eval Rev</source><year>2020</year><month>08</month><volume>44</volume><issue>4</issue><fpage>225</fpage><lpage>237</lpage><pub-id pub-id-type="doi">10.1177/0193841X19895623</pub-id><pub-id pub-id-type="medline">31894697</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Decision framework for selecting the calibration parameter &#x03BC; in Weighted Gain Score calculations according to the evaluation objective (absolute adjustment, relative adjustment, or routine evaluation).</p><media xlink:href="mededu_v12i1e75516_app1.docx" xlink:title="DOCX File, 194 KB"/></supplementary-material></app-group></back></article>