<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Educ</journal-id><journal-id journal-id-type="publisher-id">mededu</journal-id><journal-id journal-id-type="index">20</journal-id><journal-title>JMIR Medical Education</journal-title><abbrev-journal-title>JMIR Med Educ</abbrev-journal-title><issn pub-type="epub">2369-3762</issn></journal-meta><article-meta><article-id pub-id-type="publisher-id">51282</article-id><article-id pub-id-type="doi">10.2196/51282</article-id><title-group><article-title>Assessing GPT-4&#x2019;s Performance in Delivering Medical Advice: Comparative Analysis With Human Experts</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Jo</surname><given-names>Eunbeen</given-names></name><degrees>BA</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Song</surname><given-names>Sanghoun</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kim</surname><given-names>Jong-Ho</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lim</surname><given-names>Subin</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Kim</surname><given-names>Ju Hyeon</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Cha</surname><given-names>Jung-Joon</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kim</surname><given-names>Young-Min</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Joo</surname><given-names>Hyung Joon</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Medical Informatics, Korea University College of Medicine</institution>, <addr-line>Seoul</addr-line>, <country>Republic of Korea</country></aff><aff id="aff2"><institution>Department of Linguistics, Korea University</institution>, <addr-line>Seoul</addr-line>, <country>Republic of Korea</country></aff><aff id="aff3"><institution>Korea University Research Institute for Medical Bigdata Science, Korea University</institution>, <addr-line>Seoul</addr-line>, <country>Republic of Korea</country></aff><aff id="aff4"><institution>Department of Cardiology, Cardiovascular Center, Korea University College of Medicine</institution>, <addr-line>Seoul</addr-line>, <country>Republic of Korea</country></aff><aff id="aff5"><institution>Division of Cardiology, Department of Internal Medicine, Korea University Anam Hospital</institution>, <addr-line>Seoul</addr-line>, <country>Republic of Korea</country></aff><aff id="aff6"><institution>School of Interdisciplinary Industrial Studies, Hanyang University</institution>, <addr-line>Seoul</addr-line>, <country>Republic of 
Korea</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Cardoso</surname><given-names>Taiane de Azevedo</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Mihalache</surname><given-names>Andrew</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Chatzimina</surname><given-names>Maria</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Hyung Joon Joo, MD, PhD<email>drjoohj@gmail.com</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2024</year></pub-date><pub-date pub-type="epub"><day>8</day><month>7</month><year>2024</year></pub-date><volume>10</volume><elocation-id>e51282</elocation-id><history><date date-type="received"><day>26</day><month>07</month><year>2023</year></date><date date-type="rev-recd"><day>10</day><month>04</month><year>2024</year></date><date date-type="accepted"><day>19</day><month>04</month><year>2024</year></date></history><copyright-statement>&#x00A9; Eunbeen Jo, Sanghoun Song, Jong-Ho Kim, Subin Lim, Ju Hyeon Kim, Jung-Joon Cha, Young-Min Kim, Hyung Joon Joo. Originally published in JMIR Medical Education (<ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org">https://mededu.jmir.org</ext-link>), 8.7.2024. 
</copyright-statement><copyright-year>2024</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Education, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mededu.jmir.org/">https://mededu.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mededu.jmir.org/2024/1/e51282"/><abstract><sec><title>Background</title><p>Accurate medical advice is paramount in ensuring optimal patient care, and misinformation can lead to misguided decisions with potentially detrimental health outcomes. The emergence of large language models (LLMs) such as OpenAI&#x2019;s GPT-4 has spurred interest in their potential health care applications, particularly in automated medical consultation. Yet, rigorous investigations comparing their performance to human experts remain sparse.</p></sec><sec><title>Objective</title><p>This study aims to compare the medical accuracy of GPT-4 with human experts in providing medical advice using real-world user-generated queries, with a specific focus on cardiology. It also sought to analyze the performance of GPT-4 and human experts in specific question categories, including drug or medication information and preliminary diagnoses.</p></sec><sec sec-type="methods"><title>Methods</title><p>We collected 251 pairs of cardiology-specific questions from general users and answers from human experts via an internet portal. 
GPT-4 was tasked with generating responses to the same questions. Three independent cardiologists (SL, JHK, and JJC) evaluated the answers provided by both human experts and GPT-4. Using a computer interface, each evaluator compared the pairs and determined which answer was superior, and they quantitatively measured the clarity and complexity of the questions as well as the accuracy and appropriateness of the responses, applying a 3-tiered grading scale (low, medium, and high). Furthermore, a linguistic analysis was conducted to compare the length and vocabulary diversity of the responses using word count and type-token ratio.</p></sec><sec sec-type="results"><title>Results</title><p>GPT-4 and human experts displayed comparable efficacy in medical accuracy (&#x201C;GPT-4 is better&#x201D; at 119/251, 47.4% vs &#x201C;Human expert is better&#x201D; at 132/251, 52.6%). In accuracy level categorization, humans had more high-accuracy responses than GPT-4 (50/237, 21.1% vs 30/238, 12.6%) but also a greater proportion of low-accuracy responses (11/237, 4.6% vs 1/238, 0.4%; <italic>P</italic>=.001). GPT-4 responses were generally longer and used a less diverse vocabulary than those of human experts, potentially enhancing their comprehensibility for general users (sentence count: mean 10.9, SD 4.2 vs mean 5.9, SD 3.7; <italic>P</italic>&#x003C;.001; type-token ratio: mean 0.69, SD 0.07 vs mean 0.79, SD 0.09; <italic>P</italic>&#x003C;.001). Nevertheless, human experts outperformed GPT-4 in specific question categories, notably those related to drug or medication information and preliminary diagnoses. These findings highlight the limitations of GPT-4 in providing advice based on clinical experience.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>GPT-4 has shown promising potential in automated medical consultation, with comparable medical accuracy to human experts. However, challenges remain particularly in the realm of nuanced clinical judgment. 
Future improvements in LLMs may require the integration of specific clinical reasoning pathways and regulatory oversight for safe use. Further research is needed to understand the full potential of LLMs across various medical specialties and conditions.</p></sec></abstract><kwd-group><kwd>GPT-4</kwd><kwd>medical advice</kwd><kwd>ChatGPT</kwd><kwd>cardiology</kwd><kwd>cardiologist</kwd><kwd>heart</kwd><kwd>advice</kwd><kwd>recommendation</kwd><kwd>recommendations</kwd><kwd>linguistic</kwd><kwd>linguistics</kwd><kwd>artificial intelligence</kwd><kwd>NLP</kwd><kwd>natural language processing</kwd><kwd>chatbot</kwd><kwd>chatbots</kwd><kwd>conversational agent</kwd><kwd>conversational agents</kwd><kwd>response</kwd><kwd>responses</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>As a large language model (LLM), the GPT developed by OpenAI generates human-like text [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>], distinguishing it from other specialized deep learning models that are limited to solving specific problems within predetermined domains [<xref ref-type="bibr" rid="ref4">4</xref>]. In the medical field, GPT has the potential to augment medical education [<xref ref-type="bibr" rid="ref5">5</xref>], provide clinical decision support [<xref ref-type="bibr" rid="ref6">6</xref>], and enhance public health initiatives [<xref ref-type="bibr" rid="ref7">7</xref>]. An impressive achievement of GPT-3.5 is its success in meeting the passing threshold for the United States Medical Licensing Examination [<xref ref-type="bibr" rid="ref8">8</xref>], demonstrating its ability to offer medical advice comparable to that of trained professionals [<xref ref-type="bibr" rid="ref9">9</xref>]. 
The latest iteration, GPT-4 [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>], is anticipated to exhibit advancements in processing complex medical language, formulating patient care suggestions, and making preliminary diagnostic predictions, which inspires cautious optimism for its future applications in the medical domain [<xref ref-type="bibr" rid="ref12">12</xref>].</p><p>Cardiovascular diseases are a leading cause of death worldwide, highlighting the critical need for precise and reliable information in this domain [<xref ref-type="bibr" rid="ref13">13</xref>]. During the initial stages of the SARS-CoV-2 pandemic, overstated claims about the cardiovascular implications of the virus potentially escalated public unease and undermined trust in empirical findings [<xref ref-type="bibr" rid="ref14">14</xref>]. The distribution of speculative or inaccurate information would have had a detrimental effect on the pandemic response strategies. It is paramount to emphasize that inaccuracies or misconceptions in cardiological advice can lead to severe consequences. Hence, there is a pressing need for rigorous validation of all sources of information, whether derived from human experts or advanced computational models such as GPT-4.</p><p>Moreover, the generation of &#x201C;hallucinatory&#x201D; or erroneous responses by GPT raises concerns about nonmedical expert users unintentionally accepting incorrect information as valid [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. Consequently, proposals for regulatory oversight of LLMs have emerged, including the establishment of a new regulatory category specifically addressing LLM-related challenges and risks [<xref ref-type="bibr" rid="ref4">4</xref>]. 
Therefore, it is crucial to develop auditing procedures capable of capturing the intricacies of LLM-associated risks, necessitating a balanced evaluation of the potential benefits and risks inherent in LLMs [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. To delve deeper into this matter, this study applied real-world health consultations from general users to human experts through an internet portal, using the most recent iteration of this technology, GPT-4. The responses provided by both human experts and GPT-4 were subsequently evaluated by a panel of 3 independent cardiologists to gain a nuanced understanding of the potential benefits and risks associated with GPT-4.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Data Collection</title><p><xref ref-type="fig" rid="figure1">Figure 1</xref> illustrates the study design. We collected question-and-answer data related to cardiology from the Korean search portal NAVER, focusing on 264 cases. NAVER is Korea&#x2019;s largest search engine, and its web-based questions and answers forums, called &#x201C;Jisik-In,&#x201D; have previously been used in medical research [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. The data set covered the period from July 13, 2020, to July 12, 2021, and included medical inquiries posed by portal users and the corresponding responses provided by human experts. These experts are doctors who have graduated from a college of medicine or medical school, passed the Korean Medical Licensing Examination, and hold legal accreditations as certified specialists in their respective medical fields from the Ministry of Health and Welfare. They are not restricted by character limits when answering users&#x2019; questions on the portal site. The questions were categorized into 2 types: binary and open-ended. 
Further, 6 distinct categories were defined based on the questions&#x2019; intent. All collected data were in Korean text form. To ensure the analysis was focused on sufficiently detailed and substantive exchanges, we specifically selected questions that contained more than 100 Korean characters and answers provided by human experts that comprised at least 200 characters. This approach was aimed at filtering out overly simple queries and ensuring that the responses were elaborate enough for a thorough comparison. Additionally, to maintain a consistent and fair comparison basis between the capabilities of GPT-4 and human experts, we excluded 13 cases from the total data set that contained multimedia content such as videos or images. Finally, 251 cases were selected for the study after applying these criteria.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Study design and evaluation process. A data set consisting of 251 cardiology-specific question-answer pairs was collected from the NAVER portal over a 1-year period, from July 13, 2020, to July 12, 2021. Licensed medical professionals answered the portal users&#x2019; questions. The questions covered 6 domain categories and included both binary and open-ended types. From May 5 to 8, 2023, these questions were inputted into GPT-4 to generate the corresponding GPT-4 responses. Following that, a panel of 3 cardiologists reviewed and evaluated the questions along with the answers provided by human experts and GPT-4. 
The evaluation criteria focused on assessing the complexity and clarity of the questions as well as the accuracy and appropriateness of the responses from both human experts and GPT-4.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v10i1e51282_fig01.png"/></fig></sec><sec id="s2-2"><title>GPT Answer Generation</title><p>Answers to the collected questions were generated using OpenAI&#x2019;s GPT-4 model, released on March 14, 2023 [<xref ref-type="bibr" rid="ref10">10</xref>]. From May 5 to 8, 2023, a total of 3 researchers used this model via the OpenAI website to generate GPT-4 answers. The total data set of questions to be entered into the GPT-4 was distributed to the 3 researchers in the form of a spreadsheet. Each original Korean question was directly fed into the GPT-4 prompt without any supplementary input. The researchers saved the generated answer in a spreadsheet. Each question input was done in a new session by clicking the &#x201C;New chat&#x201D; button.</p></sec><sec id="s2-3"><title>Question and Answer Evaluation</title><p>Once the data were randomly shuffled, answers from both GPT-4 and human experts were anonymized and labeled as answer 1 and answer 2, respectively, ensuring the 3 independent cardiologist reviewers were blinded to the source of each response. Each of these reviewers is a board-certified physician in internal medicine and has undergone more than 4 years of fellow training in cardiology subspecialty. A panel of 3 cardiologists assessed the question set along with the anonymized answers. The evaluation was conducted using a computer interface. Each evaluator assessed the clarity and complexity of the questions as well as the accuracy and appropriateness of the answers. To quantitatively measure these aspects, a 3-tiered grading scale (low, medium, and high) was used (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). 
Additionally, each evaluator determined which answer (the GPT-4&#x2019;s answer or the human expert&#x2019;s answer) showed superior accuracy and appropriateness in relation to the question posed.</p><p>To further elucidate, the Kendall <italic>W</italic> concordance analysis revealed the following coefficient values indicating the level of agreement among the evaluators: 0.44 for the appropriateness of the human expert answers, 0.40 for the appropriateness of the GPT-4 answers, 0.43 for the medical accuracy of the human expert answers, and 0.40 for the medical accuracy of the GPT answers. Moreover, when making a binary choice determining the superiority of appropriateness between the human expert and GPT-4 answers, the coefficient was 0.42, and for determining the superiority of medical accuracy between the two, it was 0.45. These values, falling in the range of 0.40-0.60, denote a moderate agreement, showcasing a significant level of reliability in our study findings.</p></sec><sec id="s2-4"><title>Ethical Considerations</title><p>This research project was approved by the institutional review board of Korea University Anam Hospital (IRB 2023AN0280). The research was conducted in accordance with the Helsinki Declaration. Informed consent was obtained from all 3 participating cardiologists.</p></sec><sec id="s2-5"><title>Linguistic Analysis</title><p>The Korean Sentence Separator 4.5.1 was used to segment the text into individual sentences. For text tokenization, the Korean medical bidirectional encoder representations from the transformer model, which was specifically designed for Korean medical text analysis, was used [<xref ref-type="bibr" rid="ref21">21</xref>]. To evaluate lexical diversity, the type-token ratio (TTR) was computed for each set of responses [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. 
The TTR, which represents the ratio of unique words to the total number of words in a text, was determined after the responses were tokenized [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>].</p></sec><sec id="s2-6"><title>Statistical Analysis</title><p>To discern statistically significant differences across categorical outcomes, we used the chi-square test or Fisher exact test as appropriate, depending on the expected frequencies within the categories. For continuous variables, comparison across groups was conducted using either the parametric unpaired 2-tailed <italic>t</italic> test or the nonparametric Mann-Whitney test, based on the distribution of the data. Interrater agreement among the 3 cardiologist evaluators was quantitatively assessed using the Kendall <italic>W</italic> concordance analysis. The association between the complexity and clarity of questions and the quality of responses was investigated using the Spearman rank correlation coefficient. All statistical analyses were conducted using SAS 9.4 (SAS Institute Inc) and R program (version 3.6.1; R Foundation for Statistical Computing).</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>Both the number of words and sentences per answer were significantly higher for GPT-4 answers than for human expert answers (word count: mean 190, SD 75.2 for GPT-4 vs mean 139, SD 95.6 for humans; <italic>P</italic>&#x003C;.001 and sentence count: mean 10.9, SD 4.2 for GPT-4 vs mean 5.9, SD 3.7 for humans; <italic>P</italic>&#x003C;.001; <xref ref-type="table" rid="table1">Table 1</xref>). The GPT-4 answers exhibited lower lexical diversity, as measured by the TTR, compared to the answers provided by human experts. 
This suggests that GPT-4 answers may be perceived as more comprehensible and similar to human conversations rather than written text (TTR: mean 0.69, SD 0.07 for GPT-4 vs mean 0.79, SD 0.09 for humans; <italic>P</italic>&#x003C;.001).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Linguistic difference between GPT-4 and human expert answers.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Characteristics</td><td align="left" valign="top">GPT-4, mean (SD)</td><td align="left" valign="top">Human, mean (SD)</td><td align="left" valign="top"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">Word count per answer</td><td align="left" valign="top">190 (75.2)</td><td align="left" valign="top">139 (95.6)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Sentence count per answer</td><td align="left" valign="top">10.9 (4.2)</td><td align="left" valign="top">5.9 (3.7)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Type-token ratio</td><td align="left" valign="top">0.69 (0.07)</td><td align="left" valign="top">0.79 (0.09)</td><td align="left" valign="top">&#x003C;.001</td></tr></tbody></table></table-wrap><p><xref ref-type="fig" rid="figure2">Figure 2</xref> presents an analysis of the medical accuracy between GPT-4 and human expert answers. When cardiologists were asked to evaluate which answers were more medically accurate, the responses slightly favored the human expert answers (132/251, 52.6% vs 119/251, 47.4%; <italic>P</italic>=.41; <xref ref-type="fig" rid="figure2">Figure 2A</xref>). Dividing medical accuracy into low, medium, and high levels, a significant proportion of human expert answers were ranked as highly accurate compared to GPT-4 (50/237, 21.1% vs 30/238, 12.6%; <italic>P</italic>&#x003C;.001; <xref ref-type="fig" rid="figure2">Figure 2B</xref>). 
However, the rate of low accuracy was also higher for the human expert answers (11/237, 4.6% vs 1/238, 0.4%; <italic>P</italic>=.007). This counterintuitive observation underscores the potential of LLMs to bridge gaps in human work in real-world scenarios.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Medical accuracy between GPT-4 and human expert answers. (A) Survey results indicating preference for GPT-4 and human expert answers based on perceived medical accuracy. (B) Analysis of perceived medical accuracy, categorized as low, medium, and high for both GPT-4 and human expert answers. (C and D) Relationship between question complexity or clarity and the perceived medical accuracy of GPT-4 and human expert answers. (E) Comparison of variations in perceived medical accuracy between GPT-4 and human expert answers, depending on question type. (F) Comparison of perceived medical accuracy between GPT-4 and human expert answers across different categories of question intent. (G and H) Comparison of word count per answer and type-token ratio between human expert and GPT-4 answers when evaluated for medical accuracy.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v10i1e51282_fig02.png"/></fig><p>In terms of question complexity and ambiguity, GPT-4 demonstrates an advantage. The more complex and ambiguous the question, the higher the medical accuracy of GPT-4&#x2019;s answers. Conversely, human experts excel in dealing with simpler and clearer questions, although without statistically significant differences (<italic>P</italic>=.19; <xref ref-type="fig" rid="figure2">Figure 2C</xref> and <italic>P</italic>=.30; <xref ref-type="fig" rid="figure2">Figures 2D</xref>, <xref ref-type="fig" rid="figure3">3C</xref>, and <xref ref-type="fig" rid="figure3">3D</xref>). 
The difference in medical accuracy between human and GPT-4 answers remained below 10% across different question types (<italic>P</italic>=.39; <xref ref-type="fig" rid="figure2">Figure 2E</xref>).</p><p>Interestingly, when analyzing question categories based on the intent, numerical differences were observed, but without statistical significance (<italic>P</italic>=.20; <xref ref-type="fig" rid="figure2">Figure 2F</xref>). Human experts outperformed GPT-4 in responding to questions related to drugs or medications and preliminary diagnoses, scoring higher than GPT-4 (drug or medication: 12/18, 66.7% vs 6/18, 33.3% and preliminary diagnosis: 43/70, 61.4% vs 27/70, 38.6%). Conversely, GPT-4 surpassed human experts in addressing queries regarding the necessity of hospital visits and guidance for clinical departments (hospital visit necessity: 9/22, 40.9% vs 13/22, 59.1% and clinical department guidance: 15/33, 45.5% vs 18/33, 54.5%).</p><p>In the linguistic analysis, when the medical accuracy of a human expert&#x2019;s answer exceeded that of GPT-4, the human expert&#x2019;s answers typically had a higher word count and lower TTR compared to cases where GPT-4&#x2019;s answers were deemed more medically accurate (word count per answer: mean 162, SD 102.6 vs mean 114, SD 80.3; <italic>P</italic>&#x003C;.001; <xref ref-type="fig" rid="figure2">Figure 2G</xref> and TTR: mean 0.78, SD 0.09 vs mean 0.80, SD 0.09; <italic>P</italic>=.02; <xref ref-type="fig" rid="figure2">Figure 2H</xref>). This implies that the more the response resembles a real conversation&#x2014;longer and easier&#x2014;the higher the perceived medical accuracy according to cardiology experts. 
This observation indicates a potential area for quality control in human expert responses and highlights the consistent performance of GPT-4 in terms of response length and lexical variation.</p><p>Next, a comparative analysis between GPT-4 and human expert answers was conducted in terms of answer appropriateness (<xref ref-type="fig" rid="figure3">Figure 3</xref>). When assessing whether GPT-4 or human expert answers were more appropriate for the posed questions, GPT-4 was rated as superior (GPT-4: 135/251, 53.8% vs humans: 116/251, 46.2%; <italic>P</italic>=.23; <xref ref-type="fig" rid="figure3">Figure 3A</xref>). Similar to the medical accuracy analysis, when categorizing appropriateness into low, medium, and high, both GPT-4 and human expert answers showed a comparable distribution across these segments (<italic>P</italic>=.26; <xref ref-type="fig" rid="figure3">Figure 3B</xref>). Notably, mirroring the findings from the medical accuracy analysis, the frequency of answers deemed to have low appropriateness was numerically higher for human experts (7/240, 2.9% vs 2/241, 0.8%; <italic>P</italic>=.03), suggesting the possibility of human shortcomings. The investigations related to question complexity, clarity, and type displayed numerical trends similar to those observed in the medical accuracy analysis, although no statistical differences were observed (<italic>P</italic>=.20; <italic>P</italic>=.60; and <italic>P</italic>=.66; <xref ref-type="fig" rid="figure3">Figure 3C-E</xref>). The analysis based on question intent showed no significant statistical discrepancies between the proportions of cases where human expert answers were deemed more appropriate and those where GPT-4 answers were considered more appropriate. Interestingly, GPT-4 was rated as more appropriate than human experts in all other categories, except for the question category of preliminary diagnosis (<italic>P</italic>=.58; <xref ref-type="fig" rid="figure3">Figure 3F</xref>). 
When human expert answers were considered more appropriate than those of GPT-4, the corresponding answers had a higher word count and lower TTR compared to cases where GPT-4 answers were deemed more appropriate (word count per answer: mean 121, SD 79.3 vs mean 160, SD 108.1; <italic>P</italic>=.001; <xref ref-type="fig" rid="figure3">Figure 3G</xref> and TTR: mean 0.80, SD 0.09 vs mean 0.77, SD 0.09; <italic>P</italic>=.02; <xref ref-type="fig" rid="figure3">Figure 3H</xref>). Similar to medical accuracy, these findings suggest that longer responses resembling genuine conversations are evaluated as more appropriate.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Answer appropriateness between GPT-4 and human expert answers. (A) Survey results indicating preference for GPT-4 and human expert responses based on perceived answer appropriateness. (B) Analysis of perceived answer appropriateness, categorized as low, medium, and high for both GPT-4 and human expert answers. (C and D) Relationship between question complexity or clarity and the perceived answer appropriateness of GPT-4 and human expert answers. (E) Comparison of variations in perceived answer appropriateness between GPT-4 and human expert answers depending on question type. (F) Comparison of perceived answer appropriateness between GPT-4 and human expert answers across different categories of question intent. (G and H) Comparison of word count per answer and type-token ratio between human expert and GPT-4 answers when evaluated for appropriateness.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v10i1e51282_fig03.png"/></fig><p>For the 251 questions assessed, all 3 independent cardiologists rated the GPT-4 answers as superior in 18% (45/251) of cases in terms of medical accuracy. In an additional 29% (74/251) of the cases, the majority (2 of 3) of cardiologists endorsed the GPT-4 answers. 
Conversely, human expert answers were unanimously considered more accurate in 20% (50/251) of cases, with the majority of cardiologists agreeing with human experts in 33% (82/251) of cases (<xref ref-type="fig" rid="figure4">Figure 4</xref>). In terms of answer appropriateness, all 3 cardiologists agreed that the GPT-4 answers were superior in 15% (38/251) of cases. The majority of cardiologists found GPT-4 answers to be more appropriate in another 39% (97/251) of cases. Human experts, however, received unanimous approval for the appropriateness of their answers in 18% (46/251) of cases and majority approval in an additional 28% (70/251; <xref ref-type="fig" rid="figure5">Figure 5</xref>). These figures highlight the noteworthy performance of GPT-4 from a medical standpoint. Examining illustrative cases, GPT-4 stands out for delivering medical information resembling the content of medical textbooks and dictionaries. Additionally, GPT-4 demonstrates strength in thoroughly addressing every user&#x2019;s question, leaving no queries unanswered. In contrast, human experts leverage their advantage in providing heuristic information informed by their clinical experience, especially when questions require elements of clinical judgment.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Evaluation result and representative cases comparing medical accuracy between GPT-4 and human expert answers. (A) A case where the GPT-4 answer received superior medical accuracy ratings from all 3 evaluators. (B) A case where a human expert received superior medical accuracy ratings from all 3 evaluators.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v10i1e51282_fig04.png"/></fig><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Evaluation result and representative cases comparing answer appropriateness between GPT-4 and human expert answers. 
(A) A case where the GPT-4 answer received superior appropriateness ratings from all 3 evaluators. (B) A case where a human expert answer received superior appropriateness ratings from all 3 evaluators.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mededu_v10i1e51282_fig05.png"/></fig></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This research uniquely implemented real-world health consultations involving general users and human experts, comparing the answers provided by human experts and GPT-4. Three independent cardiologists appraised the answers to discern the potential advantages and disadvantages of using GPT-4 in the medical advice domain. This study demonstrated comparable levels of medical accuracy between GPT-4 and human experts. Notably, human expert answers had a higher proportion of answers classified as having low medical accuracy compared to those from GPT-4.</p><p>Another significant finding suggests the benefits of articulating medical advice in a conversational style, which positively impacts medical accuracy and relevance to queries. This style proved effective in responding to all questionnaire requests, leading to higher answer ratings and demonstrating the potential of GPT-4 in providing medical advice. Notably, GPT-4&#x2019;s answers consistently displayed appropriate length and lexical variation compared to those of human experts. The findings of this study underscore the potential of GPT-4 in medical education, particularly in enhancing the learning experience through its ability to simulate conversational medical advice with accuracy comparable to human experts. Integrating GPT-4 into educational frameworks could offer an innovative approach to medical education, facilitating adaptive learning and preparing students for the digital evolution in health care. 
This suggests a promising avenue for future research and application in the field of medical education, highlighting the importance of incorporating advanced AI tools like GPT-4 to complement traditional educational methods.</p></sec><sec id="s4-2"><title>Comparison to Prior Work</title><p>An important consideration is the linguistic scope of our findings. This study was conducted in Korean, which naturally raises questions about its generalizability to other languages. Recent studies and OpenAI&#x2019;s own documentation suggest that GPT-4&#x2019;s performance in non-English languages, including medical contexts, has improved compared to previous versions [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. Takagi et al [<xref ref-type="bibr" rid="ref24">24</xref>] compared the performance of GPT-3.5 and GPT-4 using 254 questions from the Japanese Medical Licensing Examination, revealing that GPT-4 exhibited a 29.1% improvement over GPT-3.5. They highlighted that GPT-4&#x2019;s enhanced non-English language processing capabilities were instrumental in its ability to pass the medical licensing examination. In addition, Wang et al [<xref ref-type="bibr" rid="ref25">25</xref>] conducted a study comparing the performance of GPT-3.5 and GPT-4 on English and Chinese data sets for the Chinese Medical Licensing Examination, showing a significant improvement in accuracy for Chinese compared to English. This study showed that the medical advice provided by GPT-4 was comparable in medical accuracy to that provided by human experts. Based on previous research and the findings of this study, it has been found that GPT-4 can effectively process specialized medical information in various non-English languages, including Korean. 
This indicates its potential for use in patient education and the dissemination of medical knowledge.</p></sec><sec id="s4-3"><title>Strengths and Limitations</title><p>Despite its strengths, GPT-4&#x2019;s capability to provide advice based on clinical experience differs notably from that of human experts. Furthermore, quantitative analysis revealed potential discrepancies between GPT-4 and human expert responses, depending on the intent of the question. Numerous studies are currently underway to identify appropriate regulatory measures for the use of LLMs [<xref ref-type="bibr" rid="ref4">4</xref>]. The findings of this investigation are anticipated to facilitate subsequent research aimed at identifying tasks in the medical field that GPT-4 excels in. This, in turn, could expedite the development of technology to enhance the quality of medical services and promote public health.</p><p>This study has several limitations to consider. First, its focus on cardiology might limit the generalizability of the results to other medical specialties. Second, the sample size for the answer evaluation, which consisted of only 3 cardiologists, could have been larger for a more robust analysis. Furthermore, since the evaluations were conducted solely by cardiologists, there is potential for reporting bias where certain aspects of the answers might be overemphasized or underrepresented. Inclusion of professionals from other domains could have provided a broader assessment. Future studies should aim to involve larger sample sizes and encompass a wider range of medical specialties. Moreover, integrating patients&#x2019; perspectives could offer further insights into the acceptability and perceived utility of artificial intelligence&#x2013;powered medical advice.</p></sec><sec id="s4-4"><title>Conclusions</title><p>In conclusion, this study revealed the promising capabilities of GPT-4 in providing medically accurate and appropriate responses comparable to human experts. 
The additional benefits of GPT-4 include consistent proficiency in maintaining appropriate response length and lexical variation. However, GPT-4 showed some disadvantages in providing advice based on clinical experience as well as variation in its performance depending on question intent. Despite these challenges, this study suggests that LLMs such as GPT-4 hold significant potential in augmenting medical education, providing medical advice.</p></sec></sec></body><back><ack><p>This research was supported by a grant of the Medical Data&#x2013;Driven Hospital Support Project through the Korea Health Information Service and funded by the Ministry of Health and Welfare, Republic of Korea.</p></ack><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb2">TTR</term><def><p>type-token ratio</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alberts</surname><given-names>IL</given-names> </name><name name-style="western"><surname>Mercolli</surname><given-names>L</given-names> </name><name name-style="western"><surname>Pyka</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models (LLM) and Chatgpt: what will the impact on nuclear medicine be?</article-title><source>Eur J Nucl Med Mol Imaging</source><year>2023</year><month>05</month><volume>50</volume><issue>6</issue><fpage>1549</fpage><lpage>1552</lpage><pub-id pub-id-type="doi">10.1007/s00259-023-06172-w</pub-id><pub-id pub-id-type="medline">36892666</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Nath</surname><given-names>S</given-names> </name><name name-style="western"><surname>Marie</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ellershaw</surname><given-names>S</given-names> </name><name name-style="western"><surname>Korot</surname><given-names>E</given-names> </name><name name-style="western"><surname>Keane</surname><given-names>PA</given-names> </name></person-group><article-title>New meaning for NLP: the trials and tribulations of natural language processing with GPT-3 in ophthalmology</article-title><source>Br J Ophthalmol</source><year>2022</year><month>07</month><volume>106</volume><issue>7</issue><fpage>889</fpage><lpage>892</lpage><pub-id pub-id-type="doi">10.1136/bjophthalmol-2022-321141</pub-id><pub-id pub-id-type="medline">35523534</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Floridi</surname><given-names>L</given-names> </name><name name-style="western"><surname>Chiriatti</surname><given-names>M</given-names> </name></person-group><article-title>GPT-3: its nature, scope, limits, and consequences</article-title><source>Minds Mach</source><year>2020</year><month>12</month><volume>30</volume><issue>4</issue><fpage>681</fpage><lpage>694</lpage><pub-id pub-id-type="doi">10.1007/s11023-020-09548-1</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mesk&#x00F3;</surname><given-names>B</given-names> </name><name name-style="western"><surname>Topol</surname><given-names>EJ</given-names> </name></person-group><article-title>The imperative for regulatory oversight of large language models (or generative AI) in healthcare</article-title><source>NPJ Digit 
Med</source><year>2023</year><month>07</month><day>6</day><volume>6</volume><issue>1</issue><fpage>120</fpage><pub-id pub-id-type="doi">10.1038/s41746-023-00873-0</pub-id><pub-id pub-id-type="medline">37414860</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abd-Alrazaq</surname><given-names>A</given-names> </name><name name-style="western"><surname>AlSaad</surname><given-names>R</given-names> </name><name name-style="western"><surname>Alhuwail</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Large language models in medical education: opportunities, challenges, and future directions</article-title><source>JMIR Med Educ</source><year>2023</year><month>06</month><day>1</day><volume>9</volume><fpage>e48291</fpage><pub-id pub-id-type="doi">10.2196/48291</pub-id><pub-id pub-id-type="medline">37261894</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Qi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>K</given-names> </name></person-group><article-title>Artificial intelligence in intensive care medicine: toward a ChatGPT/GPT-4 way?</article-title><source>Ann Biomed Eng</source><year>2023</year><month>09</month><volume>51</volume><issue>9</issue><fpage>1898</fpage><lpage>1903</lpage><pub-id pub-id-type="doi">10.1007/s10439-023-03234-w</pub-id><pub-id pub-id-type="medline">37179277</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Biswas</surname><given-names>SS</given-names> </name></person-group><article-title>Role of ChatGPT in public health</article-title><source>Ann Biomed Eng</source><year>2023</year><month>05</month><volume>51</volume><issue>5</issue><fpage>868</fpage><lpage>869</lpage><pub-id pub-id-type="doi">10.1007/s10439-023-03172-7</pub-id><pub-id pub-id-type="medline">36920578</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kung</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Cheatham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Medenilla</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title><source>PLOS Digit Health</source><year>2023</year><month>02</month><volume>2</volume><issue>2</issue><fpage>e0000198</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id><pub-id pub-id-type="medline">36812645</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cheng</surname><given-names>K</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>Z</given-names> </name><name name-style="western"><surname>He</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Gu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>H</given-names> </name></person-group><article-title>The potential impact of ChatGPT/GPT-4 on surgery: will it topple the profession of surgeons?</article-title><source>Int J 
Surg</source><year>2023</year><month>05</month><day>1</day><volume>109</volume><issue>5</issue><fpage>1545</fpage><lpage>1547</lpage><pub-id pub-id-type="doi">10.1097/JS9.0000000000000388</pub-id><pub-id pub-id-type="medline">37037587</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="web"><article-title>GPT-4 is OpenAI&#x2019;s most advanced system, producing safer and more useful responses</article-title><source>OpenAI</source><access-date>2023-03-19</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/gpt-4">https://openai.com/gpt-4</ext-link></comment></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><collab>OpenAI</collab></person-group><article-title>GPT-4 technical report</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 4, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.08774</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goktas</surname><given-names>P</given-names> </name><name name-style="western"><surname>Karakaya</surname><given-names>G</given-names> </name><name name-style="western"><surname>Kalyoncu</surname><given-names>AF</given-names> </name><name name-style="western"><surname>Damadoglu</surname><given-names>E</given-names> </name></person-group><article-title>Artificial intelligence chatbots in allergy and immunology practice: where have we been and where are we going?</article-title><source>J Allergy Clin Immunol Pract</source><year>2023</year><month>09</month><volume>11</volume><issue>9</issue><fpage>2697</fpage><lpage>2700</lpage><pub-id pub-id-type="doi">10.1016/j.jaip.2023.05.042</pub-id><pub-id pub-id-type="medline">37301435</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mensah</surname><given-names>GA</given-names> </name><name name-style="western"><surname>Roth</surname><given-names>GA</given-names> </name><name name-style="western"><surname>Fuster</surname><given-names>V</given-names> </name></person-group><article-title>The global burden of cardiovascular diseases and risk factors: 2020 and beyond</article-title><source>J Am Coll Cardiol</source><year>2019</year><month>11</month><day>19</day><volume>74</volume><issue>20</issue><fpage>2529</fpage><lpage>2532</lpage><pub-id pub-id-type="doi">10.1016/j.jacc.2019.10.009</pub-id><pub-id pub-id-type="medline">31727292</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Frangogiannis</surname><given-names>NG</given-names> </name></person-group><article-title>The significance of COVID-19-associated myocardial injury: how overinterpretation of scientific findings can fuel media sensationalism and spread misinformation</article-title><source>Eur Heart J</source><year>2020</year><month>10</month><day>14</day><volume>41</volume><issue>39</issue><fpage>3836</fpage><lpage>3838</lpage><pub-id pub-id-type="doi">10.1093/eurheartj/ehaa727</pub-id><pub-id pub-id-type="medline">33006608</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>P</given-names> </name><name name-style="western"><surname>Bubeck</surname><given-names>S</given-names> </name><name name-style="western"><surname>Petro</surname><given-names>J</given-names> </name></person-group><article-title>Benefits, limits, and risks of GPT-4 as an AI chatbot for medicine</article-title><source>N Engl J 
Med</source><year>2023</year><month>03</month><day>30</day><volume>388</volume><issue>13</issue><fpage>1233</fpage><lpage>1239</lpage><pub-id pub-id-type="doi">10.1056/NEJMsr2214184</pub-id><pub-id pub-id-type="medline">36988602</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Duffourc</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gerke</surname><given-names>S</given-names> </name></person-group><article-title>Generative AI in health care and liability risks for physicians and safety concerns for patients</article-title><source>JAMA</source><year>2023</year><month>07</month><day>25</day><volume>330</volume><issue>4</issue><fpage>313</fpage><lpage>314</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.9630</pub-id><pub-id pub-id-type="medline">37410497</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Reddy</surname><given-names>S</given-names> </name></person-group><article-title>Evaluating large language models for use in healthcare: a framework for 
translational value assessment</article-title><source>Inform Med Unlocked</source><year>2023</year><month>07</month><volume>41</volume><fpage>101304</fpage><pub-id pub-id-type="doi">10.1016/j.imu.2023.101304</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jo</surname><given-names>W</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Seo</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>N</given-names> </name><name name-style="western"><surname>Park</surname><given-names>J</given-names> </name></person-group><article-title>Online information analysis on pancreatic cancer in Korea using structural topic model</article-title><source>Sci Rep</source><year>2022</year><month>06</month><day>23</day><volume>12</volume><issue>1</issue><fpage>10622</fpage><pub-id pub-id-type="doi">10.1038/s41598-022-14506-1</pub-id><pub-id pub-id-type="medline">35739151</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jo</surname><given-names>W</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>J</given-names> </name><name name-style="western"><surname>Park</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name></person-group><article-title>Online information exchange and anxiety spread in the early stage of the novel coronavirus (COVID-19) outbreak in South Korea: structural topic model and network analysis</article-title><source>J Med Internet Res</source><year>2020</year><month>06</month><day>2</day><volume>22</volume><issue>6</issue><fpage>e19455</fpage><pub-id 
pub-id-type="doi">10.2196/19455</pub-id><pub-id pub-id-type="medline">32463367</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>JM</given-names> </name><etal/></person-group><article-title>A pre-trained BERT for Korean medical natural language processing</article-title><source>Sci Rep</source><year>2022</year><month>08</month><day>16</day><volume>12</volume><issue>1</issue><fpage>13847</fpage><pub-id pub-id-type="doi">10.1038/s41598-022-17806-8</pub-id><pub-id pub-id-type="medline">35974113</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Das</surname><given-names>A</given-names> </name><name name-style="western"><surname>Verma</surname><given-names>RM</given-names> </name></person-group><article-title>Can machines tell stories? 
A comparative study of deep neural language models and metrics</article-title><source>IEEE Access</source><year>2020</year><month>09</month><volume>8</volume><fpage>181258</fpage><lpage>181292</lpage><pub-id pub-id-type="doi">10.1109/ACCESS.2020.3023421</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miao</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Towards unifying pre-trained language models for semantic text exchange</article-title><source>Wireless Netw</source><year>2023</year><month>07</month><pub-id pub-id-type="doi">10.1007/s11276-023-03439-w</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Takagi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Watari</surname><given-names>T</given-names> </name><name name-style="western"><surname>Erabi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sakaguchi</surname><given-names>K</given-names> </name></person-group><article-title>Performance of GPT-3.5 and GPT-4 on the Japanese Medical Licensing Examination: comparison study</article-title><source>JMIR Med Educ</source><year>2023</year><month>06</month><day>29</day><volume>9</volume><fpage>e48002</fpage><pub-id pub-id-type="doi">10.2196/48002</pub-id><pub-id pub-id-type="medline">37384388</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>H</given-names> </name><name 
name-style="western"><surname>Wu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Dou</surname><given-names>Z</given-names> </name><name name-style="western"><surname>He</surname><given-names>L</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>L</given-names> </name></person-group><article-title>Performance and exploration of ChatGPT in medical examination, records and education in Chinese: pave the way for medical AI</article-title><source>Int J Med Inform</source><year>2023</year><month>09</month><volume>177</volume><fpage>105173</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2023.105173</pub-id><pub-id pub-id-type="medline">37549499</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Standards for evaluating medical questions and answers.</p><media xlink:href="mededu_v10i1e51282_app1.docx" xlink:title="DOCX File, 24 KB"/></supplementary-material></app-group></back></article>