<?xml version="1.0" encoding="UTF-8"?>
<?latexml searchpaths="/home/japhy/scienceReplication.artiswrong.com/paper_files/arxiv/2602.13979/latex_extracted"?>
<?latexml class="IEEEtran" options="conference"?>
<!--  %The preceding line is only needed to identify funding in the first footnote. If that is unneeded, please comment it out. --><?latexml package="cite"?>
<?latexml package="amsmath,amssymb,amsfonts"?>
<?latexml package="algorithmic"?>
<?latexml package="graphicx"?>
<?latexml package="textcomp"?>
<?latexml package="booktabs"?>
<?latexml package="multirow"?>
<?latexml package="xcolor"?>
<?latexml package="listings"?>
<?latexml package="xcolor"?>
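<!--  %**** conference_101719.tex Line 25 **** --><!--  %← do not render spaces as visible-space underscores --><!--  %← adjust letter spacing to avoid crowding --><!--  %← keep spaces but do not display the symbol --><?latexml package="hyperref" options="hidelinks"?>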
<?latexml RelaxNGSchema="LaTeXML"?>
<document xmlns="http://dlmf.nist.gov/LaTeXML" class="ltx_authors_1line">
  <resource src="LaTeXML.css" type="text/css"/>
  <resource src="ltx-article.css" type="text/css"/>
  <resource src="ltx-listings.css" type="text/css"/>
  <title>Chain-of-Thought Reasoning with Large Language Models for Clinical Alzheimer’s Disease Assessment and Diagnosis</title>
  <creator role="author">
    <personname><tabular vattach="middle">
        <tbody>
          <tr>
            <td align="center"><inline-para class="ltx_minipage" vattach="top" width="130.1pt">
                <para align="center" xml:id="p1">
                  <p>Tongze Zhang  <ref class="ltx_href" href="https://orcid.org/0000-0002-3375-7136"><graphics candidates="graph/orcid.png" graphic="graph/orcid.png" options="scale=0.06" xml:id="p1.g1"/></ref></p>
                  <p><text font="italic">Stevens Institute of Technology</text></p>
                  <p>Hoboken, New Jersey</p>
                </para>
              </inline-para></td>
            <td align="center"><inline-para class="ltx_minipage" vattach="top" width="130.1pt">
                <para align="center" xml:id="p2">
                  <p>Jun-En Ding  <ref class="ltx_href" href="https://orcid.org/0000-0002-1233-138X"><graphics candidates="graph/orcid.png" graphic="graph/orcid.png" options="scale=0.06" xml:id="p2.g1"/></ref></p>
                  <p><text font="italic">Stevens Institute of Technology</text></p>
                  <p>Hoboken, New Jersey</p>
                </para>
              </inline-para></td>
            <td align="center"><inline-para class="ltx_minipage" vattach="top" width="130.1pt">
                <para align="center" xml:id="p3">
                  <p>Melik Ozolcer  <ref class="ltx_href" href="https://orcid.org/0000-0003-4251-0204"><graphics candidates="graph/orcid.png" graphic="graph/orcid.png" options="scale=0.06" xml:id="p3.g1"/></ref></p>
                  <p><text font="italic">Stevens Institute of Technology</text></p>
                  <p>Hoboken, New Jersey</p>
                </para>
              </inline-para></td>
          </tr>
          <tr>
            <td align="center"><inline-para class="ltx_minipage" vattach="top" width="130.1pt">
                <para align="center" xml:id="p4">
                  <p>Fang-Ming Hung  <ref class="ltx_href" href="https://orcid.org/0000-0003-3501-5459"><graphics candidates="graph/orcid.png" graphic="graph/orcid.png" options="scale=0.06" xml:id="p4.g1"/></ref></p>
                  <p><text font="italic">Surgical Trauma Intensive Care Unit</text></p>
                  <p>Far Eastern Memorial Hospital</p>
                </para>
              </inline-para></td>
            <td align="center"><inline-para class="ltx_minipage" vattach="top" width="130.1pt">
                <para align="center" xml:id="p5">
                  <p>Albert Chih-Chieh Yang  <ref class="ltx_href" href="https://orcid.org/0000-0003-2794-9649"><graphics candidates="graph/orcid.png" graphic="graph/orcid.png" options="scale=0.06" xml:id="p5.g1"/></ref></p>
                  <p><text font="italic">Institute of Brain Science</text></p>
                  <p>National Yang Ming Chiao Tung University</p>
                </para>
              </inline-para></td>
            <td align="center"><inline-para class="ltx_minipage" vattach="top" width="130.1pt">
                <para align="center" xml:id="p6">
                  <p>Feng Liu  <ref class="ltx_href" href="https://orcid.org/0000-0002-5225-8199"><graphics candidates="graph/orcid.png" graphic="graph/orcid.png" options="scale=0.06" xml:id="p6.g1"/></ref></p>
                  <p><text font="italic">Stevens Institute of Technology</text></p>
                  <p>Hoboken, New Jersey</p>
                </para>
              </inline-para></td>
          </tr>
          <tr>
            <td align="center" colspan="3"><inline-para class="ltx_minipage" vattach="top" width="147.4pt">
                <para align="center" xml:id="p7">
                  <p>Yi-Rou Ji  <graphics candidates="graph/orcid.png" graphic="graph/orcid.png" options="scale=0.06" xml:id="p7.g1"/></p>
                  <p><text font="italic">Surgical Trauma Intensive Care Unit</text></p>
                  <p>National Yang Ming Chiao Tung University</p>
                </para>
              </inline-para>
       
<inline-para class="ltx_minipage" vattach="top" width="147.4pt">
                <para align="center" xml:id="p8">
                  <p>Sang Won Bae* <ref class="ltx_href" href="https://orcid.org/0000-0002-2047-1358"><graphics candidates="graph/orcid.png" graphic="graph/orcid.png" options="scale=0.06" xml:id="p8.g1"/></ref></p>
                  <p><text font="italic">Stevens Institute of Technology</text></p>
                  <p>Hoboken, New Jersey</p>
                </para>
              </inline-para></td>
          </tr>
        </tbody>
      </tabular>
</personname>
  </creator>
  <abstract name="Abstract">
    <p>Alzheimer’s disease (AD) has become a prevalent neurodegenerative disease worldwide. Traditional diagnosis still relies heavily on medical imaging and clinical assessment by physicians, which is often time-consuming and resource-intensive in terms of both human expertise and healthcare resources. In recent years, large language models (LLMs) have been increasingly applied to the medical field using electronic health records (EHRs), yet their application in Alzheimer’s disease assessment remains limited, particularly given that AD involves complex multifactorial etiologies that are difficult to observe directly through imaging modalities. In this work, we propose leveraging LLMs to perform Chain-of-Thought (CoT) reasoning on patients’ clinical EHRs. Unlike direct fine-tuning of LLMs on EHR data for AD classification, our approach utilizes LLM-generated CoT reasoning paths to provide the model with explicit diagnostic rationale for AD assessment, followed by structured CoT-based predictions. This pipeline not only enhances the model’s ability to diagnose intrinsically complex factors but also improves the interpretability of the prediction process across different stages of AD progression. Experimental results demonstrate that the proposed CoT-based diagnostic framework significantly enhances stability and diagnostic performance across multiple CDR grading tasks, achieving up to a 15% improvement in F1 score compared to the zero-shot baseline method.</p>
  </abstract>
  <keywords>
Alzheimer’s Disease, Large Language Models, Chain-of-Thought Reasoning, Clinical Decision Support, Electronic Health Records, Neurodegenerative Disorders
</keywords>
<!--  %“title–Conference Paper Title*““ 
     %–“footnotesize “textsuperscript–*˝Note: Sub-titles are not captured in Xplore and
     %should not be used˝
     %“thanks–Identify applicable funding agency here. If none, delete this.˝
     %˝
     %“author–“IEEEauthorblockN–Tongze Zhang “href–https://orcid.org/0000-0002-3375-7136˝–“includegraphics[scale=0.06]–graph/orcid.png˝˝˝
     %“IEEEauthorblockA–%“textit–Stevens Institute of Technology˝ ““
     %“textit–Stevens Institute of Technology˝““
     %Hoboken, New Jersey ˝
     %“and
     %**** conference˙101719.tex Line 50 ****
     %“IEEEauthorblockN–Jun-En Ding “href–https://orcid.org/0000-0002-1233-138X˝–“includegraphics[scale=0.06]–graph/orcid.png˝˝˝
     %“IEEEauthorblockA–%“textit–dept. name of organization (of Aff.)˝ ““
     %“textit–Stevens Institute of Technology˝““
     %Hoboken, New Jersey ˝
     %“and
     %“IEEEauthorblockN–Melik Ozolcer “href–https://orcid.org/0000-0003-4251-0204˝–“includegraphics[scale=0.06]–graph/orcid.png˝˝˝
     %“IEEEauthorblockA–%“textit–dept. name of organization (of Aff.)˝ ““
     %“textit–Stevens Institute of Technology˝““
     %Hoboken, New Jersey˝
     %“and
     %“IEEEauthorblockN–Fang-MingΨHung “href–https://orcid.org/0000-0003-3501-5459˝–“includegraphics[scale=0.06]–graph/orcid.png˝˝˝
     %“IEEEauthorblockA–%“textit–dept. name of organization (of Aff.)˝ ““
     %“textit–Surgical Trauma Intensive Care Unit˝““
     %Far Eastern Memorial Hospital˝
     %“and
     %“IEEEauthorblockN–Albert Chih-Chieh Yang “href–https://orcid.org/0000-0003-2794-9649˝–“includegraphics[scale=0.06]–graph/orcid.png˝˝˝
     %“IEEEauthorblockA–%“textit–dept. name of organization (of Aff.)˝ ““
     %“textit–Institute of Brain Science˝““
     %National Yang Ming Chiao Tung University˝
     %“and
     %**** conference˙101719.tex Line 75 ****
     %“IEEEauthorblockN–Feng Liu “href–https://orcid.org/0000-0002-5225-8199˝–“includegraphics[scale=0.06]–graph/orcid.png˝˝˝
     %“IEEEauthorblockA–%“textit–dept. name of organization (of Aff.)˝ ““
     %“textit–Stevens Institute of Technology˝““
     %Hoboken, New Jersey˝
     %“and
     %“IEEEauthorblockN–Yi-Rou Ji “href–˝–“includegraphics[scale=0.06]–graph/orcid.png˝˝˝
     %“IEEEauthorblockA–%“textit–dept. name of organization (of Aff.)˝ ““
     %“textit–Surgical Trauma Intensive Care Unit˝““
     %National Yang Ming Chiao Tung University˝
     %“and
     %“IEEEauthorblockN–*Sang Won Bae “href–https://orcid.org/0000-0002-2047-1358˝–“includegraphics[scale=0.06]–graph/orcid.png˝˝˝
     %“IEEEauthorblockA–%“textit–Stevens Institute of Technology˝ ““
     %“textit–Stevens Institute of Technology˝““
     %Hoboken, New Jersey˝
     %˝
     %================= Row 1: 3 authors =================
     %**** conference˙101719.tex Line 100 ****
     %================= Row 2: 3 authors =================
     %**** conference˙101719.tex Line 125 ****
     %================= Row 3: 2 authors centered =================
     %**** conference˙101719.tex Line 150 ****-->  <section inlist="toc" xml:id="S1">
    <tags>
      <tag>I</tag>
      <tag role="autoref">section I</tag>
      <tag role="refnum">I</tag>
      <tag role="typerefnum">§I</tag>
    </tags>
    <title><tag close=" ">I</tag><text font="smallcaps">Introduction</text></title>
    <para xml:id="S1.p1">
      <p>Alzheimer’s disease (AD) is the most prevalent neurodegenerative disorder worldwide, posing a growing challenge to healthcare systems and patients’ quality of life. The rising global prevalence of AD creates an urgent need for accurate, scalable, and resource-efficient diagnostic tools. Traditionally, standard diagnostic protocols remain heavily reliant on a combination of time-consuming and costly methods, including advanced medical imaging modalities such as PET and structural MRI, alongside clinical assessments by specialized physicians <cite class="ltx_citemacro_cite">[<bibref bibrefs="ding2025variational" separator="," yyseparator=","/>]</cite><cite class="ltx_citemacro_cite">[<bibref bibrefs="alia2024daily" separator="," yyseparator=","/>]</cite>. This high dependence on human expertise and expensive infrastructure limits accessibility.</p>
    </para>
    <para xml:id="S1.p2">
      <p><text color="#000000">Alzheimer’s disease is characterized by progressive impairment of cognitive and functional behavior, encompassing memory, orientation, judgement, and the ability to perform daily activities. Such behavioral changes are typically documented within the clinical narratives of electronic health records (EHRs), where clinicians describe patients’ functional status, behavioral symptoms, and activities of daily living using natural language. The Clinical Dementia Rating (CDR) serves as a standardized clinical assessment tool <cite class="ltx_citemacro_cite">[<bibref bibrefs="li2025care" separator="," yyseparator=","/>]</cite>, synthesizing multidimensional cognitive and functional behavioral performance.</text></p>
    </para>
    <para xml:id="S1.p3">
      <p>In recent years, the integration of large language models (LLMs) with EHRs has transformed predictive analytics in medicine. LLMs are increasingly applied to diverse tasks ranging from clinical document summarization to patient outcome prediction. However, their direct application to inherently multifactorial, complex diseases like AD remains constrained. Despite advances in LLMs for understanding medical texts, their application in diagnostic decision-making remains somewhat limited. Traditional LLM fine-tuning often produces black-box classifiers with limited interpretability and traceability, constraining their clinical adoption <cite class="ltx_citemacro_cite">[<bibref bibrefs="du2025testing" separator="," yyseparator=","/>]</cite>.</p>
    </para>
    <para xml:id="S1.p4">
      <p>To bridge this critical gap in explainability and complex reasoning, we propose a novel diagnostic workflow leveraging Chain-of-Thought (CoT) reasoning within LLMs for AD assessment based on comprehensive clinical EHR data <cite class="ltx_citemacro_cite">[<bibref bibrefs="lucas2024reasoning" separator="," yyseparator=","/>]</cite>. Our approach achieves this by explicitly generating intermediate diagnostic reasoning. These LLM-generated CoT paths are designed to mimic the explicit, stepwise logical reasoning process employed by clinical experts, transforming heterogeneous EHR features into structured, verifiable explanations before arriving at a final prediction. By integrating this explicit reasoning layer, the model demonstrates enhanced capability in handling the intrinsically complex factors defining AD pathogenesis and progression. The results demonstrate that incorporating structured reasoning significantly enhances diagnostic consistency and interpretability while maintaining performance. This research highlights the potential of CoT-enhanced large language models to bridge automated prediction with interpretable clinical reasoning, laying the foundation for trustworthy AI-assisted diagnostic systems in neurodegenerative disease research. Existing research primarily relies on longitudinal clinical records to predict whether or when Alzheimer’s disease will occur. In contrast, this study focuses on clinical staging through the grading of CDR within current electronic health record texts, emphasizing interpretable, real-time assessments to support clinical decision-making. <text color="#000000">This work presents a structured, multi-stage Chain-of-Thought reasoning framework for interpretable and stable behavioral assessment from clinical narratives using large language models.</text></p>
    </para>
    <para xml:id="S1.p5">
      <p>To address the current lack of interpretability and reliability in language models for Alzheimer’s disease (AD) diagnosis, this study primarily explores the following three key questions:</p>
    </para>
<!--  %1. Can large language models (LLMs) effectively analyze unstructured electronic health record (EHR) text to identify different stages of Alzheimer’s disease? 
     %2. Can the Chain-of-Thought (CoT) reasoning mechanism enhance the interpretability and credibility of LLM-based AD diagnostic models while maintaining performance?
     %3. How does multi-level reasoning simulate real clinical consultation processes to strengthen the model’s decision stability and verifiability?-->    <para xml:id="S1.p6">
      <p>1. Can LLMs reliably identify subtle CDR grading differences from unstructured EHRs?</p>
    </para>
    <para xml:id="S1.p7">
      <p>2. Can incorporating CoT reasoning enhance interpretability and credibility?</p>
    </para>
<!--  %**** Introduce.tex Line 25 **** -->    <para xml:id="S1.p8">
      <p>3. Can multi-stage reasoning structures reduce performance fluctuations across different CDR grading tasks, thereby enhancing prediction consistency?</p>
    </para>
  </section>
  <section inlist="toc" xml:id="S2">
    <tags>
      <tag>II</tag>
      <tag role="autoref">section II</tag>
      <tag role="refnum">II</tag>
      <tag role="typerefnum">§II</tag>
    </tags>
    <title><tag close=" ">II</tag><text font="smallcaps">Related Work</text></title>
    <subsection inlist="toc" xml:id="S2.SS1">
      <tags>
        <tag>II-A</tag>
        <tag role="autoref">subsection II-A</tag>
        <tag role="refnum">II-A</tag>
        <tag role="typerefnum">§II-A</tag>
      </tags>
      <title><tag close=" ">II-A</tag><text font="italic">Traditional Approaches to Alzheimer’s Disease Diagnosis</text></title>
      <para xml:id="S2.SS1.p1">
        <p>The diagnosis of Alzheimer’s disease (AD) has traditionally relied upon neuropsychological assessments, imaging examinations (such as MRI and PET), and cerebrospinal fluid biomarkers <cite class="ltx_citemacro_cite">[<bibref bibrefs="ding2025variational" separator="," yyseparator=","/>]</cite>. Whilst these methods possess clinical value, they are costly, invasive, and difficult to deploy at scale for screening purposes <cite class="ltx_citemacro_cite">[<bibref bibrefs="fikry2022modelling" separator="," yyseparator=","/>]</cite>. In recent years, machine learning and deep learning approaches have been applied to neuroimaging data for automated AD detection and disease progression prediction. For instance, convolutional neural networks (CNNs) applied to MRI or PET scans have achieved high accuracy in distinguishing mild cognitive impairment (MCI) from AD <cite class="ltx_citemacro_cite">[<bibref bibrefs="wang2025flexible" separator="," yyseparator=","/>]</cite><cite class="ltx_citemacro_cite">[<bibref bibrefs="dao2025curenet" separator="," yyseparator=","/>]</cite>. However, these models often exhibit “black box” behaviour, lacking transparent reasoning pathways. Complementing imaging research, natural language processing (NLP) methods have also been applied to EHR texts: analysing narrative clinical notes, cognitive test summaries, and EHR time-series data can detect cognitive decline and predict conversion to AD <cite class="ltx_citemacro_cite">[<bibref bibrefs="amini2024prediction" separator="," yyseparator=","/>]</cite>. However, many NLP-based systems rely on bag-of-words models, sequence classification, or fine-tuned Transformer models. While these approaches capture surface-level patterns, they fail to explicitly model clinical reasoning processes or achieve interpretability. Basic clinical staging and biomarker frameworks further underscore the importance of mild cognitive impairment as the biological precursor stage of Alzheimer’s disease. 
These findings have guided the design of recent machine learning models <cite class="ltx_citemacro_cite">[<bibref bibrefs="petersen2011mild" separator="," yyseparator=","/>]</cite><cite class="ltx_citemacro_cite">[<bibref bibrefs="jack2018nia" separator="," yyseparator=","/>]</cite>. Systematic reviews and meta-analyses of MRI/PET-based machine learning systems have reconfirmed performance improvements while simultaneously demonstrating comparability and data leakage risks within convolutional neural network pipelines <cite class="ltx_citemacro_cite">[<bibref bibrefs="wen2020convolutional" separator="," yyseparator=","/>]</cite><cite class="ltx_citemacro_cite">[<bibref bibrefs="battineni2024machine" separator="," yyseparator=","/>]</cite>.</p>
      </para>
    </subsection>
    <subsection inlist="toc" xml:id="S2.SS2">
      <tags>
        <tag>II-B</tag>
        <tag role="autoref">subsection II-B</tag>
        <tag role="refnum">II-B</tag>
        <tag role="typerefnum">§II-B</tag>
      </tags>
      <title><tag close=" ">II-B</tag><text font="italic">Large Language Models in Medical Text Understanding</text></title>
      <para xml:id="S2.SS2.p1">
        <p>The advent of LLMs, such as ClinicalBERT, LLaMA, and domain-adapted models, has significantly advanced medical natural language processing. Recent reviews on LLMs in healthcare highlight their broad applicability across tasks including entity extraction, report summarisation, and outcome prediction <cite class="ltx_citemacro_cite">[<bibref bibrefs="nazi2024large" separator="," yyseparator=","/>]</cite>. Research indicates that when applied to clinical texts, LLMs surpass earlier smaller models, demonstrating potential for diagnostic reasoning <cite class="ltx_citemacro_cite">[<bibref bibrefs="lievin2024can" separator="," yyseparator=","/>]</cite>. Despite these advances, however, many LLM-based clinical systems suffer from opacity issues: they provide only classification results or recommendations without displaying intermediate reasoning steps, thereby limiting clinician trust and interpretability.</p>
      </para>
      <para xml:id="S2.SS2.p2">
        <p>Recent clinical text studies on LLMs have primarily focused on information extraction and classification tasks, typically providing only classification results or evidence snippets without systematic generative clinical reasoning chains. Simultaneously, many validation studies suffer from small data scales (e.g., evaluating a limited number of manually reviewed cases) or are constrained by single-center, controlled settings, making it difficult to cover complex real-world clinical contexts <cite class="ltx_citemacro_cite">[<bibref bibrefs="zhang2024evaluating" separator="," yyseparator=","/>]</cite>. Beyond early domain models such as ClinicalBERT <cite class="ltx_citemacro_cite">[<bibref bibrefs="huang2019clinicalbert" separator="," yyseparator=","/>]</cite>, recent studies indicate that LLMs with safety alignment mechanisms and instruction-based fine-tuning—such as the Med-PaLM series—can encode extensive clinical knowledge yet still underperform compared to clinicians in fine-grained tasks <cite class="ltx_citemacro_cite">[<bibref bibrefs="singhal2023large" separator="," yyseparator=","/>]</cite><cite class="ltx_citemacro_cite">[<bibref bibrefs="singhal2025toward" separator="," yyseparator=","/>]</cite>. Randomized trials also indicate that providing large language model assistance does not uniformly enhance physicians’ diagnostic reasoning capabilities, underscoring the necessity of establishing transparent, auditable workflows <cite class="ltx_citemacro_cite">[<bibref bibrefs="goh2024large" separator="," yyseparator=","/>]</cite>. Our approach addresses this by designing an auditable multi-stage reasoning template that generates clinically readable intermediate conclusions and final consolidated opinions for each record. This aims to enhance interpretability and clinical utility while maintaining performance.</p>
      </para>
    </subsection>
    <subsection inlist="toc" xml:id="S2.SS3">
      <tags>
        <tag>II-C</tag>
        <tag role="autoref">subsection II-C</tag>
        <tag role="refnum">II-C</tag>
        <tag role="typerefnum">§II-C</tag>
      </tags>
      <title><tag close=" ">II-C</tag><text font="italic">Chain-of-Thought (CoT) Reasoning and Explainable AI in Medicine</text></title>
      <para xml:id="S2.SS3.p1">
        <p>CoT prompting is a recently developed methodology within large language model research that requires models to articulate intermediate reasoning steps rather than directly outputting answers <cite class="ltx_citemacro_cite">[<bibref bibrefs="wei2022chain" separator="," yyseparator=","/>]</cite>. This stepwise reasoning approach has demonstrated improved performance on arithmetic and common-sense benchmarks. Within the medical domain, CoT has been explored to enhance the interpretability and auditability of large language model decision-making, as well as its coordination with clinicians <cite class="ltx_citemacro_cite">[<bibref bibrefs="miao2024chain" separator="," yyseparator=","/>]</cite>. For instance, a renal disease diagnosis study demonstrated that CoT prompts enable LLMs to expose decision pathways and facilitate error tracing <cite class="ltx_citemacro_cite">[<bibref bibrefs="miao2024chain" separator="," yyseparator=","/>]</cite>. Recent structured clinical reasoning prompts—incorporating differential reasoning, analytical reasoning, and Bayesian inference frameworks—further expand LLM potential in medical tasks <cite class="ltx_citemacro_cite">[<bibref bibrefs="sonoda2025structured" separator="," yyseparator=","/>]</cite>. Despite these advances, CoT reasoning applications in neurodegenerative diseases like Alzheimer’s remain in their exploratory infancy. Recent work proposes a CoT-based Alzheimer’s disease classification method, achieving approximately 16.7% performance improvement through supervised fine-tuning with reasoning cues <cite class="ltx_citemacro_cite">[<bibref bibrefs="park2025reasoning" separator="," yyseparator=","/>]</cite>. Moreover, emerging research employing multi-agent large language model frameworks for early Alzheimer’s detection from longitudinal clinical records highlights a trend towards simulating expert consultation workflows <cite class="ltx_citemacro_cite">[<bibref bibrefs="li2025care" separator="," yyseparator=","/>]</cite>. 
These research gaps motivated our present work: we propose a multi-stage CoT diagnostic system specifically designed for Alzheimer’s assessment. By integrating structured reasoning with consensus-building mechanisms, it significantly enhances interpretability, consistency, and clinical relevance.</p>
      </para>
      <para xml:id="S2.SS3.p2">
        <p>Strengthening CoT by sampling diverse reasoning paths and aggregating consensus provides a principled approach to reducing variability in clinical reasoning chains <cite class="ltx_citemacro_cite">[<bibref bibrefs="wang2022self" separator="," yyseparator=","/>]</cite>. CoT-based diagnostic reasoning has also demonstrated improved interpretability in controlled clinical benchmarks <cite class="ltx_citemacro_cite">[<bibref bibrefs="savage2024diagnostic" separator="," yyseparator=","/>]</cite>, while healthcare multi-agent frameworks based on longitudinal medical records have proven the feasibility of simulating collaborative team-based diagnosis and treatment processes <cite class="ltx_citemacro_cite">[<bibref bibrefs="li2025care" separator="," yyseparator=","/>]</cite>. CoT and structured diagnostic prompts have been explored in medical settings, but most work remains focused on promoting accurate classification rather than generating comprehensive, verifiable explanatory texts based on clinical semantics <cite class="ltx_citemacro_cite">[<bibref bibrefs="savage2024diagnostic" separator="," yyseparator=","/>]</cite>. Additionally, existing studies often evaluate performance on limited-scale tasks or benchmarks. Our design addresses the challenges of small-sample and out-of-domain generalization while ensuring interpretability.</p>
      </para>
    </subsection>
  </section>
  <section inlist="toc" xml:id="S3">
    <tags>
      <tag>III</tag>
      <tag role="autoref">section III</tag>
      <tag role="refnum">III</tag>
      <tag role="typerefnum">§III</tag>
    </tags>
    <title><tag close=" ">III</tag><text font="smallcaps">Method</text></title>
    <para xml:id="S3.p1">
      <p>This study aims to construct an interpretable automated CDR grading system based on a two-stage experimental framework. The first stage focuses on establishing multiple high-performance classification baselines to provide impartial performance benchmarks. The second stage introduces the core innovation: a CoT Alzheimer’s Disease Diagnostic system that addresses the transparency issues of traditional black-box models in complex clinical reasoning.</p>
    </para>
    <subsection inlist="toc" xml:id="S3.SS1">
      <tags>
        <tag>III-A</tag>
        <tag role="autoref">subsection III-A</tag>
        <tag role="refnum">III-A</tag>
        <tag role="typerefnum">§III-A</tag>
      </tags>
      <title><tag close=" ">III-A</tag><text font="italic">Data Preprocessing</text></title>
      <table inlist="lot" labels="LABEL:tab:cdr_real_samples" placement="htbp" xml:id="S3.T1">
        <tags>
          <tag>TABLE I</tag>
          <tag role="autoref">Table I</tag>
          <tag role="refnum">I</tag>
          <tag role="typerefnum">TABLE I</tag>
        </tags>
        <toccaption class="ltx_centering"><tag close=" ">I</tag>Representative real-world CDR-labeled clinical records showing the complexity of Subjective (S) and Assessment (A) texts.</toccaption>
        <caption class="ltx_centering"><tag close=": ">TABLE I</tag>Representative real-world CDR-labeled clinical records showing the complexity of Subjective (S) and Assessment (A) texts.</caption>
        <tabular class="ltx_centering ltx_guessed_headers" vattach="middle">
          <thead>
            <tr>
              <td align="justify" border="t" thead="column" width="22.8pt"><text class="ltx_wrap" font="bold">CDR</text></td>
              <td align="justify" border="t" thead="column" width="204.9pt"><text class="ltx_wrap" font="bold">Subjective Note (S)</text></td>
              <td align="justify" border="t" thead="column" width="204.9pt"><text class="ltx_wrap" font="bold">Assessment (A)</text></td>
            </tr>
          </thead>
          <tbody>
            <tr>
              <td align="justify" border="t" width="22.8pt">0.5</td>
              <td align="justify" border="t" width="204.9pt">“insidious onset with progressive poor memory forgets conversation details but able to manage home affairs; occasional confusion reported by spouse.”</td>
              <td align="justify" border="t" width="204.9pt">“Autistic thinking (+) vague responses; mild impairment in orientation and memory domains; functional independence maintained.”</td>
            </tr>
            <tr>
              <td align="justify" border="t" width="22.8pt">1.0</td>
              <td align="justify" border="t" width="204.9pt">“WITH daughter deterioration multiple complaints of forgetfulness, misplacing items, and poor concentration; sometimes fails to find way home.”</td>
              <td align="justify" border="t" width="204.9pt">“Suspect depression (treated at Psyche) reports, mild to moderate decline, impaired attention span, partial insight preserved.”</td>
            </tr>
            <tr>
              <td align="justify" border="t" width="22.8pt">2.0</td>
              <td align="justify" border="t" width="204.9pt">“progressive for years with poor memory Forgetfulness noted by family members and reduced self-care; occasional urinary incontinence reported.”</td>
              <td align="justify" border="t" width="204.9pt">“(favor) in progression with incontinence? cognitive decline with temporal disorientation, impaired judgment, and dependency for daily activities.”</td>
            </tr>
            <tr>
              <td align="justify" border="b t" width="22.8pt">3.0</td>
              <td align="justify" border="b t" width="204.9pt">“request application for disability certificate due to long-term confusion, unable to recognize relatives; total dependence for self-care.”</td>
              <td align="justify" border="b t" width="204.9pt">“Right PCA territory infarct (onset: )(TOAST type: ) consistent with severe dementia picture; bed bound, nonverbal, requires full assistance.”</td>
            </tr>
          </tbody>
        </tabular>
<!--  %**** Method.tex Line 25 **** -->      </table>
      <para xml:id="S3.SS1.p1">
        <p>The raw data used in this study originated from a five-year clinical dataset containing patients’ longitudinal EHR, primarily comprising the patient’s Subjective (S) and clinical diagnostic Assessment (A) fields. To ensure data quality and analytical validity, we first executed a rigorous data cleaning process. Since multiple records might exist for the same patient across different time points, we utilized unique medical record identifiers and performed deduplication based on the longest text length principle. The initial dataset contained 745 raw patient records. Following preprocessing, all samples with empty assessment (A) fields were excluded, yielding 698 distinct and complete patient records suitable for subsequent modeling. The CDR labels in our dataset cover four clinically recognized Alzheimer’s disease severity levels: 0.5, 1.0, 2.0, and 3.0, corresponding to very mild, mild, moderate, and severe dementia, respectively. Each stage exhibits distinct functional characteristics: CDR 0.5 patients typically show mild memory deficits while maintaining daily independence; CDR 1.0 patients exhibit more pronounced cognitive and functional impairments; those with 2.0 require assistance for daily tasks; and those with 3.0 exhibit severe disorientation and complete dependence on caregivers.</p>
      </para>
<!--  %This study focuses on patients with CDR scores ranging from 0.5 to 3.0, representing the progressive stages of Alzheimer’s disease from very mild to mild, moderate, and severe dementia. 
     %In experimental design, we adopted a systematic “one-versus-one” binary classification strategy, decomposing the four primary CDR severity levels (0.5, 1.0, 2.0, 3.0) into four independent binary classification tasks. This allows the model to perform more precise discrimination assessments on subtle clinical differences. For each task, the system dynamically selects corresponding patient groups from the dataset and splits them into an 80“% training set and a 20% test set.
     %The experimental design employs a systematic “one-versus-one” binary classification strategy, decomposing the complex multi-class CDR scoring problem (CDR 0.5, 1, 2, 3) into a series of more focused and controllable binary classification tasks (0.5 vs 1.0, 0.5 vs 2.0, 0.5 vs 3.0, 1.0 vs 3.0). For each classification task pair, the system dynamically selects corresponding patient cohorts from the preprocessed dataset and splits them into training and testing sets at an 80/20 ratio. This design not only simplifies the model’s decision boundaries but also enables more targeted and in-depth performance evaluation in specific differential diagnosis scenarios. This systematically explores the model’s subtle discrimination capabilities across varying dementia severity levels.-->      <para xml:id="S3.SS1.p2">
        <p>This experimental design employs a systematic one-versus-one binary classification strategy, decomposing the complex multi-class CDR problem into a series of more specific and controllable diagnostic subtasks. The CDR scale is a widely recognized clinical standard for quantifying the degree of cognitive decline across multifunctional domains such as memory, orientation, judgment, and self-care.</p>
      </para>
      <para xml:id="S3.SS1.p3">
        <p>To ensure consistency in assessment across different levels of disease severity, we constructed four binary classification subsets from the final dataset: 0.5 vs 1.0, 0.5 vs 2.0, 0.5 vs 3.0, and 1.0 vs 3.0. The 0.5 vs 1.0 subset contained 429 patient records, the 0.5 vs 2.0 subset contained 340 patient records, the 0.5 vs 3.0 subset contained 263 patient records, and the 1.0 vs 3.0 subset contained 358 patient records. Each subset was independently processed through the proposed CoT inference framework to evaluate diagnostic performance across varying degrees of cognitive impairment. Representative samples of these records are presented in Table <ref labelref="LABEL:tab:cdr_real_samples"/>. Each binary experiment focused on adjacent or clinically significant stage differences, enabling the system to precisely identify diagnostic boundaries. The Clinical Dementia Rating (CDR) itself reflects the progressive deterioration of multidimensional functions in Alzheimer’s disease patients, including memory, orientation, judgment, and self-care abilities. Therefore, the 0.5 vs 1.0 and 0.5 vs 2.0 comparisons assess whether the model can detect subtle early-stage differences in disease progression, while the 0.5 vs 3.0 and 1.0 vs 3.0 comparisons validate the model’s robustness across scenarios with significant functional gaps. Compared to directly constructing a multi-class classification model, decomposing the task into binary sub-tasks corresponding to clinically meaningful decision boundaries reduces label ambiguity and enhances interpretability. This grouping approach thus possesses clear medical significance while providing the model with well-defined, verifiable classification objectives. This enables systematic evaluation of reasoning stability and discriminative capability across different levels of cognitive decline.</p>
      </para>
<!--  %For instance, the 0.5 vs 1.0 task assessed the model’s sensitivity to early pathological changes between very mild and mild dementia, while the 0.5 vs 3.0 setting tested its robustness in identifying severe cognitive decline across the entire spectrum. -->      <para xml:id="S3.SS1.p4">
        <p>For each paired task, the system dynamically extracts patient records corresponding to the two target CDR grades from the preprocessed dataset. These subsets are then divided into training (80%) and test (20%) sets, ensuring balanced representation across severity levels. This design not only simplifies the model’s decision space but also enables targeted evaluation of its reasoning stability and generalization capabilities under diverse clinical contrast conditions. Through this approach, the study systematically explores how the proposed reasoning framework adapts to varying degrees of cognitive impairment, revealing its diagnostic interpretability and sensitivity to disease progression.</p>
      </para>
<!--  %“subsection–Baseline Model Establishment and Evaluation˝ 
     %For each one-versus-one task, we selected multiple models including BioBERT, gemma-2b, LLaMA-7B, and Medical-Llama3-8B as baselines. These baseline models encompass diverse architectures ranging from small-scale to large-scale and from general domains to medical domains, enabling a comprehensive evaluation of various models’ performance potential in clinical EHR text classification tasks. Notably, during training and evaluation of baseline models, we exclusively used the unstructured “Subject (S)” field as input for CDR grading predictions, excluding the “Assessment (A)” field. This design aims to evaluate models’ ability to classify solely based on raw, unedited patient narrative text. During fine-tuning, all baseline models underwent standard sequence classification fine-tuning. To optimize performance and prevent overfitting, early stopping was implemented, with F1 score selected as the primary metric for monitoring and selecting optimal models. Finally, each baseline model was evaluated on an independent 20“% test set for each task.
     %**** Method.tex Line 50 ****-->    </subsection>
    <subsection inlist="toc" xml:id="S3.SS2">
      <tags>
        <tag>III-B</tag>
        <tag role="autoref">subsection III-B</tag>
        <tag role="refnum">III-B</tag>
        <tag role="typerefnum">§III-B</tag>
      </tags>
      <title><tag close=" ">III-B</tag><text font="italic">CoT Diagnostic System: Independent CoT Generation and Reasoning Validation</text></title>
      <figure inlist="lof" labels="LABEL:fig:placeholder" xml:id="S3.F1">
        <tags>
          <tag>Fig. 1</tag>
          <tag role="autoref">Figure 1</tag>
          <tag role="refnum">1</tag>
          <tag role="typerefnum">Fig. 1</tag>
        </tags>
        <graphics candidates="graph/cdr_abc_diagram.jpg" class="ltx_centering" graphic="graph/cdr_abc_diagram.jpg" options="width=433.62pt" xml:id="S3.F1.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">1</tag>Four-Step Pipeline for Binary CDR Classification</toccaption>
        <caption class="ltx_centering"><tag close=": ">Fig. 1</tag>Four-Step Pipeline for Binary CDR Classification</caption>
      </figure>
      <float class="ltx_lstlisting" inlist="lol" labels="LABEL:lst:prompt_structure" xml:id="LST1">
        <tags>
          <tag>Listing 1</tag>
          <tag role="autoref">1</tag>
          <tag role="refnum">1</tag>
          <tag role="typerefnum">Listing 1</tag>
        </tags>
        <toccaption><tag close=" ">1</tag>Chain-of-Thought Prompting Structure for CDR Reasoning</toccaption>
        <caption><tag close=": ">Listing 1</tag>Chain-of-Thought Prompting Structure for CDR Reasoning</caption>
        <listing class="ltx_lst_language_Python ltx_lst_numbers_left ltx_lstlisting" data="U3lzdGVtIHByb21wdDoKIllvdSBhcmUgYW4gZXhwZXJpZW5jZWQgbmV1cm9sb2dpc3Qgc3BlY2lhbGl6aW5nIGluIEFsemhlaW1lcidzIGRpc2Vhc2UuClJlc3BvbmQgcHJvZmVzc2lvbmFsbHkgaW4gRW5nbGlzaCBhbmQgb3V0cHV0IHZhbGlkIEpTT04uIgoKVXNlciBwcm9tcHQ6CiMgVGFzazogR2VuZXJhdGUgYSBmdWxsIGNsaW5pY2FsIGFuYWx5c2lzIGZyb20gYSBzdWJqZWN0aXZlIG5vdGUKSW5wdXQ6ICJQYXRpZW50IHJlcG9ydHMgb2NjYXNpb25hbCBkaXNvcmllbnRhdGlvbi4uLiIKSW5zdHJ1Y3Rpb25zOgoxLiBFeHRyYWN0IGtleSByZWFzb25pbmcgc3RlcHMuCjIuIFByb3ZpZGUgc3RydWN0dXJlZCBkb21haW4tc3BlY2lmaWMgYXNzZXNzbWVudCAoTWVtb3J5LCBPcmllbnRhdGlvbiwgZXRjLikuCjMuIE91dHB1dCBKU09OOgp7CiAgInJlYXNvbmluZ19zdGVwcyI6IFsuLi5dLAogICJhc3Nlc3NtZW50IjogIi4uLiIsCiAgImNkcl9zY29yZSI6ICJbY2hvb3NlIGZyb20gMC41LCAxXSIKfQ==" dataencoding="base64" datamimetype="text/plain" framed="topbottom">
          <listingline xml:id="lstnumberx1"><tags>
              <tag><text font="typewriter" fontsize="50%">1</text></tag>
            </tags><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">System</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">prompt</text><text font="typewriter" fontsize="90%">:</text></listingline>
          <listingline xml:id="lstnumberx2"><tags>
              <tag><text font="typewriter" fontsize="50%">2</text></tag>
            </tags><text font="typewriter" fontsize="90%">"</text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">You</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">are</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">an</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">experienced</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">neurologist</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">specializing</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">in</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">Alzheimer</text><text font="typewriter" fontsize="90%">’</text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">s</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">disease</text><text font="typewriter" fontsize="90%">.</text></listingline>
          <listingline xml:id="lstnumberx3"><tags>
              <tag><text font="typewriter" fontsize="50%">3</text></tag>
            </tags><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">Respond</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">professionally</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">in</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">English</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">and</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">output</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">valid</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">JSON</text><text font="typewriter" fontsize="90%">."</text></listingline>
          <listingline xml:id="lstnumberx4"><tags>
              <tag><text font="typewriter" fontsize="50%">4</text></tag>
            </tags></listingline>
          <listingline xml:id="lstnumberx5"><tags>
              <tag><text font="typewriter" fontsize="50%">5</text></tag>
            </tags><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">User</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">prompt</text><text font="typewriter" fontsize="90%">:</text></listingline>
          <listingline xml:id="lstnumberx6"><tags>
              <tag><text font="typewriter" fontsize="50%">6</text></tag>
            </tags><text font="typewriter" fontsize="90%">#</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">Task</text><text font="typewriter" fontsize="90%">:</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">Generate</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">a</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">full</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">clinical</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">analysis</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">from</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">a</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">subjective</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">note</text></listingline>
          <listingline xml:id="lstnumberx7"><tags>
              <tag><text font="typewriter" fontsize="50%">7</text></tag>
            </tags><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">Input</text><text font="typewriter" fontsize="90%">:</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text font="typewriter" fontsize="90%">"</text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">Patient</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">reports</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">occasional</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">disorientation</text><text font="typewriter" fontsize="90%">..."</text></listingline>
          <listingline xml:id="lstnumberx8"><tags>
              <tag><text font="typewriter" fontsize="50%">8</text></tag>
            </tags><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">Instructions</text><text font="typewriter" fontsize="90%">:</text></listingline>
          <listingline xml:id="lstnumberx9"><tags>
              <tag><text font="typewriter" fontsize="50%">9</text></tag>
            </tags><text font="typewriter" fontsize="90%">1.</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">Extract</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">key</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">reasoning</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">steps</text><text font="typewriter" fontsize="90%">.</text></listingline>
          <listingline xml:id="lstnumberx10"><tags>
              <tag><text font="typewriter" fontsize="50%">10</text></tag>
            </tags><text font="typewriter" fontsize="90%">2.</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">Provide</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">structured</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">domain</text><text font="typewriter" fontsize="90%">-</text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">specific</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">assessment</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text font="typewriter" fontsize="90%">(</text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">Memory</text><text font="typewriter" fontsize="90%">,</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">Orientation</text><text font="typewriter" fontsize="90%">,</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">etc</text><text font="typewriter" fontsize="90%">.).</text></listingline>
          <listingline xml:id="lstnumberx11"><tags>
              <tag><text font="typewriter" fontsize="50%">11</text></tag>
            </tags><text font="typewriter" fontsize="90%">3.</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">Output</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">JSON</text><text font="typewriter" fontsize="90%">:</text></listingline>
          <listingline xml:id="lstnumberx12"><tags>
              <tag><text font="typewriter" fontsize="50%">12</text></tag>
            </tags><text font="typewriter" fontsize="90%">{</text></listingline>
          <listingline xml:id="lstnumberx13"><tags>
              <tag><text font="typewriter" fontsize="50%">13</text></tag>
            </tags><text class="ltx_lst_space" font="typewriter" fontsize="90%">  </text><text font="typewriter" fontsize="90%">"</text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">reasoning_steps</text><text font="typewriter" fontsize="90%">":</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text font="typewriter" fontsize="90%">[...],</text></listingline>
          <listingline xml:id="lstnumberx14"><tags>
              <tag><text font="typewriter" fontsize="50%">14</text></tag>
            </tags><text class="ltx_lst_space" font="typewriter" fontsize="90%">  </text><text font="typewriter" fontsize="90%">"</text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">assessment</text><text font="typewriter" fontsize="90%">":</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text font="typewriter" fontsize="90%">"...",</text></listingline>
          <listingline xml:id="lstnumberx15"><tags>
              <tag><text font="typewriter" fontsize="50%">15</text></tag>
            </tags><text class="ltx_lst_space" font="typewriter" fontsize="90%">  </text><text font="typewriter" fontsize="90%">"</text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">cdr_score</text><text font="typewriter" fontsize="90%">":</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text font="typewriter" fontsize="90%">"[</text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">choose</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text class="ltx_lst_identifier" font="typewriter" fontsize="90%">from</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text font="typewriter" fontsize="90%">0.5,</text><text class="ltx_lst_space" font="typewriter" fontsize="90%"> </text><text font="typewriter" fontsize="90%">1]"</text></listingline>
          <listingline xml:id="lstnumberx16"><tags>
              <tag><text font="typewriter" fontsize="50%">16</text></tag>
            </tags><text font="typewriter" fontsize="90%">}</text></listingline>
        </listing>
      </float>
<!--  %This research framework aims to develop an automated Clinical Dementia Rating (CDR) system based on large language models (LLMs). Its core objective is to achieve precise grading of neurocognitive impairment severity by processing unstructured electronic medical record text. The raw data used in this study originated from a clinical dataset containing longitudinal electronic medical records. To ensure data quality for model input and the validity of analysis, we first deduplicated the data using unique medical record identifiers, retaining only the most representative record for each patient. Subsequently, we addressed missing values in key text fields such as “Subject (S)” and “Assessment (A)”. Finally, to ensure each record contained sufficient clinical information for model analysis, we set a text length threshold and removed records with insufficient information. 
     %The core of the process lies in its unique multi-perspective independent review and consensus mechanism. When processing each individual medical record in the test set, the system does not perform a single inference. Instead, it conducts four completely independent comprehensive analyses on the same “Subject (S)” text. Each analysis generates a structured JSON report containing reasoning steps, clinical assessments, and preliminary CDR scores. This design mimics the real-world practice of experts conducting multiple reviews or seeking second opinions to ensure diagnostic accuracy. Its purpose is to minimize potential random biases inherent in single-model inferences, thereby significantly enhancing the robustness and reliability of the final diagnostic conclusion.
     %After completing four independent evaluations, the system enters a hierarchical decision-making process. First, it extracts the clinical assessment sections from all four reports. A language model then synthesizes these potentially slightly divergent assessment texts into a single, comprehensive, and internally consistent final assessment report. Subsequently, this analytically consensus-based final assessment report is submitted to the CDR Classification module. This module employs a highly structured prompt to guide the model in generating the most authoritative final CDR score judgment based on the integrated assessment. The entire process—from independent analysis to integrated consensus and final decision—constitutes a complete and rigorous AI-assisted diagnostic system.
     %The core innovation of this study lies in proposing the Chain-of-Thought (CoT) diagnostic system, which aims to enhance the interpretability of LLMs in CDR grading by simulating the clinical expert consultation process. The pipeline is shown in Figure “ref–fig: pipeline˝. This approach fully leverages the zero-shot reasoning capabilities of large language models, constructing a multi-stage, hierarchical decision-making framework. When selecting the core reasoning engine, we evaluated the zero-shot and few-shot reasoning performance of multiple LLMs, including Qwen2-7B-Instruct, Qwen2-7B-Instruct Zero-shot, and Qwen3-4B, to ensure the selection of the most efficient model with stable reasoning capabilities.
     %When processing each independent medical record in the test set, the system does not rely on a single model’s prediction. Instead, it performs four completely independent comprehensive analyses on the same “Subject (S)” text. This design aims to simulate real-world clinical practices of seeking expert second opinions or multidisciplinary consultations. By integrating analyses, it minimizes potential random biases inherent in single-model reasoning, thereby enhancing system robustness. During each independent analysis, the LLM is guided by highly structured prompts to generate Chain-of-Thought (CoT) reasoning paths, outputting a structured JSON report containing a preliminary CDR score with key clinical observations. By mandating this detailed intermediate reasoning layer, we successfully provide explicit clinical justification for the model’s diagnostic process.-->      <para xml:id="S3.SS2.p1">
        <p>We developed a four-stage CoT integrated diagnostic framework designed to simulate the collective reasoning process of expert clinical panels. This architecture overcomes the limitations of traditional single-threaded CoT prompts by integrating mechanisms for reasoning diversity, information fusion, logical calibration, and auditability. The entire system aims to enhance the interpretability and reliability of CDR grading, with the overall workflow illustrated in Figure <ref labelref="LABEL:fig:placeholder"/> and Listing <ref labelref="LABEL:lst:prompt_structure"/>.</p>
      </para>
<!--  %**** Method.tex Line 100 **** -->      <para xml:id="S3.SS2.p2">
        <p>The foundation of this framework is the CoT Generation and Diversity stage, serving as the core reasoning layer. For each patient record composed of subjective clinical notes (S) and a predefined binary CDR label pair (e.g., [0.5, 1.0]), the system initiates four independent reasoning processes. Each process is executed by a fine-tuned large language model. The model generates structured JSON-formatted reasoning outputs covering six diagnostic domains—memory, orientation, judgment and problem-solving, community affairs, family and hobbies, personal care—along with independent preliminary CDR scores. To ensure reasoning diversity and mitigate potential biases from deterministic reasoning, each reasoning attempt employs an independent random seed. This deliberate randomness, termed seed cracking, fosters diverse interpretive perspectives while maintaining consistency in the medical reasoning context. The system instantly rejects any JSON outputs with formatting errors or invalid structures, ensuring only complete analyses proceed to the next stage. This phase yields four fully independent, semantically rich diagnostic reports, each representing a unique reasoning pathway.</p>
      </para>
      <para xml:id="S3.SS2.p3">
        <p>These four assessment results are treated equally. All evaluation texts are subsequently fed into a language model, which synthesizes the texts into a single coherent diagnostic narrative from a physician’s perspective. This process integrates recurring clinical clues while resolving contradictions. The final CDR grade is determined by classifying this consolidated narrative. This means the final decision originates from the interpretation of the integrated text, rather than a direct aggregation of the initial scores.</p>
      </para>
<!--  %Building upon these outputs, the Information Aggregation Generation phase addresses the inherent subjectivity of individual CoT predictions. This stage collects all valid evaluation texts from the previous phase and feeds them into an advanced integration prompt, where the model assumes the role of a physician responsible for synthesizing multiple diagnostic opinions. Here, the model cross-validates the four reports, identifies consistent clinical observations, eliminates redundant or contradictory statements, and distills shared evidence into a unified summary. 
     %% “begin–figure˝[htbp]
     %“centering
     %% “begin–minipage˝–0.95“linewidth˝
     %“begin–lstlisting˝[style=pyStyle, caption=–Chain-of-Thought prompting structure for CDR reasoning˝, label=–lst:prompt˙structure˝, basicstyle=“ttfamily“small]
     %System prompt:
     %”You are an experienced neurologist specializing in Alzheimer’s disease.
     %Respond professionally in English and output valid JSON.”
     %User prompt:
     %# Task: Generate a full clinical analysis from a subjective note
     %Input: ”Patient reports occasional disorientation...”
     %Instructions:
     %1. Extract key reasoning steps.
     %2. Provide structured domain-specific assessment (Memory, Orientation, etc.).
     %3. Output JSON:
     %**** Method.tex Line 125 ****
     %–
     %”reasoning˙steps”: [...],
     %”assessment”: ”...”,
     %”cdr˙score”: ”[choose from 0.5, 1]”
     %˝
     %“end–lstlisting˝
     %% “end–minipage˝
     %% “end–figure˝-->      <para xml:id="S3.SS2.p4">
        <p>The Final Classification and Logical Consistency Check stage transforms the integrated assessment into an authoritative diagnostic judgment. Here, the model assumes the role of a CDR scoring expert, generating final scores based on predefined label sets. To ensure robustness and data integrity, the system programmatically extracts predicted scores from model text responses using strict regular expression patterns. Should the model generate out-of-range values (e.g., 0.0 or 2.0), the system automatically executes a clamping operation, adjusting the score to the nearest valid label in the current label set (e.g., [0.5, 1.0]). This mechanism mitigates rare yet potentially disruptive prediction errors while preserving the integrity of valid reasoning chains.</p>
      </para>
      <para xml:id="S3.SS2.p5">
        <p>The final stage, Summary Generation and Auditability, enhances interpretability by translating complex multi-stage reasoning into transparent, reviewable narratives. The system re-engages the model as a senior clinical consultant, synthesizing the four initial CoT reports, evaluations, and final classification outcomes to summarize the entire diagnostic process. The generated textual audit trail provides clinicians with clear visibility into the model’s reasoning, ensuring each diagnostic conclusion is traceable to its underlying evidence; an example of the overall output is illustrated in Figure <ref labelref="LABEL:fig:pipeline"/>.</p>
      </para>
      <para xml:id="S3.SS2.p6">
        <p>In summary, this four-stage CoT integration framework transforms traditional black-box LLM reasoning into a transparent, verifiable diagnostic process. By integrating reasoning diversity, structured aggregation, logical calibration, and explicit reasoning, the system achieves high robustness and interpretability, establishing itself as a reliable and explainable clinical AI solution for Alzheimer’s disease assessment.</p>
      </para>
      <figure inlist="lof" labels="LABEL:fig:pipeline" xml:id="S3.F2">
        <tags>
          <tag>Fig. 2</tag>
          <tag role="autoref">Figure 2</tag>
          <tag role="refnum">2</tag>
          <tag role="typerefnum">Fig. 2</tag>
        </tags>
<!--  %**** Method.tex Line 150 **** -->        <graphics candidates="graph/Blank_diagram.png" class="ltx_centering" graphic="graph/Blank_diagram.png" options="width=433.62pt" xml:id="S3.F2.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">2</tag>A CoT AI Workflow for Clinical Dementia Rating (CDR) Assessment</toccaption>
        <caption class="ltx_centering"><tag close=": ">Fig. 2</tag>A CoT AI Workflow for Clinical Dementia Rating (CDR) Assessment</caption>
      </figure>
    </subsection>
  </section>
  <section inlist="toc" xml:id="S4">
    <tags>
      <tag>IV</tag>
      <tag role="autoref">section IV</tag>
      <tag role="refnum">IV</tag>
      <tag role="typerefnum">§IV</tag>
    </tags>
    <title><tag close=" ">IV</tag><text font="smallcaps">Result</text></title>
<!--  %“begin–table*˝[htbp] 
     %“centering
     %“caption–Performance Metrics Comparison of Different Models in Binary Classification Tasks˝
     %“label–tab:model˙performance˙1˝
     %“begin–tabular˝–llccccc˝
     %“toprule
     %“textbf–Model˝ &amp; “textbf–Category˝ &amp; “textbf–Precision˝ &amp; “textbf–Recall˝ &amp; “textbf–F1-score˝ &amp; “textbf–Accuracy˝ &amp; “textbf–AUC˝ ““
     %“midrule
     %“multirow–4˝–*˝–biobert˝ &amp; 0.5 VS 1 &amp; 0.29 &amp; 0.50 &amp; 0.36 &amp; 0.57 &amp; 0.50 ““
     %&amp; 0.5 VS 2 &amp; 0.75 &amp; 0.75 &amp; 0.75 &amp; 0.75 &amp; 0.81 ““
     %&amp; 0.5 VS 3 &amp; 0.77 &amp; 0.78 &amp; 0.88 &amp; 0.78 &amp; 0.86 ““
     %&amp; 1 VS 3   &amp; 0.61 &amp; 0.58 &amp; 0.33 &amp; 0.33 &amp; 0.50 ““
     %“midrule
     %“multirow–4˝–*˝–Qwen2-1.5B-Instruct˝ &amp; 0.5 VS 1 &amp; 0.21 &amp; 0.50 &amp; 0.30 &amp; 0.43 &amp; __ ““
     %&amp; 0.5 VS 2 &amp; 0.22 &amp; 0.50 &amp; 0.30 &amp; 0.44 &amp; __ ““
     %&amp; 0.5 VS 3 &amp; 0.31 &amp; 0.50 &amp; 0.38 &amp; 0.62 &amp; __ ““
     %&amp; 1 VS 3   &amp; 0.40 &amp; 0.50 &amp; 0.45 &amp; 0.80 &amp; __ ““
     %“midrule
     %“multirow–4˝–*˝–gemma-2b˝ &amp; 0.5 VS 1 &amp; 0.45 &amp; 0.47 &amp; 0.42 &amp; 0.53 &amp; 0.51 ““
     %&amp; 0.5 VS 2 &amp; 0.64 &amp; 0.62 &amp; 0.62 &amp; 0.64 &amp; 0.68 ““
     %&amp; 0.5 VS 3 &amp; 0.57 &amp; 0.57 &amp; 0.56 &amp; 0.56 &amp; 0.66 ““
     %&amp; 1 VS 3   &amp; 0.47 &amp; 0.48 &amp; 0.47 &amp; 0.68 &amp; 0.39 ““
     %**** Result.tex Line 25 ****
     %“midrule
     %“multirow–4˝–*˝–LLaMA-7B˝ &amp; 0.5 VS 1 &amp; 0.57 &amp; 0.58 &amp; 0.57 &amp; 0.58 &amp; 0.53 ““
     %&amp; 0.5 VS 2 &amp; 0.61 &amp; 0.61 &amp; 0.60 &amp; 0.60 &amp; 0.64 ““
     %&amp; 0.5 VS 3 &amp; 0.70 &amp; 0.67 &amp; 0.68 &amp; 0.67 &amp; 0.69 ““
     %&amp; 1 VS 3   &amp; 0.54 &amp; 0.59 &amp; 0.56 &amp; 0.59 &amp; 0.42 ““
     %“midrule
     %“multirow–4˝–*˝–Medical-Llama3-8B˝ &amp; 0.5 VS 1 &amp; 0.56 &amp; 0.53 &amp; 0.53 &amp; 0.52 &amp; 0.55 ““
     %&amp; 0.5 VS 2 &amp; 0.62 &amp; 0.60 &amp; 0.56 &amp; 0.60 &amp; 0.62 ““
     %&amp; 0.5 VS 3 &amp; 0.60 &amp; 0.60 &amp; 0.60 &amp; 0.60 &amp; 0.65 ““
     %&amp; 1 VS 3   &amp; 0.47 &amp; 0.42 &amp; 0.44 &amp; 0.42 &amp; 0.49 ““
     %“bottomrule
     %“end–tabular˝
     %“end–table*˝-->    <para xml:id="S4.p1">
      <p>Under the one-versus-one binary classification strategy, we conducted a comprehensive evaluation of model performance to reveal the actual capabilities of different models in distinguishing subtle CDR levels.
<!--  %Table “ref–tab:model˙performance˙1˝ presents the performance of traditional pre-trained language model fine-tuning baselines. -->Table <ref labelref="LABEL:tab:model_performance_2"/> compares zero-shot and fine-tuned large language models within the CoT diagnostic system.</p>
    </para>
<!--  %To establish a robust performance baseline, we first fine-tuned multiple biomedical pre-trained language models (Bio-PLMs) such as BioBERT, gemma-2b, LLaMA-7B, and Medical-Llama3-8B for sequence classification, evaluating their performance in CDR grading based solely on patient “chief complaint (S)” text. Overall, these models demonstrated strong discriminative power in tasks with pronounced CDR differences. For instance, BioBERT achieved 78“% accuracy and an AUC of 0.86 on the 0.5 VS 3 task. This indicates that models can effectively capture key information when clinical symptoms exhibit distinct, discernible representations. 
     %However, these fine-tuned baseline models encountered challenges in distinguishing adjacent grades with subtle clinical differences, revealing the limitations of traditional black-box classification approaches. Performance plummeted for most models in the 0.5 VS 1 task. BioBERT achieved only an F1 score of 0.36 and an accuracy of 57“%; Even the larger-parameter LLaMA-7B, with its F1 score improved to 0.57, still yielded unsatisfactory results. This performance fragility indicates that traditional fine-tuning strategies tend to rely on superficial statistical features from the training set, struggling to perform deep clinical logical reasoning. Consequently, they are highly prone to judgment bias when confronting edge-of-category or imbalanced classification tasks. Furthermore, the absence of reasoning chains compromises the transparency and interpretability required for medical classification decisions.
     %To compare the potential of large language models (LLMs) under different training paradigms, we introduce the Qwen series as an LLM baseline, evaluating it in both zero-shot reasoning and fine-tuning modes. The performance of Qwen2-7B under zero-shot prompting represents the lower bound of LLM capabilities when untrained on this dataset. Its F1 scores across tasks are generally low—for instance, achieving only 0.39 in the 0.5 VS 1 task—demonstrating no advantage over traditional fine-tuned models. This confirms that even large models struggle to accurately complete complex clinical grading tasks relying solely on general knowledge when lacking domain-specific knowledge and training data.
     %**** Result.tex Line 50 ****
     %Second, LLM fine-tuned models (Qwen3-4B and Qwen2-7B) demonstrated improved performance after learning 80“% of the training data, surpassing traditional PLMs like BioBERT in certain tasks. Qwen3-4B performed particularly well, achieving accuracy rates of 68“% and 63“% in the 0.5 VS 3 and 1 VS 3 tasks, respectively. More importantly, the Qwen2-7B model achieved an accuracy of 0.80 on the 0.5 VS 1 task. Although its F1 score remained at 0.41, it demonstrated potential in certain metrics. These results indicate that LLMs, through fine-tuning, can effectively utilize contextual information to establish more efficient classification decision boundaries than traditional PLMs.However, despite these performance improvements in fine-tuned LLMs, variability persists, particularly in challenging tasks like 0.5 VS 1 and 1 VS 3, where F1 scores generally fail to surpass the 0.65 bottleneck. This limitation stems from the inherent uncertainty in single-model, single-inference approaches and their inability to effectively integrate multi-dimensional clinical information.
     %When tackling the most challenging adjacent grade classification tasks, the potential of large language models (LLMs) begins to emerge. Traditional models like BioBERT exhibit a significant performance bottleneck on the 0.5 VS 1 task, achieving an F1 score of only 0.36, revealing their limitations in distinguishing subtle clinical differences. In contrast, fine-tuned Qwen3-4B achieves an accuracy and F1 score of 0.63 on the 1 VS 3 task, outperforming all traditional baseline models.
     %To evaluate the effectiveness of the proposed Chain-of-Thought (CoT) reasoning framework, we conducted a comprehensive comparison across multiple large language models (LLMs) in both zero-shot and fine-tuning settings. The results summarized in Table “ref–tab:model˙performance˙2˝ highlight the diagnostic accuracy and interpretability of CoT-based models when distinguishing Clinical Dementia Rating (CDR) levels.
     %Overall, CoT-based diagnostic systems demonstrate significant advantages over traditional fine-tuning approaches, particularly in tasks involving subtle clinical distinctions. Among the evaluated models, Qwen3-4B and Qwen2-7B consistently achieved higher F1 scores and accuracy rates compared to the zero-shot prompting baseline. Specifically, Qwen3-4B achieved an F1 score of 0.64 and accuracy of 0.64 in the 0.5 vs 2 task, reaching 0.68 in the 0.5 vs 3 task, indicating superior discrimination capabilities for moderate and severe dementia stages. Similarly, the Qwen2-7B model maintained competitiveness across tasks, achieving 0.62 accuracy in the 0.5 vs 1 task and 0.61 accuracy in the 1 vs 3 task, while providing more interpretable intermediate reasoning chains.
     %The integration of structured Chain of Thought reasoning significantly enhances model decision consistency. Unlike traditional large language model fine-tuning relying on implicit statistical learning, the CoT framework generates explicit intermediate reasoning paths, effectively simulating clinicians’ diagnostic thought processes. Notably, multi-stage consensus reasoning reduces prediction discrepancies during iterative reasoning, stabilizing CDR grading outcomes across all test subsets. Crucially, zero-shot prompts without CoT guidance produce unstable and inconsistent outputs. Introducing the CoT reasoning structure enables the model to construct evidence-based explanations before final predictions. This interpretable reasoning process not only enhances overall classification reliability but also provides traceable evidence aligned with clinical evaluation logic. These findings highlight the potential of CoT-enhanced LLMs as transparent diagnostic assistants for Alzheimer’s disease assessment.
     %“begin–table*˝[htbp]
     %“centering
     %“caption–Performance of CoT Based Large Language Models in Binary Classification Tasks˝
     %“label–tab:model˙performance˙2˝
     %“begin–tabular˝–llccccc˝
     %“toprule
     %“textbf–Model˝ &amp; “textbf–Category˝ &amp; “textbf–Precision˝ &amp; “textbf–Recall˝ &amp; “textbf–F1-score˝ &amp; “textbf–Accuracy˝ &amp; “textbf–AUC˝ ““
     %“midrule
     %“multirow–4˝–*˝–QWEN2-7B zero shot prompting˝
     %**** Result.tex Line 75 ****
     %&amp; 0.5 VS 1 &amp; 0.55 &amp; 0.49 &amp; 0.39 &amp; 0.58 &amp; 0.51 ““
     %&amp; 0.5 VS 2 &amp; 0.64 &amp; 0.53 &amp; 0.42 &amp; 0.54 &amp; 0.53 ““
     %&amp; 0.5 VS 3 &amp; 0.62 &amp; 0.54 &amp; 0.41 &amp; 0.47 &amp; 0.54 ““
     %&amp; 1 VS 3   &amp; 0.50 &amp; 0.50 &amp; 0.44 &amp; 0.44 &amp; 0.50 ““
     %“midrule
     %“multirow–4˝–*˝–Microsoft Phi-3B (CoT)˝
     %&amp; 0.5 VS 1 &amp; 0.23 &amp; 0.50 &amp; 0.32 &amp; 0.46 &amp; 0.50 ““
     %&amp; 0.5 VS 2 &amp; 0.59 &amp; 0.58 &amp; 0.56 &amp; 0.59 &amp; 0.58 ““
     %&amp; 0.5 VS 3 &amp; 1.00 &amp; 0.33 &amp; 0.50 &amp; 0.33 &amp; 0.00 ““
     %&amp; 1 VS 3   &amp; 0.56 &amp; 0.56 &amp; 0.53 &amp; 0.53 &amp; 0.56 ““
     %“midrule
     %“multirow–4˝–*˝–QWEN3-4B (CoT)˝
     %&amp; 0.5 VS 1 &amp; 0.43 &amp; 0.45 &amp; 0.42 &amp; 0.49 &amp; 0.45 ““
     %&amp; 0.5 VS 2 &amp; 0.58 &amp; 0.53 &amp; “textbf–0.45˝ &amp; “textbf–0.56˝ &amp; 0.53 ““
     %&amp; 0.5 VS 3 &amp; 0.72 &amp; 0.55 &amp; 0.40 &amp; 0.47 &amp; 0.55 ““
     %&amp; 1 VS 3   &amp;0.46  &amp;0.47  &amp;0.32  &amp;0.32  &amp;0.47  ““
     %“midrule
     %“multirow–4˝–*˝–QWEN2-7B (CoT)˝
     %&amp; 0.5 VS 1 &amp; “textbf–0.60˝ &amp; “textbf–0.56˝ &amp; “textbf–0.54˝ &amp; “textbf–0.61˝ &amp; “textbf–0.56˝ ““
     %&amp; 0.5 VS 2 &amp; 0.29 &amp; 0.45 &amp; 0.35 &amp; 0.54 &amp; 0.45 ““
     %&amp; 0.5 VS 3 &amp; 0.54 &amp; 0.54 &amp; “textbf–0.53˝ &amp; “textbf–0.54˝ &amp; 0.54 ““
     %&amp; 1 VS 3   &amp; “textbf–0.58˝ &amp; “textbf–0.57˝ &amp; “textbf–0.50˝ &amp; “textbf–0.50˝ &amp; “textbf–0.57˝ ““
     %“bottomrule
     %“end–tabular˝
     %“end–table*˝
     %**** Result.tex Line 100 ****-->    <table inlist="lot" labels="LABEL:tab:model_performance_2" placement="htbp" xml:id="S4.T2">
      <tags>
        <tag>TABLE II</tag>
        <tag role="autoref">Table II</tag>
        <tag role="refnum">II</tag>
        <tag role="typerefnum">TABLE II</tag>
      </tags>
      <toccaption class="ltx_centering"><tag close=" ">II</tag>Performance of CoT-Based Large Language Models in Binary Classification Tasks</toccaption>
      <caption class="ltx_centering"><tag close=": ">TABLE II</tag>Performance of CoT-Based Large Language Models in Binary Classification Tasks</caption>
      <tabular class="ltx_centering ltx_guessed_headers" vattach="middle">
        <thead>
          <tr>
            <td align="left" border="tt" thead="column"><text font="bold">CDR Group</text></td>
            <td align="left" border="tt" thead="column"><text font="bold">Models</text></td>
            <td align="center" border="tt" thead="column"><text font="bold">Precision</text></td>
            <td align="center" border="tt" thead="column"><text font="bold">Recall</text></td>
            <td align="center" border="tt" thead="column"><text font="bold">F1-score</text></td>
            <td align="center" border="tt" thead="column"><text font="bold">Accuracy</text></td>
            <td align="center" border="tt" thead="column"><text font="bold">AUC</text></td>
          </tr>
        </thead>
        <tbody>
          <tr>
            <td align="left" border="t" rowspan="4">0.5 vs. 1</td>
            <td align="left" border="t">QWEN2-7B zero shot prompting</td>
            <td align="center" border="t">0.55</td>
            <td align="center" border="t">0.49</td>
            <td align="center" border="t">0.39</td>
            <td align="center" border="t">0.58</td>
            <td align="center" border="t">0.51</td>
          </tr>
          <tr>
            <td align="left">Microsoft Phi-3B (CoT)</td>
            <td align="center">0.23</td>
            <td align="center">0.50</td>
            <td align="center">0.32</td>
            <td align="center">0.46</td>
            <td align="center">0.50</td>
          </tr>
          <tr>
            <td align="left">QWEN3-4B (CoT)</td>
            <td align="center">0.43</td>
            <td align="center">0.45</td>
            <td align="center">0.42</td>
            <td align="center">0.49</td>
            <td align="center">0.45</td>
          </tr>
          <tr>
            <td align="left">QWEN2-7B (CoT)</td>
            <td align="center"><text font="bold">0.60</text></td>
            <td align="center"><text font="bold">0.56</text></td>
            <td align="center"><text font="bold">0.54</text></td>
            <td align="center"><text font="bold">0.61</text></td>
            <td align="center"><text font="bold">0.56</text></td>
          </tr>
          <tr>
            <td align="left" border="t" rowspan="4">0.5 vs. 2</td>
            <td align="left" border="t">QWEN2-7B zero shot prompting</td>
            <td align="center" border="t">0.64</td>
            <td align="center" border="t">0.53</td>
            <td align="center" border="t">0.42</td>
            <td align="center" border="t">0.54</td>
            <td align="center" border="t">0.53</td>
          </tr>
          <tr>
            <td align="left">Microsoft Phi-3B (CoT)</td>
            <td align="center">0.59</td>
            <td align="center"><text font="bold">0.58</text></td>
            <td align="center"><text font="bold">0.56</text></td>
            <td align="center"><text font="bold">0.59</text></td>
            <td align="center"><text font="bold">0.58</text></td>
          </tr>
          <tr>
            <td align="left">QWEN3-4B (CoT)</td>
            <td align="center">0.58</td>
            <td align="center">0.53</td>
            <td align="center">0.45</td>
            <td align="center">0.56</td>
            <td align="center">0.53</td>
          </tr>
          <tr>
            <td align="left">QWEN2-7B (CoT)</td>
            <td align="center">0.29</td>
            <td align="center">0.45</td>
            <td align="center">0.35</td>
            <td align="center">0.54</td>
            <td align="center">0.45</td>
          </tr>
          <tr>
            <td align="left" border="t" rowspan="4">0.5 vs. 3</td>
            <td align="left" border="t">QWEN2-7B zero shot prompting</td>
            <td align="center" border="t">0.62</td>
            <td align="center" border="t">0.54</td>
            <td align="center" border="t">0.41</td>
            <td align="center" border="t">0.47</td>
            <td align="center" border="t">0.54</td>
          </tr>
          <tr>
            <td align="left">Microsoft Phi-3B (CoT)</td>
            <td align="center">1.00</td>
            <td align="center">0.33</td>
            <td align="center">0.50</td>
            <td align="center">0.33</td>
            <td align="center">NaN</td>
          </tr>
          <tr>
            <td align="left">QWEN3-4B (CoT)</td>
            <td align="center"><text font="bold">0.72</text></td>
            <td align="center"><text font="bold">0.55</text></td>
            <td align="center">0.40</td>
            <td align="center">0.47</td>
            <td align="center"><text font="bold">0.55</text></td>
          </tr>
          <tr>
            <td align="left">QWEN2-7B (CoT)</td>
            <td align="center">0.54</td>
            <td align="center">0.54</td>
            <td align="center"><text font="bold">0.53</text></td>
            <td align="center"><text font="bold">0.54</text></td>
            <td align="center">0.54</td>
          </tr>
          <tr>
            <td align="left" border="bb t" rowspan="4">1 vs. 3</td>
            <td align="left" border="t">QWEN2-7B zero shot prompting</td>
            <td align="center" border="t">0.50</td>
            <td align="center" border="t">0.50</td>
            <td align="center" border="t">0.44</td>
            <td align="center" border="t">0.44</td>
            <td align="center" border="t">0.50</td>
          </tr>
          <tr>
            <td align="left">Microsoft Phi-3B (CoT)</td>
            <td align="center">0.56</td>
            <td align="center">0.56</td>
            <td align="center"><text font="bold">0.53</text></td>
            <td align="center"><text font="bold">0.53</text></td>
            <td align="center">0.56</td>
          </tr>
          <tr>
            <td align="left">QWEN3-4B (CoT)</td>
            <td align="center">0.46</td>
            <td align="center">0.47</td>
            <td align="center">0.32</td>
            <td align="center">0.32</td>
            <td align="center">0.47</td>
          </tr>
          <tr>
            <td align="left" border="bb">QWEN2-7B (CoT)</td>
            <td align="center" border="bb"><text font="bold">0.58</text></td>
            <td align="center" border="bb"><text font="bold">0.57</text></td>
            <td align="center" border="bb">0.50</td>
            <td align="center" border="bb">0.50</td>
            <td align="center" border="bb"><text font="bold">0.57</text></td>
          </tr>
        </tbody>
      </tabular>
    </table>
    <figure inlist="lof" labels="LABEL:fig:f1" xml:id="S4.F3">
      <tags>
        <tag>Fig. 3</tag>
        <tag role="autoref">Figure 3</tag>
        <tag role="refnum">3</tag>
        <tag role="typerefnum">Fig. 3</tag>
      </tags>
      <graphics candidates="graph/f1.png" class="ltx_centering" graphic="graph/f1.png" options="width=433.62pt" xml:id="S4.F3.g1"/>
      <toccaption class="ltx_centering"><tag close=" ">3</tag>F1 Score Improvement of CoT-Based Models over Zero-Shot Prompting</toccaption>
      <caption class="ltx_centering"><tag close=": ">Fig. 3</tag>F1 Score Improvement of CoT-Based Models over Zero-Shot Prompting</caption>
    </figure>
<!--  %**** Result.tex Line 150 **** 
     %“begin–table*˝[htbp]
     %“centering
     %“caption–Performance of CoT Based Large Language Models in Binary Classification Tasks˝
     %“label–tab:model˙performance˙2˝
     %“begin–tabular˝–llccccc˝
     %“toprule
     %“textbf–Model˝ &amp; “textbf–Category˝ &amp; “textbf–Precision˝ &amp; “textbf–Recall˝ &amp; “textbf–F1-score˝ &amp; “textbf–Accuracy˝ &amp; “textbf–AUC˝ ““
     %“midrule
     %“multirow–4˝–*˝–QWEN2-7B zero shot prompting˝
     %&amp; QWEN2-7B zero shot prompting &amp; 0.55 &amp; 0.49 &amp; 0.39 &amp; 0.58 &amp; 0.51 ““
     %&amp; QWEN2-7B zero shot prompting &amp; 0.64 &amp; 0.53 &amp; 0.42 &amp; 0.54 &amp; 0.53 ““
     %&amp; QWEN2-7B zero shot prompting &amp; 0.62 &amp; 0.54 &amp; 0.41 &amp; 0.47 &amp; 0.54 ““
     %&amp; QWEN2-7B zero shot prompting   &amp; 0.50 &amp; 0.50 &amp; 0.44 &amp; 0.44 &amp; 0.50 ““
     %“midrule
     %“multirow–4˝–*˝–Microsoft Phi-3B (CoT)˝
     %&amp; Microsoft Phi-3B (CoT) &amp; 0.23  &amp; 0.50  &amp; 0.32  &amp; 0.46  &amp; 0.50 (-0.01) ““
     %&amp; Microsoft Phi-3B (CoT) &amp; 0.59  &amp; 0.58 (+0.05) &amp; 0.56 (+0.14) &amp; 0.59 (+0.05) &amp; 0.58 (+0.05) ““
     %&amp; Microsoft Phi-3B (CoT) &amp; 1.00  &amp; 0.33 (-0.21) &amp; 0.50 (+0.09) &amp; 0.33 (-0.14) &amp; 0.00 (-0.54) ““
     %&amp; Microsoft Phi-3B (CoT)   &amp; 0.56  &amp; 0.56 (+0.06) &amp; 0.53 (+0.09) &amp; 0.53 (+0.09) &amp; 0.56 (+0.06) ““
     %“midrule
     %“multirow–4˝–*˝–QWEN3-4B (CoT)˝
     %&amp; QWEN3-4B (CoT) &amp; 0.43 (-0.12) &amp; 0.45 (-0.04) &amp; 0.42 (+0.03) &amp; 0.49 (-0.09) &amp; 0.45 (-0.06) ““
     %&amp; QWEN3-4B (CoT) &amp; 0.58 (-0.06) &amp; 0.53 (+0.00) &amp; 0.45 (+0.03) &amp; 0.56 (+0.02) &amp; 0.53 (+0.00) ““
     %&amp; QWEN3-4B (CoT) &amp; 0.72 (+0.10) &amp; 0.55 (+0.01) &amp; 0.40 (-0.01) &amp; 0.47 (+0.00) &amp; 0.55 (+0.01) ““
     %**** Result.tex Line 175 ****
     %&amp; QWEN3-4B (CoT)   &amp; 0.46 (-0.04) &amp; 0.47 (-0.03) &amp; 0.32 (-0.12) &amp; 0.32 (-0.12) &amp; 0.47 (-0.03) ““
     %“midrule
     %“multirow–4˝–*˝–QWEN2-7B (CoT)˝
     %&amp; QWEN2-7B (CoT) &amp; “textbf–0.60˝ (+0.05) &amp; “textbf–0.56˝ (+0.07) &amp; “textbf–0.54˝ (+0.15) &amp; “textbf–0.61˝ (+0.03) &amp; “textbf–0.56˝ (+0.05) ““
     %&amp; QWEN2-7B (CoT) &amp; 0.29 (-0.35) &amp; 0.45 (-0.08) &amp; 0.35 (-0.07) &amp; 0.54 (+0.00) &amp; 0.45 (-0.08) ““
     %&amp; QWEN2-7B (CoT) &amp; 0.54 (-0.08) &amp; 0.54 (+0.00) &amp; “textbf–0.53˝ (+0.12) &amp; “textbf–0.54˝ (+0.07) &amp; 0.54 (+0.00) ““
     %&amp; QWEN2-7B (CoT)   &amp; “textbf–0.58˝ (+0.08) &amp; “textbf–0.57˝ (+0.07) &amp; “textbf–0.50˝ (+0.06) &amp; “textbf–0.50˝ (+0.06) &amp; “textbf–0.57˝ (+0.07) ““
     %“bottomrule
     %“end–tabular˝
     %“end–table*˝-->    <para xml:id="S4.p2">
      <p>To evaluate the effectiveness of the proposed CoT framework, we conducted a comprehensive comparison across multiple LLMs under two distinct settings: standard zero-shot prompting, involving single-shot inference without explicit reasoning steps, and CoT-enhanced reasoning, involving stepwise intermediate reasoning. Experimental results are presented in Table <ref labelref="LABEL:tab:model_performance_2"/> and F1 scores across different models are shown in Figure <ref labelref="LABEL:fig:f1"/>. Findings demonstrate significant performance gains through CoT integration, particularly in distinguishing binary classification tasks across different CDR levels.</p>
    </para>
    <para xml:id="S4.p3">
      <p>The CoT-based diagnostic system outperformed traditional prompting methods across most evaluation metrics, achieving superior F1 scores, accuracy, and AUC values. For instance, in the 0.5 vs. 1 classification task, the Qwen2-7B (CoT) model achieved an F1 score of 0.54 and accuracy of 0.61, outperforming the zero-shot version (F1 = 0.39, accuracy = 0.58) by 0.15 in F1 score, demonstrating the effectiveness of explicit reasoning in fine-grained clinical differentiation. Similarly, the Qwen3-4B (CoT) model showed substantial improvement in fine-grained discrimination tasks like 0.5 vs. 2 (F1 = 0.45, accuracy = 0.56). These results indicate that structured reasoning effectively enhances models’ ability to distinguish borderline dementia stages. The Qwen2-7B (CoT) model exhibits the most stable performance characteristics, maintaining balanced precision and recall across all task pairs. Notably, both metrics reach at least 0.57 in the 1 vs. 3 classification task. This stability indicates the model’s ability to handle broader CDR variations while preserving decision reliability. In contrast, the Qwen3-4B (CoT) model performs better on moderate classification tasks (e.g., 0.5 vs. 2) but shows declining performance on more distant class pairs (e.g., 1 vs. 3), suggesting that its smaller parameter size may limit its generalization ability across varying degrees of severity.</p>
    </para>
    <para xml:id="S4.p4">
      <p>In contrast, the Phi-3B (CoT) model exhibits significant performance fluctuations, revealing its limitations in complex diagnostic reasoning tasks. Although the model achieves competitive recall rates in certain subtasks, its precision and AUC values fluctuate dramatically, indicating unstable decision boundaries and inconsistent reasoning chains. For instance, the Phi-3B model achieved an F1 score of 0.56 in the 0.5 vs 2 task, yet its performance plummeted to 0.50 in the 0.5 vs 3 task. These inconsistencies suggest that small-scale models may struggle to maintain logical coherence across multi-step reasoning sequences, highlighting the need for sufficient model capacity to ensure the stability of CoT-based reasoning in medical diagnostic scenarios.</p>
    </para>
    <para xml:id="S4.p5">
      <p>Analysis of cross-model F1 score trends reveals model-scale effects and algorithmic influences. Qwen2-7B (CoT) consistently maintains the highest and most stable F1 score performance across all CDR tasks, indicating that greater model capacity enables construction of more coherent reasoning chains and enhances diagnostic discrimination capabilities. In contrast, Qwen3-4B (CoT) shows moderate gains at moderate discrepancy levels (0.5 vs 2) but exhibits weaker stability on distant category pairs (1 vs 3), indicating limited generalization in scenarios with severe cognitive discrepancies. Meanwhile, Phi-3B (CoT) exhibits fluctuating F1 scores, reflecting the sensitivity of smaller architectures to reasoning depth and task complexity. These trends collectively demonstrate that parameter scale and CoT prompt design synergistically enhance reasoning fidelity and stability. Notably, models exceeding 7 billion parameters appear to strike a critical balance between reasoning diversity and consistency, yielding more reliable diagnostic outputs.</p>
    </para>
    <para xml:id="S4.p6">
      <p>Simultaneously, we observed the performance of multi-stage CoT frameworks on heterogeneous CDR classification tasks. Although absolute F1 scores fluctuated with task difficulty, CoT-enhanced models (particularly Qwen2-7B) avoided failure and maintained balanced precision-recall curves across all four binary settings. In contrast, the smaller Phi-3B (CoT) model exhibited significant instability, indicating poor discrimination capabilities under the most challenging scenarios. These results demonstrate that multi-stage reasoning structures, when combined with sufficient model capacity, can mitigate extreme performance fluctuations and maintain consistent decision boundaries across both adjacent and distant CDR comparisons.</p>
    </para>
    <para xml:id="S4.p7">
      <p>CoT reasoning enhances diagnostic consistency and interpretability. Unlike traditional LLM prompts relying solely on implicit statistical correlations, CoT-based reasoning generates transparent intermediate reasoning paths that mirror clinicians’ cognitive processes during dementia assessments. Furthermore, the multi-stage design of CoT reasoning mitigates output instability by enforcing internal logical consistency between reasoning steps. In contrast, zero-shot prompts lacking CoT guidance often yield inconsistent and less reliable predictions due to the absence of structured reasoning supervision.
<!--  %**** Result.tex Line 200 **** --></p>
    </para>
<!--  %Beyond quantitative classification metrics, we further evaluated textual consistency between generated assessment narratives (“Generated Final A”) and original clinician records (“Original A”) using ROUGE-L scores. The results are shown in Figure “ref–fig: rougel˝. Almost all binary classification tasks, generated assessment reports consistently achieved ROUGE-L scores ¿0.09, indicating stable lexical overlap and alignment with clinically meaningful reasoning structures. Notably, the 1 vs 3 classification task achieved an exceptional ROUGE-L score exceeding 0.95, reflecting textual coherence and consistency between model-generated reasoning and original expert assessments. In contrast, zero-shot prompting methods significantly underperformed in text-level consistency, with ROUGE-L scores below 0.70 across all tasks. This substantial performance gap demonstrates that explicit thought chain reasoning not only enhances diagnostic interpretability but also ensures generated clinical narratives maintain structural and semantic consistency. 
     %“begin–figure*˝
     %“centering
     %“includegraphics[width=“linewidth]–graph/rougeL˙comparison.png˝
     %“caption–ROUGE-L across Binary CDR Tasks˝
     %“label–fig: rougel˝
     %“end–figure*˝
     %“input–section/Research and analysis˝-->  </section>
  <section inlist="toc" xml:id="S5">
    <tags>
      <tag>V</tag>
      <tag role="autoref">section V</tag>
      <tag role="refnum">V</tag>
      <tag role="typerefnum">§V</tag>
    </tags>
    <title><tag close=" ">V</tag><text font="smallcaps">Discussion</text></title>
<!--  %Experimental results demonstrate that the proposed Chain-of-Thought (CoT) reasoning-based diagnostic framework significantly enhances model interpretability and diagnostic consistency in Alzheimer’s disease (AD) assessment, exhibiting distinct advantages over conventional fine-tuning approaches. While traditional biomedical pre-trained language models (BioPLMs), such as BioBERT and LLaMA-7B, exhibit high accuracy in distinguishing clearly differentiated CDR grades, their performance fluctuates considerably when confronted with clinically ambiguous thresholds (e.g., CDR 0.5 vs. 1.0). This phenomenon reveals the limitations of black-box classification models, which primarily rely on superficial textual features while lacking clinical reasoning capabilities. -->    <para xml:id="S5.p1">
      <p><text color="#000000">Clinical dementia assessments based on electronic health records rely on interpreting cognitive and functional behavioral descriptions recorded in routine clinical practice. These descriptions are often heterogeneous, implicit, and context-dependent, posing challenges for automated evaluation. In such scenarios, effective assessment depends not only on predictive accuracy but also on the stability of reasoning processes, the transparency of intermediate judgements, and the ability to trace diagnostic conclusions back to observable clinical evidence. To address clinical behavioral assessment, this study employs a structured, multi-stage CoT reasoning process to organize intermediate inferences and support consistent interpretation of clinical narratives. This design enables systematic analysis of complex behavioral information through explicit, auditable reasoning steps.</text></p>
    </para>
    <para xml:id="S5.p2">
      <p>Experimental results validate the effectiveness of integrating CoT reasoning into large language models for Alzheimer’s disease assessment. Compared to existing diagnostic frameworks based on CoT or reasoning, this system focuses on structured electronic health record analysis rather than speech or synthetic datasets <cite class="ltx_citemacro_cite">[<bibref bibrefs="park2025reasoning" separator="," yyseparator=","/>]</cite>, providing evaluations with greater clinical evidence value. Furthermore, unlike studies on general medical reasoning, our model is specifically optimized for Alzheimer’s disease staging and validated across diverse patient datasets, demonstrating robustness and scalability <cite class="ltx_citemacro_cite">[<bibref bibrefs="li2025care" separator="," yyseparator=","/>]</cite>. From a clinical perspective, the enhancement of reasoning stability increases clinician confidence, reduces diagnostic variability among raters, and thereby strengthens the interpretability and reliability of AI-assisted assessments. These findings highlight the practical potential of LLM-based diagnostic systems for real-world dementia evaluation while maintaining transparency and clinical interpretability.</p>
    </para>
<!--  %Compared to traditional black-box fine-tuning approaches, the CoT framework introduces a transparent diagnostic process that closely aligns with the cognitive reasoning methods employed by clinicians. By explicitly generating structured intermediate reasoning steps, the model not only enhances the interpretability of diagnostic outcomes but also improves decision stability in challenging, fine-grained CDR classification tasks. -->    <para xml:id="S5.p3">
      <p>This study demonstrates that integrating CoT reasoning into large language models significantly enhances their diagnostic stability and interpretability in CDR assessments. Beyond numerical performance gains, the CoT framework transforms conventional large language models into structured reasoning systems capable of articulating clinical evidence, weighing diagnostic clues, and integrating multi-perspective judgments. Consistent improvements observed across both Qwen2-7B and Qwen3-4B models indicate that CoT integration offers benefits independent of specific model architectures.</p>
    </para>
    <para xml:id="S5.p4">
      <p>The CoT-based diagnostic system introduces a transparent reasoning mechanism aligned with clinical expert decision-making processes. By explicitly generating intermediate reasoning paths and employing multi-stage evaluation, this system transforms the previously opaque prediction process into a traceable, verifiable chain of reasoning, significantly enhancing the model’s credibility in medical applications. Simultaneously, the multi-agent design effectively mitigates the randomness inherent in large language models during reasoning, leading to more stable performance on complex borderline tasks.</p>
    </para>
    <para xml:id="S5.p5">
      <p>It should be noted that while CoT reasoning incurs additional computational overhead (due to multiple inference steps), this trade-off is acceptable as the resulting gains in diagnostic reliability and interpretability hold significant value for clinical applications. This approach bridges automated prediction with explainable AI, providing a scalable technical foundation for future deployment in real-world medical settings.</p>
    </para>
<!--  %Nevertheless, this study has certain limitations. First, despite introducing explicit reasoning mechanisms, the model’s ability to distinguish between adjacent CDR levels remains dependent on the diversity and scale of training data. Second, current CoT reasoning is based solely on text inputs and has not yet integrated imaging information such as MRI or PET scans, which are crucial for AD diagnosis. Future research should focus on integrating CoT reasoning with multimodal fusion models and longitudinal EHR data to comprehensively characterize disease progression. Additionally, external validation on larger, multicenter clinical datasets will further enhance the system’s generalization and clinical applicability. -->    <para xml:id="S5.p6">
      <p>Although these findings are encouraging, the study has limitations. The dataset size is relatively small, and the analysis relies solely on text-based electronic health records (EHRs), excluding imaging data such as magnetic resonance imaging (MRI) or positron emission tomography (PET). Furthermore, external validation using independent cohorts is still required to confirm the generalizability of the proposed approach. Future research will explore multimodal integration (e.g., combining EHR and MRI features) and human-machine collaborative validation frameworks to enhance interpretability and clinical reliability.</p>
    </para>
  </section>
  <section inlist="toc" xml:id="S6">
    <tags>
      <tag>VI</tag>
      <tag role="autoref">section VI</tag>
      <tag role="refnum">VI</tag>
      <tag role="typerefnum">§VI</tag>
    </tags>
    <title><tag close=" ">VI</tag><text font="smallcaps">Conclusion</text></title>
    <para xml:id="S6.p1">
      <p>This study proposes a CoT reasoning framework based on large language models for the clinical assessment and diagnosis of AD. Unlike traditional black-box fine-tuning approaches, this system explicitly simulates the clinician’s reasoning process by generating intermediate diagnostic inferences and multi-layer evaluations. Experimental results demonstrate that the proposed CoT diagnostic system not only significantly enhances model interpretability and decision transparency but also exhibits higher consistency and stability in distinguishing between adjacent CDR grades. Enhancing reasoning transparency improves interpretability at the individual diagnostic level and also establishes a scalable framework for trustworthy clinical artificial intelligence. This advancement facilitates the integration of interpretable large language models into healthcare systems.</p>
    </para>
    <para xml:id="S6.p2">
      <p>By transforming the model’s reasoning process into a structured, traceable chain of reasoning, this study establishes a novel connection between automated prediction and explainable artificial intelligence, providing a scalable technical foundation for large language models in clinical decision support systems.</p>
    </para>
    <para xml:id="S6.p3">
      <p>Future research will further expand the CoT reasoning framework and integrate longitudinal EHR information with imaging features (e.g., MRI) to achieve multimodal diagnostic fusion. This will enable a more comprehensive characterization of disease dynamics and advance precision medicine applications in dementia diagnosis. Additionally, external validation using multicenter datasets will be conducted, and the design of human-machine collaborative reasoning systems will be explored, laying the groundwork for practical deployment in clinical settings.</p>
    </para>
<!--  %“section–Introduction˝ 
     %**** conference˙101719.tex Line 175 ****
     %This document is a model and instructions for “LaTeX.
     %Please observe the conference page limits.
     %“section–Ease of Use˝
     %“subsection–Maintaining the Integrity of the Specifications˝
     %The IEEEtran class file is used to format your paper and style the text. All margins,
     %column widths, line spaces, and text fonts are prescribed; please do not
     %alter them. You may note peculiarities. For example, the head margin
     %measures proportionately more than is customary. This measurement
     %and others are deliberate, using specifications that anticipate your paper
     %as one part of the entire proceedings, and not as an independent document.
     %Please do not revise any of the current designations.
     %“section–Prepare Your Paper Before Styling˝
     %Before you begin to format your paper, first write and save the content as a
     %separate text file. Complete all content and organizational editing before
     %formatting. Please note sections “ref–AA˝__“ref–SCM˝ below for more information on
     %proofreading, spelling and grammar.
     %Keep your text and graphic files separate until after the text has been
     %formatted and styled. Do not number text heads__–“LaTeX˝ will do that
     %for you.
     %**** conference˙101719.tex Line 200 ****
     %“subsection–Abbreviations and Acronyms˝“label–AA˝
     %Define abbreviations and acronyms the first time they are used in the text,
     %even after they have been defined in the abstract. Abbreviations such as
     %IEEE, SI, MKS, CGS, ac, dc, and rms do not have to be defined. Do not use
     %abbreviations in the title or heads unless they are unavoidable.
     %“subsection–Units˝
     %“begin–itemize˝
     %“item Use either SI (MKS) or CGS as primary units. (SI units are encouraged.) English units may be used as secondary units (in parentheses). An exception would be the use of English units as identifiers in trade, such as ‘‘3.5-inch disk drive’’.
     %“item Avoid combining SI and CGS units, such as current in amperes and magnetic field in oersteds. This often leads to confusion because equations do not balance dimensionally. If you must use mixed units, clearly state the units for each quantity that you use in an equation.
     %“item Do not mix complete spellings and abbreviations of units: ‘‘Wb/m“textsuperscript–2˝’’ or ‘‘webers per square meter’’, not ‘‘webers/m“textsuperscript–2˝’’. Spell out units when they appear in text: ‘‘. . . a few henries’’, not ‘‘. . . a few H’’.
     %“item Use a zero before decimal points: ‘‘0.25’’, not ‘‘.25’’. Use ‘‘cm“textsuperscript–3˝’’, not ‘‘cc’’.)
     %“end–itemize˝
     %“subsection–Equations˝
     %Number equations consecutively. To make your
     %equations more compact, you may use the solidus (~/~), the exp function, or
     %appropriate exponents. Italicize Roman symbols for quantities and variables,
     %but not Greek symbols. Use a long dash rather than a hyphen for a minus
     %sign. Punctuate equations with commas or periods when they are part of a
     %sentence, as in:
     %“begin–equation˝
     %a+b=“gamma“label–eq˝
     %“end–equation˝
     %**** conference˙101719.tex Line 225 ****
     %Be sure that the
     %symbols in your equation have been defined before or immediately following
     %the equation. Use ‘‘“eqref–eq˝’’, not ‘‘Eq.~“eqref–eq˝’’ or ‘‘equation “eqref–eq˝’’, except at
     %the beginning of a sentence: ‘‘Equation “eqref–eq˝ is . . .’’
     %“subsection–“LaTeX-Specific Advice˝
     %Please use ‘‘soft’’ (e.g., “verb—“eqref–Eq˝—) cross references instead
     %of ‘‘hard’’ references (e.g., “verb—(1)—). That will make it possible
     %to combine sections, add equations, or change the order of figures or
     %citations without having to go through the file line by line.
     %Please don’t use the “verb—–eqnarray˝— equation environment. Use
     %“verb—–align˝— or “verb—–IEEEeqnarray˝— instead. The “verb—–eqnarray˝—
     %environment leaves unsightly spaces around relation symbols.
     %Please note that the “verb—–subequations˝— environment in –“LaTeX˝
     %will increment the main equation counter even when there are no
     %equation numbers displayed. If you forget that, you might write an
     %article in which the equation numbers skip from (17) to (20), causing
     %the copy editors to wonder if you’ve discovered a new method of
     %counting.
     %–“BibTeX˝ does not work by magic. It doesn’t get the bibliographic
     %data from thin air but from .bib files. If you use –“BibTeX˝ to produce a
     %**** conference˙101719.tex Line 250 ****
     %bibliography you must send the .bib files.
     %–“LaTeX˝ can’t read your mind. If you assign the same label to a
     %subsubsection and a table, you might find that Table I has been cross
     %referenced as Table IV-B3.
     %–“LaTeX˝ does not have precognitive abilities. If you put a
     %“verb—“label— command before the command that updates the counter it’s
     %supposed to be using, the label will pick up the last counter to be
     %cross referenced instead. In particular, a “verb—“label— command
     %should not go before the caption of a figure or a table.
     %Do not use “verb—“nonumber— inside the “verb—–array˝— environment. It
     %will not stop equation numbers inside “verb—–array˝— (there won’t be
     %any anyway) and it might stop a wanted equation number in the
     %surrounding equation.
     %“subsection–Some Common Mistakes˝“label–SCM˝
     %“begin–itemize˝
     %“item The word ‘‘data’’ is plural, not singular.
     %“item The subscript for the permeability of vacuum $“mu˙–0˝$, and other common scientific constants, is zero with subscript formatting, not a lowercase letter ‘‘o’’.
     %“item In American English, commas, semicolons, periods, question and exclamation marks are located within quotation marks only when a complete thought or name is cited, such as a title or full quotation. When quotation marks are used, instead of a bold or italic typeface, to highlight a word or phrase, punctuation should appear outside of the quotation marks. A parenthetical phrase or statement at the end of a sentence is punctuated outside of the closing parenthesis (like this). (A parenthetical sentence is punctuated within the parentheses.)
     %“item A graph within a graph is an ‘‘inset’’, not an ‘‘insert’’. The word alternatively is preferred to the word ‘‘alternately’’ (unless you really mean something that alternates).
     %“item Do not use the word ‘‘essentially’’ to mean ‘‘approximately’’ or ‘‘effectively’’.
     %“item In your paper title, if the words ‘‘that uses’’ can accurately replace the word ‘‘using’’, capitalize the ‘‘u’’; if not, keep using lower-cased.
     %**** conference˙101719.tex Line 275 ****
     %“item Be aware of the different meanings of the homophones ‘‘affect’’ and ‘‘effect’’, ‘‘complement’’ and ‘‘compliment’’, ‘‘discreet’’ and ‘‘discrete’’, ‘‘principal’’ and ‘‘principle’’.
     %“item Do not confuse ‘‘imply’’ and ‘‘infer’’.
     %“item The prefix ‘‘non’’ is not a word; it should be joined to the word it modifies, usually without a hyphen.
     %“item There is no period after the ‘‘et’’ in the Latin abbreviation ‘‘et al.’’.
     %“item The abbreviation ‘‘i.e.’’ means ‘‘that is’’, and the abbreviation ‘‘e.g.’’ means ‘‘for example’’.
     %“end–itemize˝
     %An excellent style manual for science writers is “cite–b7˝.
     %“subsection–Authors and Affiliations˝
     %“textbf–The class file is designed for, but not limited to, six authors.˝ A
     %minimum of one author is required for all conference articles. Author names
     %should be listed starting from left to right and then moving down to the
     %next line. This is the author sequence that will be used in future citations
     %and by indexing services. Names should not be listed in columns nor group by
     %affiliation. Please keep your affiliations as succinct as possible (for
     %example, do not differentiate among departments of the same organization).
     %“subsection–Identify the Headings˝
     %Headings, or heads, are organizational devices that guide the reader through
     %your paper. There are two types: component heads and text heads.
     %Component heads identify the different components of your paper and are not
     %topically subordinate to each other. Examples include Acknowledgments and
     %References and, for these, the correct style to use is ‘‘Heading 5’’. Use
     %‘‘figure caption’’ for your Figure captions, and ‘‘table head’’ for your
     %**** conference˙101719.tex Line 300 ****
     %table title. Run-in heads, such as ‘‘Abstract’’, will require you to apply a
     %style (in this case, italic) in addition to the style provided by the drop
     %down menu to differentiate the head from the text.
     %Text heads organize the topics on a relational, hierarchical basis. For
     %example, the paper title is the primary text head because all subsequent
     %material relates and elaborates on this one topic. If there are two or more
     %sub-topics, the next level head (uppercase Roman numerals) should be used
     %and, conversely, if there are not at least two sub-topics, then no subheads
     %should be introduced.
     %“subsection–Figures and Tables˝
     %“paragraph–Positioning Figures and Tables˝ Place figures and tables at the top and
     %bottom of columns. Avoid placing them in the middle of columns. Large
     %figures and tables may span across both columns. Figure captions should be
     %below the figures; table heads should appear above the tables. Insert
     %figures and tables after they are cited in the text. Use the abbreviation
     %‘‘Fig.~“ref–fig˝’’, even at the beginning of a sentence.
     %“begin–table˝[htbp]
     %“caption–Table Type Styles˝
     %“begin–center˝
     %“begin–tabular˝–—c—c—c—c—˝
     %“hline
     %“textbf–Table˝&amp;“multicolumn–3˝–—c—˝–“textbf–Table Column Head˝˝ ““
     %**** conference˙101719.tex Line 325 ****
     %“cline–2-4˝
     %“textbf–Head˝ &amp; “textbf–“textit–Table column subhead˝˝&amp; “textbf–“textit–Subhead˝˝&amp; “textbf–“textit–Subhead˝˝ ““
     %“hline
     %copy&amp; More table copy$^–“mathrm–a˝˝$&amp; &amp;  ““
     %“hline
     %“multicolumn–4˝–l˝–$^–“mathrm–a˝˝$Sample of a Table footnote.˝
     %“end–tabular˝
     %“label–tab1˝
     %“end–center˝
     %“end–table˝
     %“begin–figure˝[htbp]
     %“centerline–“includegraphics–fig1.png˝˝
     %“caption–Example of a figure caption.˝
     %“label–fig˝
     %“end–figure˝
     %Figure Labels: Use 8 point Times New Roman for Figure labels. Use words
     %rather than symbols or abbreviations when writing Figure axis labels to
     %avoid confusing the reader. As an example, write the quantity
     %‘‘Magnetization’’, or ‘‘Magnetization, M’’, not just ‘‘M’’. If including
     %units in the label, present them within parentheses. Do not label axes only
     %with units. In the example, write ‘‘Magnetization (A/m)’’ or ‘‘Magnetization
     %“–A[m(1)]“˝’’, not just ‘‘A/m’’. Do not label axes with a ratio of
     %quantities and units. For example, write ‘‘Temperature (K)’’, not
     %**** conference˙101719.tex Line 350 ****
     %‘‘Temperature/K’’.
     %“section*–Acknowledgment˝
     %“section*–References˝
     %Please number citations consecutively within brackets “cite–b1˝. The
     %sentence punctuation follows the bracket “cite–b2˝. Refer simply to the reference
     %number, as in “cite–b3˝__do not use ‘‘Ref. “cite–b3˝’’ or ‘‘reference “cite–b3˝’’ except at
     %the beginning of a sentence: ‘‘Reference “cite–b3˝ was the first $“ldots$’’
     %Number footnotes separately in superscripts. Place the actual footnote at
     %the bottom of the column in which it was cited. Do not put footnotes in the
     %abstract or reference list. Use letters for table footnotes.
     %Unless there are six authors or more give all authors’ names; do not use
     %‘‘et al.’’. Papers that have not been published, even if they have been
     %submitted for publication, should be cited as ‘‘unpublished’’ “cite–b4˝. Papers
     %that have been accepted for publication should be cited as ‘‘in press’’ “cite–b5˝.
     %Capitalize only the first word in a paper title, except for proper nouns and
     %element symbols.
     %For papers published in translation journals, please give the English
     %citation first, followed by the original foreign-language citation “cite–b6˝.
     %**** conference˙101719.tex Line 375 ****
     %“begin–thebibliography˝–00˝
     %“bibitem–b1˝ G. Eason, B. Noble, and I. N. Sneddon, ‘‘On certain integrals of Lipschitz-Hankel type involving products of Bessel functions,’’ Phil. Trans. Roy. Soc. London, vol. A247, pp. 529__551, April 1955.
     %“bibitem–b2˝ J. Clerk Maxwell, A Treatise on Electricity and Magnetism, 3rd ed., vol. 2. Oxford: Clarendon, 1892, pp.68__73.
     %“bibitem–b3˝ I. S. Jacobs and C. P. Bean, ‘‘Fine particles, thin films and exchange anisotropy,’’ in Magnetism, vol. III, G. T. Rado and H. Suhl, Eds. New York: Academic, 1963, pp. 271__350.
     %“bibitem–b4˝ K. Elissa, ‘‘Title of paper if known,’’ unpublished.
     %“bibitem–b5˝ R. Nicole, ‘‘Title of paper with only first word capitalized,’’ J. Name Stand. Abbrev., in press.
     %“bibitem–b6˝ Y. Yorozu, M. Hirano, K. Oka, and Y. Tagawa, ‘‘Electron spectroscopy studies on magneto-optical media and plastic substrate interface,’’ IEEE Transl. J. Magn. Japan, vol. 2, pp. 740__741, August 1987 [Digests 9th Annual Conf. Magnetics Japan, p. 301, 1982].
     %“bibitem–b7˝ M. Young, The Technical Writer’s Handbook. Mill Valley, CA: University Science, 1989.
     %“end–thebibliography˝
     %“vspace–12pt˝
     %“color–red˝
     %IEEE conference templates contain guidance text for composing and formatting conference papers. Please ensure that all template text is removed from your conference paper prior to submission to the conference. Failure to remove the template text from your paper may result in your paper not being published.-->  </section>
  <bibliography bibstyle="IEEEtran" citestyle="numbers" files="ref" xml:id="bib">
    <title>References</title>
  </bibliography>
</document>
