<?xml version="1.0" encoding="UTF-8"?>
<?latexml searchpaths="/home/japhy/scienceReplication.artiswrong.com/paper_files/arxiv/2602.14655/latex_extracted"?>
<!--  %Template for ICASSP-2026 paper; to be used with: --><!--  %spconf.sty  - ICASSP/ICIP LaTeX style file, and --><!--  %IEEEbib.bst - IEEE bibliography style file. --><!--  %__ --><?latexml class="article"?>
<?latexml package="spconf,amsmath,graphicx,hyperref"?>
<?latexml package="amssymb"?>
<?latexml package="multirow"?>
<?latexml package="booktabs"?>
<?latexml package="ulem" options="normalem"?>
<?latexml RelaxNGSchema="LaTeXML"?>
<document xmlns="http://dlmf.nist.gov/LaTeXML" class="ltx_authors_1line">
  <resource src="LaTeXML.css" type="text/css"/>
  <resource src="ltx-article.css" type="text/css"/>
  <resource src="ltx-ulem.css" type="text/css"/>
  <title>Breaking Data Efficiency Dilemma: A Federated and Augmented Learning Framework For Alzheimer’s Disease Detection via Speech</title>
  <abstract name="Abstract">
    <p>Early diagnosis of Alzheimer’s Disease (AD) is crucial for delaying its progression. While AI-based speech detection is non-invasive and cost-effective, it faces a critical data efficiency dilemma due to medical data scarcity and privacy barriers. Therefore, we propose FAL-AD, a novel framework that synergistically integrates federated learning with data augmentation to systematically optimize data efficiency. Our approach delivers three key breakthroughs: First, absolute efficiency improvement through voice conversion-based augmentation, which generates diverse pathological speech samples via cross-category voice-content recombination. Second, collaborative efficiency breakthrough via an adaptive federated learning paradigm, maximizing cross-institutional benefits under privacy constraints. Finally, representational efficiency optimization by an attentive cross-modal fusion model, which achieves fine-grained word-level alignment and acoustic-textual interaction. Evaluated on ADReSSo, FAL-AD achieves a state-of-the-art multi-modal accuracy of 91.52%, outperforming all centralized baselines and demonstrating a practical solution to the data efficiency dilemma. Our source code is publicly available at <ref class="ltx_url" font="typewriter" href="https://github.com/smileix/fal-ad">https://github.com/smileix/fal-ad</ref>.</p>
  </abstract>
  <ERROR class="undefined">\useunder</ERROR>
  <para xml:id="p1">
    <p><text class="ltx_ulem_uline"/><ERROR class="undefined">\ul</ERROR>
<!--  %Example definitions. 
     %__-->

<!--  %Title. 
     %__-->
<!--  %“title–Federated Learning with Data Augmentation for Alzheimer’s Detection from Spontaneous Speech˝ 
     %基于自发语音的联邦AD检测
     %**** icassp.tex Line 25 ****
     %Single address.
     %__--><ERROR class="undefined">\name</ERROR><tabular vattach="middle">
        <tbody>
          <tr>
            <td align="center">Xiao Wei<Math mode="inline" tex="{}^{1,2}" text="^list@(1, 2)" xml:id="p1.m1">
                <XMath>
                  <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
                    <XMDual>
                      <XMApp>
                        <XMTok meaning="list"/>
                        <XMRef idref="p1.m1.1"/>
                        <XMRef idref="p1.m1.2"/>
                      </XMApp>
                      <XMWrap>
                        <XMTok fontsize="70%" meaning="1" role="NUMBER" xml:id="p1.m1.1">1</XMTok>
                        <XMTok fontsize="70%" role="PUNCT">,</XMTok>
                        <XMTok fontsize="70%" meaning="2" role="NUMBER" xml:id="p1.m1.2">2</XMTok>
                      </XMWrap>
                    </XMDual>
                  </XMApp>
                </XMath>
              </Math>, Bin Wen<Math mode="inline" tex="{}^{1,2}" text="^list@(1, 2)" xml:id="p1.m2">
                <XMath>
                  <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
                    <XMDual>
                      <XMApp>
                        <XMTok meaning="list"/>
                        <XMRef idref="p1.m2.1"/>
                        <XMRef idref="p1.m2.2"/>
                      </XMApp>
                      <XMWrap>
                        <XMTok fontsize="70%" meaning="1" role="NUMBER" xml:id="p1.m2.1">1</XMTok>
                        <XMTok fontsize="70%" role="PUNCT">,</XMTok>
                        <XMTok fontsize="70%" meaning="2" role="NUMBER" xml:id="p1.m2.2">2</XMTok>
                      </XMWrap>
                    </XMDual>
                  </XMApp>
                </XMath>
              </Math>, Yuqin Lin<Math mode="inline" tex="{}^{2,3}" text="^list@(2, 3)" xml:id="p1.m3">
                <XMath>
                  <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
                    <XMDual>
                      <XMApp>
                        <XMTok meaning="list"/>
                        <XMRef idref="p1.m3.1"/>
                        <XMRef idref="p1.m3.2"/>
                      </XMApp>
                      <XMWrap>
                        <XMTok fontsize="70%" meaning="2" role="NUMBER" xml:id="p1.m3.1">2</XMTok>
                        <XMTok fontsize="70%" role="PUNCT">,</XMTok>
                        <XMTok fontsize="70%" meaning="3" role="NUMBER" xml:id="p1.m3.2">3</XMTok>
                      </XMWrap>
                    </XMDual>
                  </XMApp>
                </XMath>
              </Math>, Kai Li<Math mode="inline" tex="{}^{2}" text="^2" xml:id="p1.m4">
                <XMath>
                  <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
                    <XMTok fontsize="70%" meaning="2" role="NUMBER">2</XMTok>
                  </XMApp>
                </XMath>
              </Math>, Mingyang Gu<Math mode="inline" tex="{}^{1,2}" text="^list@(1, 2)" xml:id="p1.m5">
                <XMath>
                  <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
                    <XMDual>
                      <XMApp>
                        <XMTok meaning="list"/>
                        <XMRef idref="p1.m5.1"/>
                        <XMRef idref="p1.m5.2"/>
                      </XMApp>
                      <XMWrap>
                        <XMTok fontsize="70%" meaning="1" role="NUMBER" xml:id="p1.m5.1">1</XMTok>
                        <XMTok fontsize="70%" role="PUNCT">,</XMTok>
                        <XMTok fontsize="70%" meaning="2" role="NUMBER" xml:id="p1.m5.2">2</XMTok>
                      </XMWrap>
                    </XMDual>
                  </XMApp>
                </XMath>
              </Math>,</td>
          </tr>
          <tr>
            <td align="center">Xiaobao Wang<Math mode="inline" tex="{}^{1}" text="^1" xml:id="p1.m6">
                <XMath>
                  <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
                    <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                  </XMApp>
                </XMath>
              </Math>, Longbiao Wang<Math mode="inline" tex="{}^{1,4,*}" text="^list@(1, 4, *)" xml:id="p1.m7">
                <XMath>
                  <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
                    <XMDual>
                      <XMApp>
                        <XMTok meaning="list"/>
                        <XMRef idref="p1.m7.1"/>
                        <XMRef idref="p1.m7.2"/>
                        <XMRef idref="p1.m7.3"/>
                      </XMApp>
                      <XMWrap>
                        <XMTok fontsize="70%" meaning="1" role="NUMBER" xml:id="p1.m7.1">1</XMTok>
                        <XMTok fontsize="70%" role="PUNCT">,</XMTok>
                        <XMTok fontsize="70%" meaning="4" role="NUMBER" xml:id="p1.m7.2">4</XMTok>
                        <XMTok fontsize="70%" role="PUNCT">,</XMTok>
                        <XMTok fontsize="70%" meaning="times" role="MULOP" xml:id="p1.m7.3">*</XMTok>
                      </XMWrap>
                    </XMDual>
                  </XMApp>
                </XMath>
              </Math>, Jianwu Dang<Math mode="inline" tex="{}^{2,*}" text="^list@(2, *)" xml:id="p1.m8">
                <XMath>
                  <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
                    <XMDual>
                      <XMApp>
                        <XMTok meaning="list"/>
                        <XMRef idref="p1.m8.1"/>
                        <XMRef idref="p1.m8.2"/>
                      </XMApp>
                      <XMWrap>
                        <XMTok fontsize="70%" meaning="2" role="NUMBER" xml:id="p1.m8.1">2</XMTok>
                        <XMTok fontsize="70%" role="PUNCT">,</XMTok>
                        <XMTok fontsize="70%" meaning="times" role="MULOP" xml:id="p1.m8.2">*</XMTok>
                      </XMWrap>
                    </XMDual>
                  </XMApp>
                </XMath>
              </Math><note role="thanks">* Corresponding authors.</note></td>
          </tr>
        </tbody>
      </tabular>
<ERROR class="undefined">\address</ERROR>
<Math mode="inline" tex="{}^{1}" text="^1" xml:id="p1.m9">
        <XMath>
          <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
            <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
          </XMApp>
        </XMath>
      </Math>Tianjin Key Laboratory of Cognitive Computing and Application, <break/>College of Intelligence and Computing, Tianjin University, Tianjin, China<break/><Math mode="inline" tex="{}^{2}" text="^2" xml:id="p1.m10">
        <XMath>
          <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
            <XMTok fontsize="70%" meaning="2" role="NUMBER">2</XMTok>
          </XMApp>
        </XMath>
      </Math>Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, Shenzhen, China<break/><Math mode="inline" tex="{}^{3}" text="^3" xml:id="p1.m11">
        <XMath>
          <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
            <XMTok fontsize="70%" meaning="3" role="NUMBER">3</XMTok>
          </XMApp>
        </XMath>
      </Math>College of Computer and Data Science, Fuzhou University, Fuzhou, China<break/><Math mode="inline" tex="{}^{4}" text="^4" xml:id="p1.m12">
        <XMath>
          <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
            <XMTok fontsize="70%" meaning="4" role="NUMBER">4</XMTok>
          </XMApp>
        </XMath>
      </Math>Huiyan Technology (Tianjin) Co., Ltd, Tianjin, China
<!--  %“thanks–Thanks to XYZ agency for funding.˝˝ -->
<ERROR class="undefined">\ninept</ERROR>
<text font="bold">Index Terms</text>— Alzheimer’s Disease Detection, Federated Learning, Data Augmentation, Voice Conversion, Speech and Language Analysis
<!--  %**** icassp.tex Line 50 **** --></p>
  </para>
  <section inlist="toc" xml:id="S1">
    <tags>
      <tag>1</tag>
      <tag role="autoref">section 1</tag>
      <tag role="refnum">1</tag>
      <tag role="typerefnum">§1</tag>
    </tags>
    <title><tag close=" ">1</tag>Introduction</title>
    <para xml:id="S1.p1">
      <p>Alzheimer’s Disease (AD) is a prevalent neurodegenerative disease that severely impacts patients’ memory, cognition and behavior, posing a substantial burden on global public health systems. Due to its irreversible nature and prolonged preclinical stage, early diagnosis is critical for delaying disease progression and improving patients’ quality of life <cite class="ltx_citemacro_cite">[<bibref bibrefs="de2020artificial" separator="," yyseparator=","/>]</cite>. In recent years, owing to its non-invasiveness, low cost and scalability, spontaneous speech analysis has emerged as a promising tool for AD screening <cite class="ltx_citemacro_cite">[<bibref bibrefs="yang2022deep" separator="," yyseparator=","/>]</cite>, where deep learning methods can capture subtle signs of cognitive decline, demonstrating significant potential for practical application.</p>
    </para>
    <para xml:id="S1.p2">
      <p>High-performance deep learning diagnostic models rely heavily on large-scale, high-quality annotated speech data; however, the lack of such data is a fundamental bottleneck in current research. This bottleneck reflects a data efficiency dilemma in medical AI, arising from inherent data scarcity and privacy-related barriers. This challenge manifests at two interrelated levels. First, the absolute lack of efficiency, i.e., data scarcity: collecting AD speech data requires rigorous clinical coordination, resulting in high costs and long cycles, which inherently limits the amount of available data. This seriously constrains the model’s learning capacity, leading to overfitting and weak generalization <cite class="ltx_citemacro_cite">[<bibref bibrefs="luz2021alzheimer" separator="," yyseparator=","/>]</cite>. Second, the relative efficiency loss, i.e., data silos: due to medical privacy regulations and institutional barriers, already scarce data are further fragmented across hospitals and research centers, preventing collaborative integration and scale benefits <cite class="ltx_citemacro_cite">[<bibref bibrefs="teo2024federated" separator="," yyseparator=","/>]</cite>. The two issues reinforce each other, creating a vicious cycle.</p>
    </para>
    <para xml:id="S1.p3">
      <p>Existing studies have yet to systematically address this dilemma. Most mainstream efforts focus on designing more complex models (e.g., <cite class="ltx_citemacro_cite">[<bibref bibrefs="chen2021automatic,braun2024infusing" separator="," yyseparator=","/>]</cite>) to extract more discriminative features, or exploring multi-modal fusion to capture cross-modal interactions (e.g., <cite class="ltx_citemacro_cite">[<bibref bibrefs="cai2023exploring,chatzianastasis2023neural" separator="," yyseparator=","/>]</cite>), yet largely overlook the fundamental data efficiency dilemma. Although recent work such as <cite class="ltx_citemacro_cite">[<bibref bibrefs="meerza2022fair,hsu2024cluster,ouyang2023design" separator="," yyseparator=","/>]</cite> has begun to explore federated learning (FL) to mitigate privacy barriers, these approaches only establish a basic collaborative framework without resolving the inherent problem of data scarcity. Moreover, most methods adopt simple late fusion strategies, failing to fully exploit the representational capacity of the data. Another study <cite class="ltx_citemacro_cite">[<bibref bibrefs="kalabakov2024comparative" separator="," yyseparator=","/>]</cite> also highlights that standard FL may under-perform in highly heterogeneous and scarce data environments. None of these methods offers a comprehensive solution to enhance data utilization efficacy.</p>
    </para>
    <figure inlist="lof" labels="LABEL:model LABEL:model:a LABEL:model:b LABEL:model:c" placement="htp" xml:id="S1.F1">
      <tags>
        <tag>Figure 1</tag>
        <tag role="autoref">Figure 1</tag>
        <tag role="refnum">1</tag>
        <tag role="typerefnum">Figure 1</tag>
      </tags>
      <inline-para class="ltx_minipage" vattach="middle" width="143.1pt">
        <para align="center" xml:id="S1.F1.p1">
          <p class="ltx_align_center"><graphics candidates="1a.pdf" graphic="1a.pdf" options="width=433.62pt" xml:id="S1.F1.p1.g1"/></p>
          <p class="ltx_align_center">(a) Data Augmentation</p>
        </para>
      </inline-para>
      <inline-para class="ltx_minipage" vattach="middle" width="121.4pt">
        <para align="center" xml:id="S1.F1.p2">
          <p class="ltx_align_center"><graphics candidates="1b.pdf" graphic="1b.pdf" options="width=433.62pt" xml:id="S1.F1.p2.g1"/></p>
          <p class="ltx_align_center">(b) Federated Learning</p>
        </para>
      </inline-para>
<!--  %**** icassp.tex Line 75 **** -->      <inline-para class="ltx_minipage" vattach="middle" width="143.1pt">
        <para align="center" xml:id="S1.F1.p3">
          <p class="ltx_align_center"><graphics candidates="1c.pdf" graphic="1c.pdf" options="width=433.62pt" xml:id="S1.F1.p3.g1"/></p>
          <p class="ltx_align_center">(c) Cross-Modal Fusion Model</p>
        </para>
      </inline-para>
      <toccaption><tag close=" ">1</tag>Overview of the proposed FAL-AD framework. (a) Cross-category Voice Conversion-based Data Augmentation: generates pathological speech samples by recombining speaker features from one category and content features from another. (b) Adaptive Federated Learning: clients collaboratively train under privacy constraints and select their optimal model from the federation history. (c) Cross-Modal Fusion Model: depicts the word-level alignment and attentive fusion process of acoustic and textual features.</toccaption>
      <caption><tag close=": ">Figure 1</tag>Overview of the proposed FAL-AD framework. (a) Cross-category Voice Conversion-based Data Augmentation: generates pathological speech samples by recombining speaker features from one category and content features from another. (b) Adaptive Federated Learning: clients collaboratively train under privacy constraints and select their optimal model from the federation history. (c) Cross-Modal Fusion Model: depicts the word-level alignment and attentive fusion process of acoustic and textual features.</caption>
    </figure>
    <para xml:id="S1.p4">
      <p>To fundamentally address the data efficiency bottleneck, we propose the <text font="bold">F</text>ederated and <text font="bold">A</text>ugmented <text font="bold">L</text>earning framework for <text font="bold">A</text>lzheimer’s <text font="bold">D</text>isease Detection (<text font="bold">FAL-AD</text>), whose core idea is to comprehensively optimize the data efficiency from generation to utilization. To our knowledge, this is the first work to systematically explore data efficiency. Our contributions are threefold:</p>
      <itemize xml:id="S1.I1">
        <item xml:id="S1.I1.i1">
          <tags>
            <tag>•</tag>
            <tag role="autoref">item </tag>
            <tag role="typerefnum">1st item</tag>
          </tags>
          <para xml:id="S1.I1.i1.p1">
            <p>Absolute efficiency improvement: we construct a cross-category voice conversion-based data augmentation strategy to generate high-quality pathological samples, effectively expanding the data volume and diversity.</p>
          </para>
        </item>
        <item xml:id="S1.I1.i2">
          <tags>
            <tag>•</tag>
            <tag role="autoref">item </tag>
            <tag role="typerefnum">2nd item</tag>
          </tags>
          <para xml:id="S1.I1.i2.p1">
            <p>Collaborative efficiency breakthrough: we design an adaptive federated learning paradigm that establishes a cross-silo collaboration mechanism, connecting isolated data silos into an efficient cooperative network.</p>
          </para>
        </item>
        <item xml:id="S1.I1.i3">
          <tags>
            <tag>•</tag>
            <tag role="autoref">item </tag>
            <tag role="typerefnum">3rd item</tag>
          </tags>
          <para xml:id="S1.I1.i3.p1">
            <p>Representational efficiency optimization: we improve an attention-based cross-modal fusion model to achieve fine-grained alignment and deep interaction, ensuring maximal knowledge acquisition from limited data.</p>
          </para>
        </item>
      </itemize>
    </para>
  </section>
  <section inlist="toc" xml:id="S2">
    <tags>
      <tag>2</tag>
      <tag role="autoref">section 2</tag>
      <tag role="refnum">2</tag>
      <tag role="typerefnum">§2</tag>
    </tags>
    <title><tag close=" ">2</tag>The Federated and Augmented Framework</title>
    <subsection inlist="toc" xml:id="S2.SS1">
      <tags>
        <tag>2.1</tag>
        <tag role="autoref">subsection 2.1</tag>
        <tag role="refnum">2.1</tag>
        <tag role="typerefnum">§2.1</tag>
      </tags>
      <title><tag close=" ">2.1</tag>Overview</title>
<!--  %**** icassp.tex Line 100 **** -->      <para xml:id="S2.SS1.p1">
        <p>Fig. <ref labelref="LABEL:model"/> illustrates the overall framework of our proposed FAL-AD, which consists of three core modules: (1) a data augmentation module, responsible for expanding the training data; (2) a federated learning module, used for collaborative model training; and (3) a cross-modal fusion module, used for final classification decision. Each module is described in detail below.</p>
      </para>
    </subsection>
    <subsection inlist="toc" xml:id="S2.SS2">
      <tags>
        <tag>2.2</tag>
        <tag role="autoref">subsection 2.2</tag>
        <tag role="refnum">2.2</tag>
        <tag role="typerefnum">§2.2</tag>
      </tags>
      <title><tag close=" ">2.2</tag>Data Augmentation with Cross-category Voice Conversion</title>
      <para xml:id="S2.SS2.p1">
        <p>To address the scarcity of AD speech data, we employ a voice data augmentation strategy. While text-to-speech (TTS) synthesis is commonly used for augmentation, TTS systems are typically trained on healthy speech samples and offer limited controllability. Even after adjusting parameters such as speech rate and pause duration, the synthesized speech inevitably exhibits artificial characteristics and fails to capture the authentic pathological patterns of real AD patients. To more effectively generate disease-relevant vocal manifestations while minimizing interference from non-pathological factors (e.g., speaker identity), we introduce voice conversion (VC) technology, as shown in Fig. <ref labelref="LABEL:model:a"/>(a). This method preserves the linguistic and pathological content of speech while transforming speaker-dependent characteristics, thereby enabling the model to focus on clinically relevant features.</p>
      </para>
      <para xml:id="S2.SS2.p2">
        <p>Specifically, our voice conversion is implemented using the CosyVoice 2.0B model <cite class="ltx_citemacro_cite">[<bibref bibrefs="du2024cosyvoice" separator="," yyseparator=","/>]</cite>. Let <Math mode="inline" tex="V_{i}=(S_{i},C_{i},Y_{i})" text="V _ i = vector@(S _ i, C _ i, Y _ i)" xml:id="S2.SS2.p2.m1">
            <XMath>
              <XMApp>
                <XMTok meaning="equals" role="RELOP">=</XMTok>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">V</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                </XMApp>
                <XMDual>
                  <XMApp>
                    <XMTok meaning="vector"/>
                    <XMRef idref="S2.SS2.p2.m1.1"/>
                    <XMRef idref="S2.SS2.p2.m1.2"/>
                    <XMRef idref="S2.SS2.p2.m1.3"/>
                  </XMApp>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="false">(</XMTok>
                    <XMApp xml:id="S2.SS2.p2.m1.1">
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" role="UNKNOWN">S</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                    </XMApp>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMApp xml:id="S2.SS2.p2.m1.2">
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" role="UNKNOWN">C</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                    </XMApp>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMApp xml:id="S2.SS2.p2.m1.3">
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" role="UNKNOWN">Y</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                    </XMApp>
                    <XMTok role="CLOSE" stretchy="false">)</XMTok>
                  </XMWrap>
                </XMDual>
              </XMApp>
            </XMath>
          </Math> denote an original speech sample, where <Math mode="inline" tex="Y_{i}\in\{\text{AD},\text{CN}\}" text="Y _ i element-of set@([AD], [CN])" xml:id="S2.SS2.p2.m2">
            <XMath>
              <XMApp>
                <XMTok meaning="element-of" name="in" role="RELOP">∈</XMTok>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">Y</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                </XMApp>
                <XMDual>
                  <XMApp>
                    <XMTok meaning="set"/>
                    <XMRef idref="S2.SS2.p2.m2.1"/>
                    <XMRef idref="S2.SS2.p2.m2.2"/>
                  </XMApp>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="false">{</XMTok>
                    <XMText xml:id="S2.SS2.p2.m2.1">AD</XMText>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMText xml:id="S2.SS2.p2.m2.2">CN</XMText>
                    <XMTok role="CLOSE" stretchy="false">}</XMTok>
                  </XMWrap>
                </XMDual>
              </XMApp>
            </XMath>
          </Math> (CN stands for cognitively normal), <Math mode="inline" tex="S_{i}" text="S _ i" xml:id="S2.SS2.p2.m3">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">S</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
              </XMApp>
            </XMath>
          </Math> denotes the speaker characteristics (including timbre and identity-related features), and <Math mode="inline" tex="C_{i}" text="C _ i" xml:id="S2.SS2.p2.m4">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">C</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
              </XMApp>
            </XMath>
          </Math> represents the content aspects (including linguistic content, rhythm, pauses, and pathology-related acoustic features). During augmentation,
for source sample <Math mode="inline" tex="V_{i}" text="V _ i" xml:id="S2.SS2.p2.m5">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">V</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
              </XMApp>
            </XMath>
          </Math> and target sample <Math mode="inline" tex="V_{j}" text="V _ j" xml:id="S2.SS2.p2.m6">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">V</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">j</XMTok>
              </XMApp>
            </XMath>
          </Math> with the opposite category (<Math mode="inline" tex="Y_{i}\neq Y_{j}" text="Y _ i not-equals Y _ j" xml:id="S2.SS2.p2.m7">
            <XMath>
              <XMApp>
                <XMTok meaning="not-equals" name="neq" role="RELOP">≠</XMTok>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">Y</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                </XMApp>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">Y</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">j</XMTok>
                </XMApp>
              </XMApp>
            </XMath>
          </Math>), we employ voice conversion to recombine the speaker characteristics of the target audio <Math mode="inline" tex="S_{j}" text="S _ j" xml:id="S2.SS2.p2.m8">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">S</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">j</XMTok>
              </XMApp>
            </XMath>
          </Math> with the content features of the source audio <Math mode="inline" tex="C_{i}" text="C _ i" xml:id="S2.SS2.p2.m9">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">C</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
              </XMApp>
            </XMath>
          </Math>, thereby generating a new augmented sample:</p>
      </para>
      <para xml:id="S2.SS2.p3">
        <equation xml:id="S2.E1">
          <tags>
            <tag>(1)</tag>
            <tag role="autoref">Equation 1</tag>
            <tag role="refnum">1</tag>
          </tags>
          <Math mode="display" tex="\widetilde{V}_{i,j}=\mathcal{VC}(V_{i},V_{j})=(S_{j},C_{i},Y_{i})," text="(widetilde@(V)) _ (list@(i, j)) = V * C * open-interval@(V _ i, V _ j) = vector@(S _ j, C _ i, Y _ i)" xml:id="S2.E1.m1">
            <XMath>
              <XMDual>
                <XMRef idref="S2.E1.m1.3"/>
                <XMWrap>
                  <XMApp xml:id="S2.E1.m1.3">
                    <XMTok meaning="multirelation"/>
                    <XMApp>
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMApp>
                        <XMTok name="widetilde" role="OVERACCENT">~</XMTok>
                        <XMTok font="italic" role="UNKNOWN">V</XMTok>
                      </XMApp>
                      <XMDual>
                        <XMApp>
                          <XMTok meaning="list"/>
                          <XMRef idref="S2.E1.m1.1"/>
                          <XMRef idref="S2.E1.m1.2"/>
                        </XMApp>
                        <XMWrap>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN" xml:id="S2.E1.m1.1">i</XMTok>
                          <XMTok fontsize="70%" role="PUNCT">,</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN" xml:id="S2.E1.m1.2">j</XMTok>
                        </XMWrap>
                      </XMDual>
                    </XMApp>
                    <XMTok meaning="equals" role="RELOP">=</XMTok>
                    <XMApp>
                      <XMTok meaning="times" role="MULOP">⁢</XMTok>
                      <XMTok font="caligraphic" role="UNKNOWN">V</XMTok>
                      <XMTok font="caligraphic" role="UNKNOWN">C</XMTok>
                      <XMDual>
                        <XMApp>
                          <XMTok meaning="open-interval"/>
                          <XMRef idref="S2.E1.m1.3.1"/>
                          <XMRef idref="S2.E1.m1.3.2"/>
                        </XMApp>
                        <XMWrap>
                          <XMTok role="OPEN" stretchy="false">(</XMTok>
                          <XMApp xml:id="S2.E1.m1.3.1">
                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                            <XMTok font="italic" role="UNKNOWN">V</XMTok>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                          </XMApp>
                          <XMTok role="PUNCT">,</XMTok>
                          <XMApp xml:id="S2.E1.m1.3.2">
                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                            <XMTok font="italic" role="UNKNOWN">V</XMTok>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">j</XMTok>
                          </XMApp>
                          <XMTok role="CLOSE" stretchy="false">)</XMTok>
                        </XMWrap>
                      </XMDual>
                    </XMApp>
                    <XMTok meaning="equals" role="RELOP">=</XMTok>
                    <XMDual>
                      <XMApp>
                        <XMTok meaning="vector"/>
                        <XMRef idref="S2.E1.m1.3.3"/>
                        <XMRef idref="S2.E1.m1.3.4"/>
                        <XMRef idref="S2.E1.m1.3.5"/>
                      </XMApp>
                      <XMWrap>
                        <XMTok role="OPEN" stretchy="false">(</XMTok>
                        <XMApp xml:id="S2.E1.m1.3.3">
                          <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                          <XMTok font="italic" role="UNKNOWN">S</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">j</XMTok>
                        </XMApp>
                        <XMTok role="PUNCT">,</XMTok>
                        <XMApp xml:id="S2.E1.m1.3.4">
                          <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                          <XMTok font="italic" role="UNKNOWN">C</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                        </XMApp>
                        <XMTok role="PUNCT">,</XMTok>
                        <XMApp xml:id="S2.E1.m1.3.5">
                          <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                          <XMTok font="italic" role="UNKNOWN">Y</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                        </XMApp>
                        <XMTok role="CLOSE" stretchy="false">)</XMTok>
                      </XMWrap>
                    </XMDual>
                  </XMApp>
                  <XMTok role="PUNCT">,</XMTok>
                </XMWrap>
              </XMDual>
            </XMath>
          </Math>
        </equation>
        <p>where <Math mode="inline" tex="\mathcal{VC}(\cdot)" text="V * C * cdot" xml:id="S2.SS2.p3.m1">
            <XMath>
              <XMApp>
                <XMTok meaning="times" role="MULOP">⁢</XMTok>
                <XMTok font="caligraphic" role="UNKNOWN">V</XMTok>
                <XMTok font="caligraphic" role="UNKNOWN">C</XMTok>
                <XMDual>
                  <XMRef idref="S2.SS2.p3.m1.1"/>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="false">(</XMTok>
                    <XMTok name="cdot" role="MULOP" xml:id="S2.SS2.p3.m1.1">⋅</XMTok>
                    <XMTok role="CLOSE" stretchy="false">)</XMTok>
                  </XMWrap>
                </XMDual>
              </XMApp>
            </XMath>
          </Math> denotes the voice conversion function, <Math mode="inline" tex="Y_{i}\neq Y_{j}" text="Y _ i not-equals Y _ j" xml:id="S2.SS2.p3.m2">
            <XMath>
              <XMApp>
                <XMTok meaning="not-equals" name="neq" role="RELOP">≠</XMTok>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">Y</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                </XMApp>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">Y</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">j</XMTok>
                </XMApp>
              </XMApp>
            </XMath>
          </Math>, and the label of the augmented sample satisfies <Math mode="inline" tex="\widetilde{Y}_{i,j}=Y_{i}" text="(widetilde@(Y)) _ (list@(i, j)) = Y _ i" xml:id="S2.SS2.p3.m3">
            <XMath>
              <XMApp>
                <XMTok meaning="equals" role="RELOP">=</XMTok>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMApp>
                    <XMTok name="widetilde" role="OVERACCENT">~</XMTok>
                    <XMTok font="italic" role="UNKNOWN">Y</XMTok>
                  </XMApp>
                  <XMDual>
                    <XMApp>
                      <XMTok meaning="list"/>
                      <XMRef idref="S2.SS2.p3.m3.1"/>
                      <XMRef idref="S2.SS2.p3.m3.2"/>
                    </XMApp>
                    <XMWrap>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN" xml:id="S2.SS2.p3.m3.1">i</XMTok>
                      <XMTok fontsize="70%" role="PUNCT">,</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN" xml:id="S2.SS2.p3.m3.2">j</XMTok>
                    </XMWrap>
                  </XMDual>
                </XMApp>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">Y</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                </XMApp>
              </XMApp>
            </XMath>
          </Math>. Each sample in the dataset is treated in turn as the target sample, while a sample from the opposite category is randomly selected as the source sample to perform the aforementioned voice conversion.</p>
      </para>
      <para xml:id="S2.SS2.p4">
        <p>Through cross-category voice conversion, we generate both positive and negative class samples for each speaker, significantly enhancing the dataset’s diversity. This strategy also resolves class imbalance issues while ensuring models no longer favor majority classes. Crucially, by maintaining identical speaker timbre across both sample types, the technique forces models to concentrate exclusively on learning pathology-related acoustic features, such as AD-specific prosodic abnormalities and pause patterns, and reduces the risk of model overfitting to irrelevant speaker-specific features. This innovative decoupling of timbre and pathological traits not only ensures precise control over diagnostic characteristics but also substantially improves the model’s accuracy, robustness, and generalization capability to unseen samples.
<!--  %Achieves an effect similar to adversarial training --></p>
      </para>
    </subsection>
    <subsection inlist="toc" xml:id="S2.SS3">
      <tags>
        <tag>2.3</tag>
        <tag role="autoref">subsection 2.3</tag>
        <tag role="refnum">2.3</tag>
        <tag role="typerefnum">§2.3</tag>
      </tags>
      <title><tag close=" ">2.3</tag>Federated Learning with Adaptive Model Selection</title>
      <para xml:id="S2.SS3.p1">
        <p>We adopt Federated Learning (FL) as the foundational paradigm for collaborative training, as shown in Fig.&#160;<ref labelref="LABEL:model:b"/>(b). In the standard Federated Averaging (FedAvg) procedure <cite class="ltx_citemacro_cite">[<bibref bibrefs="mcmahan2017communication" separator="," yyseparator=","/>]</cite>, the following steps are executed at each communication round <Math mode="inline" tex="r" text="r" xml:id="S2.SS3.p1.m1">
            <XMath>
              <XMTok font="italic" role="UNKNOWN">r</XMTok>
            </XMath>
          </Math>:</p>
      </para>
      <para xml:id="S2.SS3.p2">
        <enumerate xml:id="S2.I1">
          <item xml:id="S2.I1.i1">
            <tags>
              <tag>1.</tag>
              <tag role="autoref">item 1</tag>
              <tag role="refnum">1</tag>
              <tag role="typerefnum">item 1</tag>
            </tags>
            <para xml:id="S2.I1.i1.p1">
              <p><text font="bold">Server Distribution</text>: The central server distributes the current global model parameters <Math mode="inline" tex="\omega^{r}" text="omega ^ r" xml:id="S2.I1.i1.p1.m1">
                  <XMath>
                    <XMApp>
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" name="omega" role="UNKNOWN">ω</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">r</XMTok>
                    </XMApp>
                  </XMath>
                </Math> to all participating clients.
<!--  %**** icassp.tex Line 125 **** --></p>
            </para>
          </item>
          <item xml:id="S2.I1.i2">
            <tags>
              <tag>2.</tag>
              <tag role="autoref">item 2</tag>
              <tag role="refnum">2</tag>
              <tag role="typerefnum">item 2</tag>
            </tags>
            <para xml:id="S2.I1.i2.p1">
              <p><text font="bold">Local Client Update</text>: Each client <Math mode="inline" tex="i" text="i" xml:id="S2.I1.i2.p1.m1">
                  <XMath>
                    <XMTok font="italic" role="UNKNOWN">i</XMTok>
                  </XMath>
                </Math> performs local training for <Math mode="inline" tex="E" text="E" xml:id="S2.I1.i2.p1.m2">
                  <XMath>
                    <XMTok font="italic" role="UNKNOWN">E</XMTok>
                  </XMath>
                </Math> epochs using its local dataset <Math mode="inline" tex="\mathcal{D}_{i}" text="D _ i" xml:id="S2.I1.i2.p1.m3">
                  <XMath>
                    <XMApp>
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="caligraphic" role="UNKNOWN">D</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                    </XMApp>
                  </XMath>
                </Math>, resulting in a set of local model updates <Math mode="inline" tex="\Delta\omega_{i}^{r}" text="Delta * (omega _ i) ^ r" xml:id="S2.I1.i2.p1.m4">
                  <XMath>
                    <XMApp>
                      <XMTok meaning="times" role="MULOP">⁢</XMTok>
                      <XMTok name="Delta" role="UNKNOWN">Δ</XMTok>
                      <XMApp>
                        <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                        <XMApp>
                          <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                          <XMTok font="italic" name="omega" role="UNKNOWN">ω</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                        </XMApp>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">r</XMTok>
                      </XMApp>
                    </XMApp>
                  </XMath>
                </Math>.</p>
            </para>
          </item>
          <item xml:id="S2.I1.i3">
            <tags>
              <tag>3.</tag>
              <tag role="autoref">item 3</tag>
              <tag role="refnum">3</tag>
              <tag role="typerefnum">item 3</tag>
            </tags>
            <para xml:id="S2.I1.i3.p1">
              <p><text font="bold">Model Aggregation</text>: The server aggregates the received model updates from the clients:</p>
              <equation xml:id="S2.E2">
                <tags>
                  <tag>(2)</tag>
                  <tag role="autoref">Equation 2</tag>
                  <tag role="refnum">2</tag>
                </tags>
                <Math mode="display" tex="\omega^{r+1}\leftarrow\omega^{r}+\sum\nolimits_{i=1}^{M}\frac{n_{i}}{n}\Delta%&#10;\omega_{i}^{r}," text="omega ^ (r + 1) leftarrow omega ^ r + ((sum _ (i = 1)) ^ M)@((n _ i / n) * Delta * (omega _ i) ^ r)" xml:id="S2.E2.m1">
                  <XMath>
                    <XMDual>
                      <XMRef idref="S2.E2.m1.1"/>
                      <XMWrap>
                        <XMApp xml:id="S2.E2.m1.1">
                          <XMTok name="leftarrow" role="ARROW">←</XMTok>
                          <XMApp>
                            <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                            <XMTok font="italic" name="omega" role="UNKNOWN">ω</XMTok>
                            <XMApp>
                              <XMTok fontsize="70%" meaning="plus" role="ADDOP">+</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">r</XMTok>
                              <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                            </XMApp>
                          </XMApp>
                          <XMApp>
                            <XMTok meaning="plus" role="ADDOP">+</XMTok>
                            <XMApp>
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                              <XMTok font="italic" name="omega" role="UNKNOWN">ω</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">r</XMTok>
                            </XMApp>
                            <XMApp>
                              <XMApp>
                                <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                <XMApp>
                                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                  <XMTok mathstyle="display" meaning="sum" role="SUMOP">∑</XMTok>
                                  <XMApp>
                                    <XMTok fontsize="70%" meaning="equals" role="RELOP">=</XMTok>
                                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                    <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                  </XMApp>
                                </XMApp>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">M</XMTok>
                              </XMApp>
                              <XMApp>
                                <XMTok meaning="times" role="MULOP">⁢</XMTok>
                                <XMApp>
                                  <XMTok mathstyle="display" meaning="divide" role="FRACOP"/>
                                  <XMApp>
                                    <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                                    <XMTok font="italic" role="UNKNOWN">n</XMTok>
                                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                  </XMApp>
                                  <XMTok font="italic" role="UNKNOWN">n</XMTok>
                                </XMApp>
                                <XMTok name="Delta" role="UNKNOWN">Δ</XMTok>
                                <XMApp>
                                  <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                  <XMApp>
                                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                    <XMTok font="italic" name="omega" role="UNKNOWN">ω</XMTok>
                                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                  </XMApp>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">r</XMTok>
                                </XMApp>
                              </XMApp>
                            </XMApp>
                          </XMApp>
                        </XMApp>
                        <XMTok role="PUNCT">,</XMTok>
                      </XMWrap>
                    </XMDual>
                  </XMath>
                </Math>
              </equation>
              <p>where <Math mode="inline" tex="n=\sum_{i=1}^{M}n_{i}" text="n = ((sum _ (i = 1)) ^ M)@(n _ i)" xml:id="S2.I1.i3.p1.m1">
                  <XMath>
                    <XMApp>
                      <XMTok meaning="equals" role="RELOP">=</XMTok>
                      <XMTok font="italic" role="UNKNOWN">n</XMTok>
                      <XMApp>
                        <XMApp>
                          <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                            <XMTok mathstyle="text" meaning="sum" role="SUMOP" scriptpos="post">∑</XMTok>
                            <XMApp>
                              <XMTok fontsize="70%" meaning="equals" role="RELOP">=</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                              <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                            </XMApp>
                          </XMApp>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">M</XMTok>
                        </XMApp>
                        <XMApp>
                          <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                          <XMTok font="italic" role="UNKNOWN">n</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                        </XMApp>
                      </XMApp>
                    </XMApp>
                  </XMath>
                </Math> denotes the total number of training samples across all clients, and <Math mode="inline" tex="n_{i}" text="n _ i" xml:id="S2.I1.i3.p1.m2">
                  <XMath>
                    <XMApp>
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" role="UNKNOWN">n</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                    </XMApp>
                  </XMath>
                </Math> is the number of samples on client <Math mode="inline" tex="i" text="i" xml:id="S2.I1.i3.p1.m3">
                  <XMath>
                    <XMTok font="italic" role="UNKNOWN">i</XMTok>
                  </XMath>
                </Math>.</p>
            </para>
          </item>
        </enumerate>
      </para>
      <para xml:id="S2.SS3.p3">
        <p>However, the standard FL framework solely outputs the final aggregated global model <Math mode="inline" tex="\omega^{G}" text="omega ^ G" xml:id="S2.SS3.p3.m1">
            <XMath>
              <XMApp>
                <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" name="omega" role="UNKNOWN">ω</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">G</XMTok>
              </XMApp>
            </XMath>
          </Math>, potentially overlooking superior intermediate models obtained during training. To mitigate this limitation, we introduce an <text font="bold">adaptive model selection strategy</text>, which operates as follows:</p>
      </para>
      <para xml:id="S2.SS3.p4">
        <enumerate xml:id="S2.I2">
          <item xml:id="S2.I2.i1">
            <tags>
              <tag>1.</tag>
              <tag role="autoref">item 1</tag>
              <tag role="refnum">1</tag>
              <tag role="typerefnum">item 1</tag>
            </tags>
            <para xml:id="S2.I2.i1.p1">
              <p><text font="bold">Performance Tracking</text>: In each federated round <Math mode="inline" tex="r" text="r" xml:id="S2.I2.i1.p1.m1">
                  <XMath>
                    <XMTok font="italic" role="UNKNOWN">r</XMTok>
                  </XMath>
                </Math>, after completing local training and before uploading the model to the server, each client <Math mode="inline" tex="i" text="i" xml:id="S2.I2.i1.p1.m2">
                  <XMath>
                    <XMTok font="italic" role="UNKNOWN">i</XMTok>
                  </XMath>
                </Math> evaluates the performance of its local model <Math mode="inline" tex="\omega_{i}^{r}" text="(omega _ i) ^ r" xml:id="S2.I2.i1.p1.m3">
                  <XMath>
                    <XMApp>
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="italic" name="omega" role="UNKNOWN">ω</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                      </XMApp>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">r</XMTok>
                    </XMApp>
                  </XMath>
                </Math> on a local validation set and retains a copy of the model. Subsequently, after the server completes global aggregation and distributes the new global model <Math mode="inline" tex="\omega^{r+1}" text="omega ^ (r + 1)" xml:id="S2.I2.i1.p1.m4">
                  <XMath>
                    <XMApp>
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" name="omega" role="UNKNOWN">ω</XMTok>
                      <XMApp>
                        <XMTok fontsize="70%" meaning="plus" role="ADDOP">+</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">r</XMTok>
                        <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                      </XMApp>
                    </XMApp>
                  </XMath>
                </Math>, each client downloads this global model and evaluates it on the same local validation set, also saving the result.</p>
            </para>
          </item>
          <item xml:id="S2.I2.i2">
            <tags>
              <tag>2.</tag>
              <tag role="autoref">item 2</tag>
              <tag role="refnum">2</tag>
              <tag role="typerefnum">item 2</tag>
            </tags>
            <para xml:id="S2.I2.i2.p1">
              <p><text font="bold">Optimal Model Selection</text>: This process repeats for a predefined number of communication rounds (e.g., <Math mode="inline" tex="R=30" text="R = 30" xml:id="S2.I2.i2.p1.m1">
                  <XMath>
                    <XMApp>
                      <XMTok meaning="equals" role="RELOP">=</XMTok>
                      <XMTok font="italic" role="UNKNOWN">R</XMTok>
                      <XMTok meaning="30" role="NUMBER">30</XMTok>
                    </XMApp>
                  </XMath>
                </Math> rounds). Upon conclusion of the federated training, each client <Math mode="inline" tex="i" text="i" xml:id="S2.I2.i2.p1.m2">
                  <XMath>
                    <XMTok font="italic" role="UNKNOWN">i</XMTok>
                  </XMath>
                </Math> selects the model with the best performance on its local validation set from all saved model snapshots as its ultimate deployment model, denoted <Math mode="inline" tex="\omega_{i}^{*}" text="(omega _ i) ^ *" xml:id="S2.I2.i2.p1.m3">
                  <XMath>
                    <XMApp>
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="italic" name="omega" role="UNKNOWN">ω</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                      </XMApp>
                      <XMTok fontsize="70%" meaning="times" role="MULOP">*</XMTok>
                    </XMApp>
                  </XMath>
                </Math>.</p>
            </para>
          </item>
        </enumerate>
      </para>
      <para xml:id="S2.SS3.p5">
        <p>Our adaptive strategy maximizes performance and adaptability by dynamically selecting the optimal model <Math mode="inline" tex="\omega_{i}^{*}" text="(omega _ i) ^ *" xml:id="S2.SS3.p5.m1">
            <XMath>
              <XMApp>
                <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" name="omega" role="UNKNOWN">ω</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                </XMApp>
                <XMTok fontsize="70%" meaning="times" role="MULOP">*</XMTok>
              </XMApp>
            </XMath>
          </Math> for each client, whether that is a personalized local model or a generalized global model. The approach remains simple and efficient, introducing no extra hyper-parameters and adding only minimal evaluation and storage overhead. Each client thus deploys a tailored model <Math mode="inline" tex="\omega_{i}^{*}" text="(omega _ i) ^ *" xml:id="S2.SS3.p5.m2">
            <XMath>
              <XMApp>
                <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" name="omega" role="UNKNOWN">ω</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                </XMApp>
                <XMTok fontsize="70%" meaning="times" role="MULOP">*</XMTok>
              </XMApp>
            </XMath>
          </Math> that ensures high performance and local relevance.</p>
      </para>
    </subsection>
    <subsection inlist="toc" xml:id="S2.SS4">
      <tags>
        <tag>2.4</tag>
        <tag role="autoref">subsection 2.4</tag>
        <tag role="refnum">2.4</tag>
        <tag role="typerefnum">§2.4</tag>
      </tags>
      <title><tag close=" ">2.4</tag>Attentive Cross-Modal Fusion Model</title>
      <para xml:id="S2.SS4.p1">
        <p>To effectively integrate information from speech and text modalities, we build upon the CogniAlign <cite class="ltx_citemacro_cite">[<bibref bibrefs="ortiz2025cognialign" separator="," yyseparator=","/>]</cite> framework, as shown in Fig.&#160;<ref labelref="LABEL:model:c"/>(c), and make targeted improvements to adapt it to the federated learning environment. The final fusion model achieves fine-grained multimodal interaction through the following mechanisms:</p>
      </para>
      <para xml:id="S2.SS4.p2">
        <p>First, for feature extraction and alignment, we begin with prosodic information modeling, where the Whisper <cite class="ltx_citemacro_cite">[<bibref bibrefs="radford2023robust" separator="," yyseparator=","/>]</cite> automatic speech recognition (ASR) model generates transcriptions with word-level timestamps. According to the timestamps, three types of pause markers (comma, period, and ellipsis) are inserted into the text sequence to capture pause and disfluency patterns <cite class="ltx_citemacro_cite">[<bibref bibrefs="yuan2020disfluencies" separator="," yyseparator=","/>]</cite>. Subsequently, we use pre-trained models to extract token-level textual features and frame-level acoustic features; the latter are then mean-pooled to the word level according to the timestamps to complete the alignment.</p>
      </para>
      <para xml:id="S2.SS4.p3">
        <p>Second, for feature fusion, we employ a gated cross-attention mechanism, implemented through a single-layer Transformer encoder, to enable deep interaction between modalities <cite class="ltx_citemacro_cite">[<bibref bibrefs="vaswani2017attention" separator="," yyseparator=","/>]</cite>. Specifically, the audio embeddings serve as the Query while the text embeddings provide the Key and Value. The gated cross-modal attention is computed as:</p>
        <equation xml:id="S2.E3">
          <tags>
            <tag>(3)</tag>
            <tag role="autoref">Equation 3</tag>
            <tag role="refnum">3</tag>
          </tags>
          <Math mode="display" tex="\mathbf{H}_{\text{att}}=\text{Attention}(\mathbf{A},\mathbf{T},\mathbf{T}),%&#10;\quad\mathbf{G}=\sigma(\mathbf{W}_{g}\mathbf{H}_{\text{att}}+\mathbf{b}_{g})," text="formulae@(H _ [att] = [Attention] * vector@(A, T, T), G = sigma * (W _ g * H _ [att] + b _ g))" xml:id="S2.E3.m1">
            <XMath>
              <XMDual>
                <XMRef idref="S2.E3.m1.4"/>
                <XMWrap>
                  <XMDual xml:id="S2.E3.m1.4">
                    <XMApp>
                      <XMTok meaning="formulae"/>
                      <XMRef idref="S2.E3.m1.4.1"/>
                      <XMRef idref="S2.E3.m1.4.2"/>
                    </XMApp>
                    <XMWrap>
                      <XMApp xml:id="S2.E3.m1.4.1">
                        <XMTok meaning="equals" role="RELOP">=</XMTok>
                        <XMApp>
                          <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
<!--  %**** icassp.tex Line 150 **** -->                          <XMTok font="bold" role="UNKNOWN">H</XMTok>
                          <XMText><text fontsize="70%">att</text></XMText>
                        </XMApp>
                        <XMApp>
                          <XMTok meaning="times" role="MULOP">⁢</XMTok>
                          <XMText>Attention</XMText>
                          <XMDual>
                            <XMApp>
                              <XMTok meaning="vector"/>
                              <XMRef idref="S2.E3.m1.1"/>
                              <XMRef idref="S2.E3.m1.2"/>
                              <XMRef idref="S2.E3.m1.3"/>
                            </XMApp>
                            <XMWrap>
                              <XMTok role="OPEN" stretchy="false">(</XMTok>
                              <XMTok font="bold" role="UNKNOWN" xml:id="S2.E3.m1.1">A</XMTok>
                              <XMTok role="PUNCT">,</XMTok>
                              <XMTok font="bold" role="UNKNOWN" xml:id="S2.E3.m1.2">T</XMTok>
                              <XMTok role="PUNCT">,</XMTok>
                              <XMTok font="bold" role="UNKNOWN" xml:id="S2.E3.m1.3">T</XMTok>
                              <XMTok role="CLOSE" stretchy="false">)</XMTok>
                            </XMWrap>
                          </XMDual>
                        </XMApp>
                      </XMApp>
                      <XMTok role="PUNCT" rpadding="10.0pt">,</XMTok>
                      <XMApp xml:id="S2.E3.m1.4.2">
                        <XMTok meaning="equals" role="RELOP">=</XMTok>
                        <XMTok font="bold" role="UNKNOWN">G</XMTok>
                        <XMApp>
                          <XMTok meaning="times" role="MULOP">⁢</XMTok>
                          <XMTok font="italic" name="sigma" role="UNKNOWN">σ</XMTok>
                          <XMDual>
                            <XMRef idref="S2.E3.m1.4.2.1"/>
                            <XMWrap>
                              <XMTok role="OPEN" stretchy="false">(</XMTok>
                              <XMApp xml:id="S2.E3.m1.4.2.1">
                                <XMTok meaning="plus" role="ADDOP">+</XMTok>
                                <XMApp>
                                  <XMTok meaning="times" role="MULOP">⁢</XMTok>
                                  <XMApp>
                                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                    <XMTok font="bold" role="UNKNOWN">W</XMTok>
                                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">g</XMTok>
                                  </XMApp>
                                  <XMApp>
                                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                    <XMTok font="bold" role="UNKNOWN">H</XMTok>
                                    <XMText><text fontsize="70%">att</text></XMText>
                                  </XMApp>
                                </XMApp>
                                <XMApp>
                                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                  <XMTok font="bold" role="UNKNOWN">b</XMTok>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">g</XMTok>
                                </XMApp>
                              </XMApp>
                              <XMTok role="CLOSE" stretchy="false">)</XMTok>
                            </XMWrap>
                          </XMDual>
                        </XMApp>
                      </XMApp>
                    </XMWrap>
                  </XMDual>
                  <XMTok role="PUNCT">,</XMTok>
                </XMWrap>
              </XMDual>
            </XMath>
          </Math>
        </equation>
        <equation xml:id="S2.E4">
          <tags>
            <tag>(4)</tag>
            <tag role="autoref">Equation 4</tag>
            <tag role="refnum">4</tag>
          </tags>
          <Math mode="display" tex="\mathbf{H}=\mathbf{G}\odot\mathbf{H}_{\text{att}}+(1-\mathbf{G})\odot\mathbf{A}," text="H = G direct-product H _ [att] + (1 - G) direct-product A" xml:id="S2.E4.m1">
            <XMath>
              <XMDual>
                <XMRef idref="S2.E4.m1.1"/>
                <XMWrap>
                  <XMApp xml:id="S2.E4.m1.1">
                    <XMTok meaning="equals" role="RELOP">=</XMTok>
                    <XMTok font="bold" role="UNKNOWN">H</XMTok>
                    <XMApp>
                      <XMTok meaning="plus" role="ADDOP">+</XMTok>
                      <XMApp>
                        <XMTok meaning="direct-product" name="odot" role="MULOP">⊙</XMTok>
                        <XMTok font="bold" role="UNKNOWN">G</XMTok>
                        <XMApp>
                          <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                          <XMTok font="bold" role="UNKNOWN">H</XMTok>
                          <XMText><text fontsize="70%">att</text></XMText>
                        </XMApp>
                      </XMApp>
                      <XMApp>
                        <XMTok meaning="direct-product" name="odot" role="MULOP">⊙</XMTok>
                        <XMDual>
                          <XMRef idref="S2.E4.m1.1.1"/>
                          <XMWrap>
                            <XMTok role="OPEN" stretchy="false">(</XMTok>
                            <XMApp xml:id="S2.E4.m1.1.1">
                              <XMTok meaning="minus" role="ADDOP">-</XMTok>
                              <XMTok meaning="1" role="NUMBER">1</XMTok>
                              <XMTok font="bold" role="UNKNOWN">G</XMTok>
                            </XMApp>
                            <XMTok role="CLOSE" stretchy="false">)</XMTok>
                          </XMWrap>
                        </XMDual>
                        <XMTok font="bold" role="UNKNOWN">A</XMTok>
                      </XMApp>
                    </XMApp>
                  </XMApp>
                  <XMTok role="PUNCT">,</XMTok>
                </XMWrap>
              </XMDual>
            </XMath>
          </Math>
        </equation>
        <p>where <text class="ltx_markedasmath" font="bold">A</text> and <text class="ltx_markedasmath" font="bold">T</text> respectively represent the word-level embedding sequences of audio and text, <Math mode="inline" tex="\sigma" text="sigma" xml:id="S2.SS4.p3.m3">
            <XMath>
              <XMTok font="italic" name="sigma" role="UNKNOWN">σ</XMTok>
            </XMath>
          </Math> denotes the sigmoid function, <Math mode="inline" tex="\textbf{W}_{g}" text="[W] _ g" xml:id="S2.SS4.p3.m4">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMText><text font="bold">W</text></XMText>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">g</XMTok>
              </XMApp>
            </XMath>
          </Math> and <Math mode="inline" tex="\textbf{b}_{g}" text="[b] _ g" xml:id="S2.SS4.p3.m5">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMText><text font="bold">b</text></XMText>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">g</XMTok>
              </XMApp>
            </XMath>
          </Math> are trainable parameters, and <Math mode="inline" tex="\odot" text="direct-product" xml:id="S2.SS4.p3.m6">
            <XMath>
              <XMTok meaning="direct-product" name="odot" role="MULOP">⊙</XMTok>
            </XMath>
          </Math> represents element-wise multiplication. Then, an attention pooling layer dynamically learns the importance of each token through learned weights:</p>
        <equation xml:id="S2.E5">
          <tags>
            <tag>(5)</tag>
            <tag role="autoref">Equation 5</tag>
            <tag role="refnum">5</tag>
          </tags>
          <Math mode="display" tex="\alpha_{i}=\mathbf{W}_{a}\mathbf{H}_{i}+\mathbf{b}_{a},\quad w_{i}=\text{%&#10;softmax}(\alpha)_{i}," text="formulae@(alpha _ i = W _ a * H _ i + b _ a, w _ i = [softmax] * alpha _ i)" xml:id="S2.E5.m1">
            <XMath>
              <XMDual>
                <XMRef idref="S2.E5.m1.2"/>
                <XMWrap>
                  <XMDual xml:id="S2.E5.m1.2">
                    <XMApp>
                      <XMTok meaning="formulae"/>
                      <XMRef idref="S2.E5.m1.2.1"/>
                      <XMRef idref="S2.E5.m1.2.2"/>
                    </XMApp>
                    <XMWrap>
                      <XMApp xml:id="S2.E5.m1.2.1">
                        <XMTok meaning="equals" role="RELOP">=</XMTok>
                        <XMApp>
                          <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                          <XMTok font="italic" name="alpha" role="UNKNOWN">α</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                        </XMApp>
                        <XMApp>
                          <XMTok meaning="plus" role="ADDOP">+</XMTok>
                          <XMApp>
                            <XMTok meaning="times" role="MULOP">⁢</XMTok>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                              <XMTok font="bold" role="UNKNOWN">W</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">a</XMTok>
                            </XMApp>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                              <XMTok font="bold" role="UNKNOWN">H</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                            </XMApp>
                          </XMApp>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                            <XMTok font="bold" role="UNKNOWN">b</XMTok>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">a</XMTok>
                          </XMApp>
                        </XMApp>
                      </XMApp>
                      <XMTok role="PUNCT" rpadding="10.0pt">,</XMTok>
                      <XMApp xml:id="S2.E5.m1.2.2">
                        <XMTok meaning="equals" role="RELOP">=</XMTok>
                        <XMApp>
                          <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                          <XMTok font="italic" role="UNKNOWN">w</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                        </XMApp>
                        <XMApp>
                          <XMTok meaning="times" role="MULOP">⁢</XMTok>
                          <XMText>softmax</XMText>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                            <XMDual>
                              <XMRef idref="S2.E5.m1.1"/>
                              <XMWrap>
                                <XMTok role="OPEN" stretchy="false">(</XMTok>
                                <XMTok font="italic" name="alpha" role="UNKNOWN" xml:id="S2.E5.m1.1">α</XMTok>
                                <XMTok role="CLOSE" stretchy="false">)</XMTok>
                              </XMWrap>
                            </XMDual>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                          </XMApp>
                        </XMApp>
                      </XMApp>
                    </XMWrap>
                  </XMDual>
                  <XMTok role="PUNCT">,</XMTok>
                </XMWrap>
              </XMDual>
            </XMath>
          </Math>
        </equation>
        <equation xml:id="S2.E6">
          <tags>
            <tag>(6)</tag>
            <tag role="autoref">Equation 6</tag>
            <tag role="refnum">6</tag>
          </tags>
          <Math mode="display" tex="\mathbf{h}=\sum\nolimits_{i=1}^{T}w_{i}\mathbf{H}_{i}," text="h = ((sum _ (i = 1)) ^ T)@(w _ i * H _ i)" xml:id="S2.E6.m1">
            <XMath>
              <XMDual>
                <XMRef idref="S2.E6.m1.1"/>
                <XMWrap>
                  <XMApp xml:id="S2.E6.m1.1">
                    <XMTok meaning="equals" role="RELOP">=</XMTok>
                    <XMTok font="bold" role="UNKNOWN">h</XMTok>
                    <XMApp>
                      <XMApp>
                        <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                        <XMApp>
                          <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                          <XMTok mathstyle="display" meaning="sum" role="SUMOP">∑</XMTok>
                          <XMApp>
                            <XMTok fontsize="70%" meaning="equals" role="RELOP">=</XMTok>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                            <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                          </XMApp>
                        </XMApp>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">T</XMTok>
                      </XMApp>
                      <XMApp>
                        <XMTok meaning="times" role="MULOP">⁢</XMTok>
                        <XMApp>
                          <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                          <XMTok font="italic" role="UNKNOWN">w</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                        </XMApp>
                        <XMApp>
                          <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                          <XMTok font="bold" role="UNKNOWN">H</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                        </XMApp>
                      </XMApp>
                    </XMApp>
                  </XMApp>
                  <XMTok role="PUNCT">,</XMTok>
                </XMWrap>
              </XMDual>
            </XMath>
          </Math>
        </equation>
        <p>where <Math mode="inline" tex="\textbf{W}_{a}" text="[W] _ a" xml:id="S2.SS4.p3.m7">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMText><text font="bold">W</text></XMText>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">a</XMTok>
              </XMApp>
            </XMath>
          </Math> and <Math mode="inline" tex="\textbf{b}_{a}" text="[b] _ a" xml:id="S2.SS4.p3.m8">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMText><text font="bold">b</text></XMText>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">a</XMTok>
              </XMApp>
            </XMath>
          </Math> are trainable parameters. The final classification is performed by a multilayer perceptron (MLP) on the aggregated representation <Math mode="inline" tex="\mathbf{h}" text="h" xml:id="S2.SS4.p3.m9">
            <XMath>
              <XMTok font="bold" role="UNKNOWN">h</XMTok>
            </XMath>
          </Math>.</p>
      </para>
    </subsection>
  </section>
  <section inlist="toc" xml:id="S3">
    <tags>
      <tag>3</tag>
      <tag role="autoref">section 3</tag>
      <tag role="refnum">3</tag>
      <tag role="typerefnum">§3</tag>
    </tags>
    <title><tag close=" ">3</tag>Experiments</title>
    <subsection inlist="toc" xml:id="S3.SS1">
      <tags>
        <tag>3.1</tag>
        <tag role="autoref">subsection 3.1</tag>
        <tag role="refnum">3.1</tag>
        <tag role="typerefnum">§3.1</tag>
      </tags>
      <title><tag close=" ">3.1</tag>Dataset and Preprocessing</title>
      <para xml:id="S3.SS1.p1">
        <p>We experiment with the ADReSSo Challenge dataset <cite class="ltx_citemacro_cite">[<bibref bibrefs="luz2021detecting" separator="," yyseparator=","/>]</cite>, which contains spontaneous speech recordings from 237 subjects (118 AD and 119 CN), describing the “Cookie Theft” picture. To ensure a fair comparison with existing works <cite class="ltx_citemacro_cite">[<bibref bibrefs="ortiz2025cognialign" separator="," yyseparator=","/>]</cite>, we adopt a five-fold cross-validation strategy. The dataset is randomly partitioned into five mutually exclusive folds, iteratively using one fold as the test set and the remaining four as the training set. The final reported performance metrics are the average results across all five folds. Note that only the training set is augmented. For both local and federated paradigms, the training set is further randomly divided into three parts to serve as the local data for three clients.</p>
      </para>
    </subsection>
    <subsection inlist="toc" xml:id="S3.SS2">
      <tags>
        <tag>3.2</tag>
        <tag role="autoref">subsection 3.2</tag>
        <tag role="refnum">3.2</tag>
        <tag role="typerefnum">§3.2</tag>
      </tags>
      <title><tag close=" ">3.2</tag>Experimental Setup</title>
      <para xml:id="S3.SS2.p1">
        <p>To comprehensively evaluate the efficacy of the FAL-AD framework, we designed multiple comparative experiments from the perspective of learning paradigms, systematically measuring the trade-off between data collaboration and privacy preservation. Under each learning paradigm, we evaluate model variants with three input modalities, i.e., audio, text, and multi-modal, so as to thoroughly validate the performance of different modalities under various paradigms.</p>
      </para>
      <para xml:id="S3.SS2.p2">
        <p>In machine learning, Centralized Learning (CL) represents the ideal scenario of data sharing, and serves as the reference for the performance upper bound. Local Learning (LL) simulates the complete data silo scenario, where each client trains independently using only its local data without any collaboration, representing the performance lower bound. Federated Learning (FL) enables collaboration among clients while keeping data localized. This constitutes a rigorous comparative system. Comparing CL with LL quantifies the performance degradation caused by data silos; comparing LL with FL validates the fundamental benefits of federated collaboration; comparing standard FL with our proposed FAL-AD verifies the added value of our data augmentation and adaptive federated strategies.</p>
      </para>
    </subsection>
    <subsection inlist="toc" xml:id="S3.SS3">
      <tags>
        <tag>3.3</tag>
        <tag role="autoref">subsection 3.3</tag>
        <tag role="refnum">3.3</tag>
        <tag role="typerefnum">§3.3</tag>
      </tags>
      <title><tag close=" ">3.3</tag>Implementation Details</title>
<!--  %**** icassp.tex Line 175 **** -->      <para xml:id="S3.SS3.p1">
        <p>All audio recordings are transcribed using the Whisper-large-v3 model <cite class="ltx_citemacro_cite">[<bibref bibrefs="radford2023robust" separator="," yyseparator=","/>]</cite> to obtain word-level timestamps. The audio is down-sampled to 16 kHz. Audio and text features are respectively extracted using the frozen pre-trained models facebook/wav2vec2-base-960h <cite class="ltx_citemacro_cite">[<bibref bibrefs="baevski2020wav2vec" separator="," yyseparator=","/>]</cite> and distilbert-base-uncased <cite class="ltx_citemacro_cite">[<bibref bibrefs="sanh2019distilbert" separator="," yyseparator=","/>]</cite>, with the maximum sequence length set to 200. The multimodal fusion module is configured with a hidden dimension of 768 and 12 attention heads. The classifier is a two-layer MLP. The hyper-parameter settings for CL and LL baselines are consistent with CogniAlign. Our FL methods are trained using the AdamW optimizer with a learning rate of 5e-5, weight decay of 0.01, and a fixed batch size of 64, in a simulation of 3 clients over 30 communication rounds. All hyper-parameters are selected through grid search to ensure optimal model performance.</p>
      </para>
    </subsection>
    <subsection inlist="toc" xml:id="S3.SS4">
      <tags>
        <tag>3.4</tag>
        <tag role="autoref">subsection 3.4</tag>
        <tag role="refnum">3.4</tag>
        <tag role="typerefnum">§3.4</tag>
      </tags>
      <title><tag close=" ">3.4</tag>Results and Analysis</title>
      <table inlist="lot" labels="LABEL:res1" placement="ht!" xml:id="S3.T1">
        <tags>
          <tag>Table 1</tag>
          <tag role="autoref">Table 1</tag>
          <tag role="refnum">1</tag>
          <tag role="typerefnum">Table 1</tag>
        </tags>
<!--  %“small -->        <toccaption class="ltx_centering"><tag close=" ">1</tag>Performance comparison (Accuracy and F1-Score in %) across different modalities and learning paradigms on the ADReSSo dataset. Results from previous centralized methods are compared against our implementations under Centralized Learning (CL), Local Learning (LL), and Federated Learning (FL) paradigms, both with and without data augmentation (Aug). The best value in each group is marked with an underline, and the global best value across all groups is marked with bold. Note that CL here is the strict reproduction version of CogniAlign using its source code.
</toccaption>
        <caption class="ltx_centering"><tag close=": ">Table 1</tag>Performance comparison (Accuracy and F1-Score in %) across different modalities and learning paradigms on the ADReSSo dataset. Results from previous centralized methods are compared against our implementations under Centralized Learning (CL), Local Learning (LL), and Federated Learning (FL) paradigms, both with and without data augmentation (Aug). The best value in each group is marked with an underline, and the global best value across all groups is marked with bold. Note that CL here is the strict reproduction version of CogniAlign using its source code.
</caption>
        <inline-block align="center" depth="1.9pt" height="154.2pt" width="1185.8pt" xscale="0.97" xtranslate="-18.3pt" yscale="0.97" ytranslate="-2.4pt">
          <tabular class="ltx_guessed_headers" vattach="middle">
            <thead>
              <tr>
                <td align="left" border="r tt" colspan="2" thead="column"><inline-block class="ltx_parbox" vattach="middle" width="42.7pt">
                    <p>Methods →</p>
                    <p>Modals ↓</p>
                  </inline-block></td>
                <td align="center" border="r tt" colspan="4" thead="column">Previous Work (Centralized Paradigm)</td>
                <td align="center" border="tt" colspan="6" thead="column">Our Work</td>
              </tr>
              <tr>
                <td border="r" colspan="2" thead="column"/>
                <td align="center" border="t" thead="column">C-Attn <cite class="ltx_citemacro_cite">[<bibref bibrefs="wang2021modular" separator="," yyseparator=","/>]</cite></td>
                <td align="center" border="t" thead="column">Ying <cite class="ltx_citemacro_cite">[<bibref bibrefs="ying2023multimodal" separator="," yyseparator=","/>]</cite></td>
                <td align="center" border="t" thead="column">Bang <cite class="ltx_citemacro_cite">[<bibref bibrefs="bang2024alzheimer" separator="," yyseparator=","/>]</cite></td>
                <td align="center" border="r t" thead="column">CogniAlign <cite class="ltx_citemacro_cite">[<bibref bibrefs="ortiz2025cognialign" separator="," yyseparator=","/>]</cite></td>
                <td align="center" border="t" thead="column">CL</td>
                <td align="center" border="r t" thead="column">CL+Aug</td>
                <td align="center" border="t" thead="column">LL</td>
                <td align="center" border="r t" thead="column">LL+Aug</td>
                <td align="center" border="t" thead="column">FL</td>
                <td align="center" border="t" thead="column">FL+Aug</td>
              </tr>
            </thead>
            <tbody>
              <tr>
                <td align="center" border="r t" rowspan="2">Audio</td>
                <td align="center" border="r t">Acc</td>
                <td align="center" border="t">75.30</td>
                <td align="center" border="t">71.20</td>
                <td align="center" border="t">69.01</td>
                <td align="center" border="r t"><text framed="underline">80.12</text></td>
                <td align="center" border="t">74.55</td>
                <td align="center" border="r t"><text framed="underline">79.39</text></td>
                <td align="center" border="t"><text framed="underline">68.69</text></td>
                <td align="center" border="r t">68.08</td>
                <td align="center" border="t">83.84</td>
                <td align="center" border="t"><text font="bold" framed="underline">85.05</text></td>
              </tr>
              <tr>
                <td align="center" border="r">F1</td>
                <td align="center">76.00</td>
                <td align="center">73.10</td>
                <td align="center">70.39</td>
                <td align="center" border="r"><text framed="underline">79.46</text></td>
                <td align="center">73.39</td>
                <td align="center" border="r"><text framed="underline">79.14</text></td>
                <td align="center">65.68</td>
                <td align="center" border="r"><text framed="underline">67.10</text></td>
                <td align="center">83.67</td>
                <td align="center"><text font="bold" framed="underline">84.64</text></td>
              </tr>
              <tr>
                <td align="center" border="r t" rowspan="2">Text</td>
                <td align="center" border="r t">Acc</td>
                <td align="center" border="t">73.50</td>
                <td align="center" border="t">78.90</td>
                <td align="center" border="t">83.10</td>
                <td align="center" border="r t"><text framed="underline">86.77</text></td>
                <td align="center" border="t">84.85</td>
                <td align="center" border="r t"><text framed="underline">86.67</text></td>
                <td align="center" border="t">78.39</td>
                <td align="center" border="r t"><text framed="underline">79.80</text></td>
                <td align="center" border="t">87.68</td>
                <td align="center" border="t"><text font="bold" framed="underline">90.30</text></td>
              </tr>
              <tr>
                <td align="center" border="r">F1</td>
                <td align="center">73.50</td>
                <td align="center">79.00</td>
                <td align="center">83.10</td>
                <td align="center" border="r"><text framed="underline">86.59</text></td>
                <td align="center">84.69</td>
                <td align="center" border="r"><text framed="underline">86.63</text></td>
                <td align="center">77.48</td>
                <td align="center" border="r"><text framed="underline">79.55</text></td>
                <td align="center">87.64</td>
                <td align="center"><text font="bold" framed="underline">90.28</text></td>
              </tr>
              <tr>
                <td align="center" border="bb r t" rowspan="2">Both</td>
                <td align="center" border="r t">Acc</td>
                <td align="center" border="t">77.20</td>
                <td align="center" border="t">83.70</td>
                <td align="center" border="t">87.32</td>
                <td align="center" border="r t"><text framed="underline">90.36</text></td>
                <td align="center" border="t">86.06</td>
                <td align="center" border="r t"><text framed="underline">86.67</text></td>
                <td align="center" border="t">78.59</td>
                <td align="center" border="r t"><text framed="underline">80.61</text></td>
                <td align="center" border="t">89.70</td>
                <td align="center" border="t"><text font="bold" framed="underline">91.52</text></td>
              </tr>
              <tr>
                <td align="center" border="bb r">F1</td>
                <td align="center" border="bb">77.60</td>
                <td align="center" border="bb">83.30</td>
                <td align="center" border="bb">87.25</td>
                <td align="center" border="bb r"><text framed="underline">90.11</text></td>
                <td align="center" border="bb">85.89</td>
                <td align="center" border="bb r"><text framed="underline">86.64</text></td>
                <td align="center" border="bb">77.16</td>
                <td align="center" border="bb r"><text framed="underline">80.35</text></td>
                <td align="center" border="bb">89.65</td>
                <td align="center" border="bb"><text font="bold" framed="underline">91.45</text></td>
              </tr>
            </tbody>
          </tabular>
        </inline-block>
<!--  %**** icassp.tex Line 200 **** -->      </table>
      <table inlist="lot" labels="LABEL:res2" placement="ht!" xml:id="S3.T2">
        <tags>
          <tag>Table 2</tag>
          <tag role="autoref">Table 2</tag>
          <tag role="refnum">2</tag>
          <tag role="typerefnum">Table 2</tag>
        </tags>
        <toccaption class="ltx_centering"><tag close=" ">2</tag>Performance comparison (Accuracy and F1-Score in %) of federated learning algorithms with different model selection strategies: standard (sFL), personalized (pFL), and adaptive (aFL) federated learning. The best performance across selection strategies is bold, while the best performance across FL algorithms is underlined.</toccaption>
        <caption class="ltx_centering"><tag close=": ">Table 2</tag>Performance comparison (Accuracy and F1-Score in %) of federated learning algorithms with different model selection strategies: standard (sFL), personalized (pFL), and adaptive (aFL) federated learning. The best performance across selection strategies is bold, while the best performance across FL algorithms is underlined.</caption>
        <inline-block align="center" depth="1.9pt" height="119.7pt" width="292.1pt" xscale="0.95" xtranslate="-7.7pt" yscale="0.95" ytranslate="-3.2pt">
          <tabular class="ltx_guessed_headers" vattach="middle">
            <tbody>
              <tr>
                <td align="center" border="r tt" rowspan="2" thead="row">Algorithms</td>
                <td align="center" border="r tt" colspan="2">sFL</td>
                <td align="center" border="r tt" colspan="2">pFL</td>
                <td align="center" border="tt" colspan="2">aFL</td>
              </tr>
              <tr>
                <td align="center" border="t">Acc</td>
                <td align="center" border="r t">F1</td>
                <td align="center" border="t">Acc</td>
                <td align="center" border="r t">F1</td>
                <td align="center" border="t">Acc</td>
                <td align="center" border="t">F1</td>
              </tr>
              <tr>
                <td align="center" border="r t" thead="row">FedAdam</td>
                <td align="center" border="t"><text framed="underline">90.91</text></td>
                <td align="center" border="r t"><text framed="underline">90.88</text></td>
                <td align="center" border="t">90.00</td>
                <td align="center" border="r t">89.97</td>
                <td align="center" border="t"><text font="bold">90.91</text></td>
                <td align="center" border="t"><text font="bold">90.88</text></td>
              </tr>
              <tr>
                <td align="center" border="r t" thead="row">FedAdagrad</td>
                <td align="center" border="t">89.70</td>
                <td align="center" border="r t">89.58</td>
                <td align="center" border="t">88.79</td>
                <td align="center" border="r t">88.74</td>
                <td align="center" border="t"><text font="bold">90.00</text></td>
                <td align="center" border="t"><text font="bold">89.95</text></td>
              </tr>
              <tr>
                <td align="center" border="r t" thead="row">FedYogi</td>
                <td align="center" border="t">87.27</td>
                <td align="center" border="r t">87.26</td>
                <td align="center" border="t">88.48</td>
                <td align="center" border="r t">88.40</td>
                <td align="center" border="t"><text font="bold">88.79</text></td>
                <td align="center" border="t"><text font="bold">88.71</text></td>
              </tr>
              <tr>
                <td align="center" border="r t" thead="row">FedProx</td>
                <td align="center" border="t">89.70</td>
                <td align="center" border="r t">89.63</td>
                <td align="center" border="t">90.60</td>
                <td align="center" border="r t">90.52</td>
                <td align="center" border="t"><text font="bold">90.91</text></td>
                <td align="center" border="t"><text font="bold">90.84</text></td>
              </tr>
              <tr>
                <td align="center" border="bb r t" thead="row">FedAvg</td>
                <td align="center" border="bb t"><text framed="underline">90.91</text></td>
                <td align="center" border="bb r t">90.85</td>
                <td align="center" border="bb t"><text framed="underline">90.81</text></td>
                <td align="center" border="bb r t"><text framed="underline">90.75</text></td>
                <td align="center" border="bb t"><text font="bold" framed="underline">91.52</text></td>
                <td align="center" border="bb t"><text font="bold" framed="underline">91.45</text></td>
              </tr>
            </tbody>
          </tabular>
        </inline-block>
      </table>
      <para xml:id="S3.SS4.p1">
        <p>Overall, our proposed FAL-AD framework achieves state-of-the-art performance on the ADReSSo dataset, as shown in Table <ref labelref="LABEL:res1"/>, surpassing all existing centralized benchmarks. The experimental results systematically validate its effectiveness across three key aspects: first, the comparison between CL and LL quantifies the severe performance degradation caused by data isolation, highlighting the necessity of collaborative paradigms. Second, FL effectively mitigates this issue, achieving a performance gain of over 10 percentage points compared to isolated training (LL), demonstrating its capability to break down data silos while preserving privacy. Finally, and most significantly, our FL+Aug approach not only matches but exceeds the performance of its centralized counterpart (CL+Aug), achieving a new multi-modal accuracy benchmark of 91.52% and outperforming previous state-of-the-art methods including CogniAlign (90.36%). This paradigm breakthrough reveals a synergistic effect: federated aggregation acts as a regularizer to reduce overfitting, while data augmentation enhances generalization through expanded diversity.</p>
      </para>
      <para xml:id="S3.SS4.p2">
        <p><text font="bold">Analysis on Data Augmentation:</text> We further analyze the utility of the proposed voice conversion strategy under different learning paradigms. This strategy brings consistent performance improvements in almost all settings. The benefit is most prominent under the Federated Learning (FL) paradigm, boosting multimodal accuracy from 89.70% to 91.52%. This indicates that data augmentation provides richer and more diverse local data ‘fuel’ for federated learning, synergizing with the regularization effect of federated aggregation to push model performance to new heights. The gains under the Local Learning (LL) paradigm are also significant (multimodal accuracy increases from 78.59% to 80.61%), confirming its value as an effective regularizer in extremely data-scarce environments. Notably, the improvement in the CL paradigm is relatively limited (+0.61 percentage points), consistent with our previous finding that models trained on sufficient data are more susceptible to overfitting to the subtle biases introduced by augmented data. This contrast, in turn, demonstrates that our federated learning framework is a superior environment for deploying such data augmentation strategies.</p>
      </para>
<!--  %**** icassp.tex Line 225 **** -->      <para xml:id="S3.SS4.p3">
        <p><text font="bold">Analysis on Federated Learning:</text> The performance of our FAL-AD surpasses the strong centralized baselines. This may be because, while centralized training has access to all data, it can easily lead the model to overfit to specific patterns in the training set. The inherent “local update &amp; global average” iterative mechanism in the FedAvg algorithm effectively constrains the model optimization path, forcing it to converge to a flatter and more robust optimum for all data distributions, thereby achieving superior generalization performance. In addition, the adaptive model selection strategy ensures each client finally deploys its historical best model, maximizing the potential of models from different communication rounds. As shown in Table <ref labelref="LABEL:res2"/>, while different aggregators exhibit varying affinities for non-independent and identically distributed (non-IID) data, our proposed adaptive federated strategy (aFL) demonstrates significant and consistent advantages. It achieves optimal performance under most configurations, validating its effectiveness as a lightweight, hyperparameter-free personalization solution. Notably, traditional local fine-tuning strategies (pFL) did not bring stable gains, suggesting that mandatory local fine-tuning in heterogeneous data environments may lead the model to deviate from the optimal solution. This proves that aFL possesses strong fault tolerance and stability. The dynamic selection mechanism retains the deployment simplicity of standard FL (sFL) while achieving personalization benefits close to pFL.</p>
      </para>
<!--  %“vfill“pagebreak -->    </subsection>
  </section>
  <section inlist="toc" xml:id="S4">
    <tags>
      <tag>4</tag>
      <tag role="autoref">section 4</tag>
      <tag role="refnum">4</tag>
      <tag role="typerefnum">§4</tag>
    </tags>
    <title><tag close=" ">4</tag>Conclusion</title>
    <para xml:id="S4.p1">
      <p>This paper presents FAL-AD, a novel framework that tackles the data efficiency dilemma in Alzheimer’s disease detection through federated learning and voice conversion-based augmentation. Our approach systematically addresses data scarcity and privacy constraints by enhancing absolute data efficiency through cross-category voice recombination, improving collaborative efficiency via adaptive federated learning with personalized model selection, and optimizing representational efficiency using attentive cross-modal fusion. Experimental results demonstrate that FAL-AD not only overcomes the limitations of data silos but achieves state-of-the-art performance with 91.52% accuracy, surpassing existing centralized baselines while maintaining privacy preservation. In the future, we plan to explore more advanced generative augmentation techniques and sophisticated personalization methods for non-IID data scenarios, and to validate the framework’s generalization across multi-institutional datasets.</p>
    </para>
    <pagination role="newpage"/>
<!--  %References should be produced using the bibtex program from suitable 
     %BiBTeX files (here: strings, refs, manuals). The IEEEbib.bst bibliography
     %style file from IEEE produces unsorted bibliography list.
     %__-->  </section>
  <section inlist="toc" xml:id="S5">
    <tags>
      <tag>5</tag>
      <tag role="autoref">section 5</tag>
      <tag role="refnum">5</tag>
      <tag role="typerefnum">§5</tag>
    </tags>
    <title><tag close=" ">5</tag>Acknowledgments</title>
    <para xml:id="S5.p1">
      <p>This work was supported by the National High-end Talent Support Fund (No. E43301), High-end Talent Matching Fund of the Chinese Academy of Sciences (No. E55304), Guangdong Province Matching Fund for National High-end Talents (No. E47611), and National Natural Science Foundation of China (No. 62276185 and No. U23B2053).</p>
    </para>
  </section>
  <bibliography xml:id="bib">
    <title>References</title>
    <biblist>
      <bibitem key="de2020artificial" xml:id="bib.bib1">
        <tags>
          <tag>[1]</tag>
          <tag role="autoref">1</tag>
          <tag role="refnum">1</tag>
        </tags>
        <bibblock>
Sofia De la Fuente Garcia, Craig W Ritchie, and Saturnino Luz,
</bibblock>
        <bibblock>“Artificial intelligence, speech, and language processing approaches to monitoring alzheimer’s disease: a systematic review,”
</bibblock>
        <bibblock><text font="italic">Journal of Alzheimer’s Disease</text>, vol. 78, no. 4, pp. 1547–1574, 2020.
</bibblock>
      </bibitem>
      <bibitem key="yang2022deep" xml:id="bib.bib2">
        <tags>
          <tag>[2]</tag>
          <tag role="autoref">2</tag>
          <tag role="refnum">2</tag>
        </tags>
        <bibblock>
Qin Yang, Xin Li, Xinyun Ding, Feiyang Xu, and Zhenhua Ling,
</bibblock>
        <bibblock>“Deep learning-based speech analysis for alzheimer’s disease detection: a literature review,”
</bibblock>
        <bibblock><text font="italic">Alzheimer’s Research &amp; Therapy</text>, vol. 14, no. 1, pp. 186, 2022.
</bibblock>
      </bibitem>
      <bibitem key="luz2021alzheimer" xml:id="bib.bib3">
        <tags>
          <tag>[3]</tag>
          <tag role="autoref">3</tag>
          <tag role="refnum">3</tag>
        </tags>
        <bibblock>
Saturnino Luz, Fasih Haider, Sofia de la Fuente Garcia, Davida Fromm, and Brian MacWhinney,
</bibblock>
        <bibblock>“Alzheimer’s dementia recognition through spontaneous speech,” 2021.
</bibblock>
      </bibitem>
      <bibitem key="teo2024federated" xml:id="bib.bib4">
        <tags>
          <tag>[4]</tag>
          <tag role="autoref">4</tag>
          <tag role="refnum">4</tag>
        </tags>
        <bibblock>
Zhen Ling Teo, Liyuan Jin, Nan Liu, Siqi Li, Di Miao, Xiaoman Zhang, Wei Yan Ng, Ting Fang Tan, Deborah Meixuan Lee, Kai Jie Chua, et al.,
</bibblock>
        <bibblock>“Federated machine learning in healthcare: A systematic review on clinical applications and technical architecture,”
</bibblock>
        <bibblock><text font="italic">Cell Reports Medicine</text>, vol. 5, no. 2, 2024.
</bibblock>
      </bibitem>
      <bibitem key="chen2021automatic" xml:id="bib.bib5">
        <tags>
          <tag>[5]</tag>
          <tag role="autoref">5</tag>
          <tag role="refnum">5</tag>
        </tags>
        <bibblock>
Jun Chen, Jieping Ye, Fengyi Tang, and Jiayu Zhou,
</bibblock>
        <bibblock>“Automatic detection of alzheimer’s disease using spontaneous speech only,”
<!--  %**** icassp.bbl Line 25 **** --></bibblock>
        <bibblock>in <text font="italic">Interspeech</text>, 2021, vol. 2021, p. 3830.
</bibblock>
      </bibitem>
      <bibitem key="braun2024infusing" xml:id="bib.bib6">
        <tags>
          <tag>[6]</tag>
          <tag role="autoref">6</tag>
          <tag role="refnum">6</tag>
        </tags>
        <bibblock>
Franziska Braun, Sebastian P Bayerl, Florian Hoenig, Hartmut Lehfeld, Thomas Hillemacher, Tobias Bocklet, Korbinian Riedhammer, et al.,
</bibblock>
        <bibblock>“Infusing acoustic pause context into text-based dementia assessment,”
</bibblock>
        <bibblock>in <text font="italic">Interspeech</text>, 2024, pp. 1980–1984.
</bibblock>
      </bibitem>
      <bibitem key="cai2023exploring" xml:id="bib.bib7">
        <tags>
          <tag>[7]</tag>
          <tag role="autoref">7</tag>
          <tag role="refnum">7</tag>
        </tags>
        <bibblock>
Hongmin Cai, Xiaoke Huang, Zhengliang Liu, Wenxiong Liao, Haixing Dai, Zihao Wu, Dajiang Zhu, Hui Ren, Quanzheng Li, Tianming Liu, et al.,
</bibblock>
        <bibblock>“Exploring multimodal approaches for alzheimer’s disease detection using patient speech transcript and audio data,”
</bibblock>
        <bibblock><text font="italic">arXiv preprint arXiv:2307.02514</text>, 2023.
</bibblock>
      </bibitem>
      <bibitem key="chatzianastasis2023neural" xml:id="bib.bib8">
        <tags>
          <tag>[8]</tag>
          <tag role="autoref">8</tag>
          <tag role="refnum">8</tag>
        </tags>
        <bibblock>
Michail Chatzianastasis, Loukas Ilias, Dimitris Askounis, and Michalis Vazirgiannis,
</bibblock>
        <bibblock>“Neural architecture search with multimodal fusion methods for diagnosing dementia,”
</bibblock>
        <bibblock>in <text font="italic">ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</text>. IEEE, 2023, pp. 1–5.
</bibblock>
      </bibitem>
      <bibitem key="meerza2022fair" xml:id="bib.bib9">
        <tags>
          <tag>[9]</tag>
          <tag role="autoref">9</tag>
          <tag role="refnum">9</tag>
        </tags>
        <bibblock>
Syed Irfan Ali Meerza, Zhuohang Li, Luyang Liu, Jiaxin Zhang, and Jian Liu,
</bibblock>
        <bibblock>“Fair and privacy-preserving alzheimer’s disease diagnosis based on spontaneous speech analysis via federated learning,”
</bibblock>
        <bibblock>in <text font="italic">2022 44th Annual International Conference of the IEEE Engineering in Medicine &amp; Biology Society (EMBC)</text>. IEEE, 2022, pp. 1362–1365.
</bibblock>
      </bibitem>
      <bibitem key="hsu2024cluster" xml:id="bib.bib10">
        <tags>
          <tag>[10]</tag>
          <tag role="autoref">10</tag>
          <tag role="refnum">10</tag>
        </tags>
        <bibblock>
Wei-Tung Hsu, Chin-Po Chen, Yun-Shao Lin, and Chi-Chun Lee,
</bibblock>
        <bibblock>“A cluster-based personalized federated learning strategy for end-to-end asr of dementia patients,”
<!--  %**** icassp.bbl Line 50 **** --></bibblock>
        <bibblock>in <text font="italic">Proc Interspeech</text>, 2024, vol. 2024, pp. 2450–2454.
</bibblock>
      </bibitem>
      <bibitem key="ouyang2023design" xml:id="bib.bib11">
        <tags>
          <tag>[11]</tag>
          <tag role="autoref">11</tag>
          <tag role="refnum">11</tag>
        </tags>
        <bibblock>
Xiaomin Ouyang,
</bibblock>
        <bibblock>“Design and deployment of multi-modal federated learning systems for alzheimer’s disease monitoring,”
</bibblock>
        <bibblock>in <text font="italic">Proceedings of the 21st Annual International Conference on Mobile Systems, Applications and Services</text>, 2023, pp. 612–614.
</bibblock>
      </bibitem>
      <bibitem key="kalabakov2024comparative" xml:id="bib.bib12">
        <tags>
          <tag>[12]</tag>
          <tag role="autoref">12</tag>
          <tag role="refnum">12</tag>
        </tags>
        <bibblock>
Stefan Kalabakov, Monica Gonzalez-Machorro, Florian Eyben, Björn W Schuller, and Bert Arnrich,
</bibblock>
        <bibblock>“A comparative analysis of federated learning for speech-based cognitive decline detection,”
</bibblock>
        <bibblock>in <text font="italic">Proc. Interspeech 2024</text>, 2024, pp. 2455–2459.
</bibblock>
      </bibitem>
      <bibitem key="du2024cosyvoice" xml:id="bib.bib13">
        <tags>
          <tag>[13]</tag>
          <tag role="autoref">13</tag>
          <tag role="refnum">13</tag>
        </tags>
        <bibblock>
Zhihao Du, Qian Chen, Shiliang Zhang, Kai Hu, Heng Lu, Yexin Yang, Hangrui Hu, Siqi Zheng, Yue Gu, Ziyang Ma, et al.,
</bibblock>
        <bibblock>“Cosyvoice: A scalable multilingual zero-shot text-to-speech synthesizer based on supervised semantic tokens,”
</bibblock>
        <bibblock><text font="italic">CoRR</text>, 2024.
</bibblock>
      </bibitem>
      <bibitem key="mcmahan2017communication" xml:id="bib.bib14">
        <tags>
          <tag>[14]</tag>
          <tag role="autoref">14</tag>
          <tag role="refnum">14</tag>
        </tags>
        <bibblock>
Brendan McMahan, Eider Moore, Daniel Ramage, Seth Hampson, and Blaise Aguera y Arcas,
</bibblock>
        <bibblock>“Communication-efficient learning of deep networks from decentralized data,”
</bibblock>
        <bibblock>in <text font="italic">Artificial intelligence and statistics</text>. PMLR, 2017, pp. 1273–1282.
</bibblock>
      </bibitem>
      <bibitem key="ortiz2025cognialign" xml:id="bib.bib15">
        <tags>
          <tag>[15]</tag>
          <tag role="autoref">15</tag>
          <tag role="refnum">15</tag>
        </tags>
        <bibblock>
David Ortiz-Perez, Manuel Benavent-Lledo, Javier Rodriguez-Juan, Jose Garcia-Rodriguez, and David Tomás,
</bibblock>
        <bibblock>“Cognialign: Word-level multimodal speech alignment with gated cross-attention for alzheimer’s detection,”
<!--  %**** icassp.bbl Line 75 **** --></bibblock>
        <bibblock><text font="italic">Knowledge-Based Systems</text>, vol. 329, pp. 114264, 2025.
</bibblock>
      </bibitem>
      <bibitem key="radford2023robust" xml:id="bib.bib16">
        <tags>
          <tag>[16]</tag>
          <tag role="autoref">16</tag>
          <tag role="refnum">16</tag>
        </tags>
        <bibblock>
Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever,
</bibblock>
        <bibblock>“Robust speech recognition via large-scale weak supervision,”
</bibblock>
        <bibblock>in <text font="italic">International conference on machine learning</text>. PMLR, 2023, pp. 28492–28518.
</bibblock>
      </bibitem>
      <bibitem key="yuan2020disfluencies" xml:id="bib.bib17">
        <tags>
          <tag>[17]</tag>
          <tag role="autoref">17</tag>
          <tag role="refnum">17</tag>
        </tags>
        <bibblock>
Jiahong Yuan, Yuchen Bian, Xingyu Cai, Jiaji Huang, Zheng Ye, and Kenneth Church,
</bibblock>
        <bibblock>“Disfluencies and fine-tuning pre-trained language models for detection of alzheimer’s disease,”
</bibblock>
        <bibblock>in <text font="italic">Interspeech</text>, 2020, vol. 2020, pp. 2162–6.
</bibblock>
      </bibitem>
      <bibitem key="vaswani2017attention" xml:id="bib.bib18">
        <tags>
          <tag>[18]</tag>
          <tag role="autoref">18</tag>
          <tag role="refnum">18</tag>
        </tags>
        <bibblock>
Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Łukasz Kaiser, and Illia Polosukhin,
</bibblock>
        <bibblock>“Attention is all you need,”
</bibblock>
        <bibblock><text font="italic">Advances in neural information processing systems</text>, vol. 30, 2017.
</bibblock>
      </bibitem>
      <bibitem key="luz2021detecting" xml:id="bib.bib19">
        <tags>
          <tag>[19]</tag>
          <tag role="autoref">19</tag>
          <tag role="refnum">19</tag>
        </tags>
        <bibblock>
Saturnino Luz, Fasih Haider, Sofia de la Fuente, Davida Fromm, and Brian MacWhinney,
</bibblock>
        <bibblock>“Detecting cognitive decline using speech only: The adresso challenge,”
</bibblock>
        <bibblock>in <text font="italic">INTERSPEECH 2021</text>. ISCA, 2021.
</bibblock>
      </bibitem>
      <bibitem key="baevski2020wav2vec" xml:id="bib.bib20">
        <tags>
          <tag>[20]</tag>
          <tag role="autoref">20</tag>
          <tag role="refnum">20</tag>
        </tags>
        <bibblock>
Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli,
</bibblock>
        <bibblock>“wav2vec 2.0: A framework for self-supervised learning of speech representations,”
<!--  %**** icassp.bbl Line 100 **** --></bibblock>
        <bibblock><text font="italic">Advances in neural information processing systems</text>, vol. 33, pp. 12449–12460, 2020.
</bibblock>
      </bibitem>
      <bibitem key="sanh2019distilbert" xml:id="bib.bib21">
        <tags>
          <tag>[21]</tag>
          <tag role="autoref">21</tag>
          <tag role="refnum">21</tag>
        </tags>
        <bibblock>
V Sanh,
</bibblock>
        <bibblock>“Distilbert, a distilled version of bert: smaller, faster, cheaper and lighter,”
</bibblock>
        <bibblock>in <text font="italic">Proceedings of Thirty-third Conference on Neural Information Processing Systems (NIPS2019)</text>, 2019.
</bibblock>
      </bibitem>
      <bibitem key="wang2021modular" xml:id="bib.bib22">
        <tags>
          <tag>[22]</tag>
          <tag role="autoref">22</tag>
          <tag role="refnum">22</tag>
        </tags>
        <bibblock>
Ning Wang, Yupeng Cao, Shuai Hao, Zongru Shao, and KP Subbalakshmi,
</bibblock>
        <bibblock>“Modular multi-modal attention network for alzheimer’s disease detection using patient audio and language data,”
</bibblock>
        <bibblock>in <text font="italic">Interspeech</text>, 2021, vol. 2021, pp. 3835–3839.
</bibblock>
      </bibitem>
      <bibitem key="ying2023multimodal" xml:id="bib.bib23">
        <tags>
          <tag>[23]</tag>
          <tag role="autoref">23</tag>
          <tag role="refnum">23</tag>
        </tags>
        <bibblock>
Yangwei Ying, Tao Yang, and Hong Zhou,
</bibblock>
        <bibblock>“Multimodal fusion for alzheimer’s disease recognition,”
</bibblock>
        <bibblock><text font="italic">Applied Intelligence</text>, vol. 53, no. 12, pp. 16029–16040, 2023.
</bibblock>
      </bibitem>
      <bibitem key="bang2024alzheimer" xml:id="bib.bib24">
        <tags>
          <tag>[24]</tag>
          <tag role="autoref">24</tag>
          <tag role="refnum">24</tag>
        </tags>
        <bibblock>
Jeong-Uk Bang, Seung-Hoon Han, and Byung-Ok Kang,
</bibblock>
        <bibblock>“Alzheimer’s disease recognition from spontaneous speech using large language models,”
</bibblock>
        <bibblock><text font="italic">ETRI Journal</text>, vol. 46, no. 1, pp. 96–105, 2024.
</bibblock>
      </bibitem>
    </biblist>
  </bibliography>
</document>
