<?xml version="1.0" encoding="UTF-8"?>
<?latexml searchpaths="/home/japhy/scienceReplication.artiswrong.com/paper_files/arxiv/2308.10441/latex_extracted"?>
<?latexml class="article" options="10pt,twocolumn,letterpaper"?>
<?latexml package="config/iccv"?>
<?latexml package="config/iccv23"?>
<!--  %Spacing --><!--  %\frenchspacing --><!--  %\medmuskip=2mu   % reduce spacing around binary operators --><!--  %\thickmuskip=3mu % reduce spacing around relational operators --><!--  %\setlength{\abovedisplayskip}{3pt} --><!--  %\setlength{\belowdisplayskip}{3pt} --><!--  %\setlength{\abovecaptionskip}{3pt} --><!--  %\setlength{\belowcaptionskip}{3pt} --><!--  %Acronym --><?latexml RelaxNGSchema="LaTeXML"?>
<document xmlns="http://dlmf.nist.gov/LaTeXML" class="ltx_authors_1line">
  <resource src="LaTeXML.css" type="text/css"/>
  <resource src="ltx-article.css" type="text/css"/>
  <title><text font="typewriter">X-VoE</text>: Measuring eXplanatory Violation of Expectation in Physical Events</title>
  <creator role="author">
    <personname>Bo Dai<Math mode="inline" tex="{}^{1,2}" text="^list@(1, 2)" xml:id="m1">
        <XMath>
          <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
            <XMDual>
              <XMApp>
                <XMTok meaning="list"/>
                <XMRef idref="m1.1"/>
                <XMRef idref="m1.2"/>
              </XMApp>
              <XMWrap>
                <XMTok fontsize="70%" meaning="1" role="NUMBER" xml:id="m1.1">1</XMTok>
                <XMTok fontsize="70%" role="PUNCT">,</XMTok>
                <XMTok fontsize="70%" meaning="2" role="NUMBER" xml:id="m1.2">2</XMTok>
              </XMWrap>
            </XMDual>
          </XMApp>
        </XMath>
      </Math>, Linge Wang<Math mode="inline" tex="{}^{3}" text="^3" xml:id="m2">
        <XMath>
          <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
            <XMTok fontsize="70%" meaning="3" role="NUMBER">3</XMTok>
          </XMApp>
        </XMath>
      </Math>, Baoxiong Jia<Math mode="inline" tex="{}^{2}" text="^2" xml:id="m3">
        <XMath>
          <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
            <XMTok fontsize="70%" meaning="2" role="NUMBER">2</XMTok>
          </XMApp>
        </XMath>
      </Math>, Zeyu Zhang<Math mode="inline" tex="{}^{2}" text="^2" xml:id="m4">
        <XMath>
          <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
            <XMTok fontsize="70%" meaning="2" role="NUMBER">2</XMTok>
          </XMApp>
        </XMath>
      </Math>, Song-Chun Zhu<Math mode="inline" tex="{}^{1,2,3}" text="^list@(1, 2, 3)" xml:id="m5">
        <XMath>
          <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
            <XMDual>
              <XMApp>
                <XMTok meaning="list"/>
                <XMRef idref="m5.1"/>
                <XMRef idref="m5.2"/>
                <XMRef idref="m5.3"/>
              </XMApp>
              <XMWrap>
                <XMTok fontsize="70%" meaning="1" role="NUMBER" xml:id="m5.1">1</XMTok>
                <XMTok fontsize="70%" role="PUNCT">,</XMTok>
                <XMTok fontsize="70%" meaning="2" role="NUMBER" xml:id="m5.2">2</XMTok>
                <XMTok fontsize="70%" role="PUNCT">,</XMTok>
                <XMTok fontsize="70%" meaning="3" role="NUMBER" xml:id="m5.3">3</XMTok>
              </XMWrap>
            </XMDual>
          </XMApp>
        </XMath>
      </Math>, Chi Zhang<Math mode="inline" tex="{}^{2,\textrm{\Letter}}" text="^list@(2, [\Letter])" xml:id="m6">
        <XMath>
          <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
            <XMDual>
              <XMApp>
                <XMTok meaning="list"/>
                <XMRef idref="m6.1"/>
                <XMRef idref="m6.2"/>
              </XMApp>
              <XMWrap>
                <XMTok fontsize="70%" meaning="2" role="NUMBER" xml:id="m6.1">2</XMTok>
                <XMTok fontsize="70%" role="PUNCT">,</XMTok>
                <XMText xml:id="m6.2">✉</XMText>
              </XMWrap>
            </XMDual>
          </XMApp>
        </XMath>
      </Math>, Yixin Zhu<Math mode="inline" tex="{}^{4,\textrm{\Letter}}" text="^list@(4, [\Letter])" xml:id="m7">
        <XMath>
          <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
            <XMDual>
              <XMApp>
                <XMTok meaning="list"/>
                <XMRef idref="m7.1"/>
                <XMRef idref="m7.2"/>
              </XMApp>
              <XMWrap>
                <XMTok fontsize="70%" meaning="4" role="NUMBER" xml:id="m7.1">4</XMTok>
                <XMTok fontsize="70%" role="PUNCT">,</XMTok>
                <XMText xml:id="m7.2">✉</XMText>
              </XMWrap>
            </XMDual>
          </XMApp>
        </XMath>
      </Math>
<break/><tabular vattach="middle">
        <tr>
          <td align="right"><ref class="ltx_url" font="typewriter" fontsize="90%" href="https://github.com/daibopku/X-VoE">https://github.com/daibopku/X-VoE</ref></td>
          <td align="left"><text class="ltx_markedasmath" fontsize="90%">✉</text><text fontsize="90%">  <text font="typewriter">zhangchi@bigai.ai, yixin.zhu@pku.edu.cn</text></text></td>
        </tr>
        <tr>
          <td align="right"><Math mode="inline" tex="{}^{1}" text="^1" xml:id="m9">
              <XMath>
                <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
                  <XMTok fontsize="63%" meaning="1" role="NUMBER">1</XMTok>
                </XMApp>
              </XMath>
            </Math><text fontsize="90%"> School of Intelligence Science and Technology, Peking University</text></td>
          <td align="left"><Math mode="inline" tex="{}^{2}" text="^2" xml:id="m10">
              <XMath>
                <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
                  <XMTok fontsize="63%" meaning="2" role="NUMBER">2</XMTok>
                </XMApp>
              </XMath>
            </Math><text fontsize="90%"> Beijing Institute for General Artificial Intelligence</text></td>
        </tr>
        <tr>
          <td align="right"><Math mode="inline" tex="{}^{3}" text="^3" xml:id="m11">
              <XMath>
                <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
                  <XMTok fontsize="63%" meaning="3" role="NUMBER">3</XMTok>
                </XMApp>
              </XMath>
            </Math><text fontsize="90%"> Department of Automation, Tsinghua University</text></td>
          <td align="left"><Math mode="inline" tex="{}^{4}" text="^4" xml:id="m12">
              <XMath>
                <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
                  <XMTok fontsize="63%" meaning="4" role="NUMBER">4</XMTok>
                </XMApp>
              </XMath>
            </Math><text fontsize="90%"> Institute for Artificial Intelligence, Peking University</text></td>
        </tr>
      </tabular>
</personname>
  </creator>
  <abstract name="Abstract">
    <p>Intuitive physics is pivotal for human understanding of the physical world, enabling prediction and interpretation of events even in infancy. Nonetheless, replicating this level of intuitive physics in artificial intelligence (AI) remains a formidable challenge. This study introduces <text font="typewriter">X-VoE</text>, a comprehensive benchmark dataset, to assess AI agents’ grasp of intuitive physics. Built on the developmental psychology-rooted <glossaryref inlist="acronym" key="voe" show="long"/> (<glossaryref inlist="acronym" key="voe" show="short"/>) paradigm, <text font="typewriter">X-VoE</text> establishes a higher bar for the explanatory capacities of intuitive physics models. Each <glossaryref inlist="acronym" key="voe" show="short"/> scenario within <text font="typewriter">X-VoE</text> encompasses three distinct settings, probing models’ comprehension of events and their underlying explanations. Beyond model evaluation, we present an explanation-based learning system that captures physics dynamics and infers occluded object states solely from visual sequences, without explicit occlusion labels. Experimental outcomes highlight our model’s alignment with human commonsense when tested against <text font="typewriter">X-VoE</text>. A remarkable feature is our model’s ability to visually expound <glossaryref inlist="acronym" key="voe" show="short"/> events by reconstructing concealed scenes. Finally, we discuss the implications of our findings and outline future research directions. Through <text font="typewriter">X-VoE</text>, we catalyze the advancement of AI endowed with human-like intuitive physics capabilities.</p>
  </abstract>
  <glossarydefinition inlist="acronym" key="voe">
    <glossaryphrase key="VoE" role="label">VoE</glossaryphrase>
    <glossaryphrase key="VoE" role="short">VoE</glossaryphrase>
    <glossaryphrase key="Violation of Expectation" role="long">Violation of Expectation</glossaryphrase>
    <glossaryphrase key="Violation of Expectation" role="definition">Violation of Expectation</glossaryphrase>
  </glossarydefinition>
  <glossarydefinition inlist="acronym" key="method">
    <glossaryphrase key="XPL" role="label"><text font="typewriter">XPL</text></glossaryphrase>
    <glossaryphrase key="XPL" role="short"><text font="typewriter">XPL</text></glossaryphrase>
    <glossaryphrase key="eXplanation-based Physics Learner" role="long">eXplanation-based Physics Learner</glossaryphrase>
    <glossaryphrase key="eXplanation-based Physics Learner" role="definition">eXplanation-based Physics Learner</glossaryphrase>
  </glossarydefinition>
  <!--  %\iccvfinalcopy (ICCV style switch; the macro was undefined during conversion and has no rendered content) -->
<!--  %**** main.tex Line 25 **** -->  <figure inlist="lof" labels="LABEL:fig:explain" placement="t!" xml:id="S0.F1">
    <tags>
      <tag><text fontsize="90%">Figure 1</text></tag>
      <tag role="autoref">Figure 1</tag>
      <tag role="refnum">1</tag>
      <tag role="typerefnum">Figure 1</tag>
    </tags>
    <graphics class="ltx_centering" graphic="explain" options="width=433.62pt" xml:id="S0.F1.g1"/>
    <toccaption class="ltx_centering"><tag close=" ">1</tag><text font="bold">Evaluation settings in the ball blocking exemplar scenario of <text font="typewriter">X-VoE</text>.</text> The explanation video illustrates potential hidden dynamics. Circles denote no surprise, and exclamation marks indicate surprise. In the predictive setup (S1), a solvable pair is presented without requiring explanation: predicting observed entities’ dynamics suffices to reason about the outcome. In the hypothetical setup (S2), perceiving the direction of outgoing balls might lead to surprise, yet alternate explanations exist—<text font="italic">e.g.</text>, a hidden blocker behind the wall causing ball rebound. However, a random agent’s scores show negligible disparity, necessitating the explicative setup (S3) to discern surprises, demanding explanatory ability absent in predictive-only or random agents.</toccaption>
    <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 1</text></tag><text font="bold" fontsize="90%">Evaluation settings in the ball blocking exemplar scenario of <text font="typewriter">X-VoE</text>.<text font="medium"> The explanation video illustrates potential hidden dynamics. Circles denote no surprise, and exclamation marks indicate surprise. In the predictive setup (S1), a solvable pair is presented without requiring explanation: predicting observed entities’ dynamics suffices to reason about the outcome. In the hypothetical setup (S2), perceiving the direction of outgoing balls might lead to surprise, yet alternate explanations exist—<text font="italic">e.g.</text>, a hidden blocker behind the wall causing ball rebound. However, a random agent’s scores show negligible disparity, necessitating the explicative setup (S3) to discern surprises, demanding explanatory ability absent in predictive-only or random agents.</text></text></caption>
  </figure>
  <section inlist="toc" xml:id="S1">
    <tags>
      <tag>1</tag>
      <tag role="autoref">section 1</tag>
      <tag role="refnum">1</tag>
      <tag role="typerefnum">§1</tag>
    </tags>
    <title><tag close=" ">1</tag>Introduction</title>
    <para xml:id="S1.p1">
      <p>Humans possess a profound understanding of the physical world, enabling them to predict the outcomes of physical interactions and events <cite class="ltx_citemacro_cite">[<bibref bibrefs="battaglia2013simulation" separator="," yyseparator=","/>]</cite>. From infancy, humans demonstrate intuitive physics, comprehending actions and consequences even in unfamiliar scenarios. For the machine learning community, the challenge lies in emulating this level of intuitive physics understanding. This study introduces <text font="typewriter">X-VoE</text>, a comprehensive benchmark dataset designed to assess and push the limits of AI agents’ intuitive physics comprehension.</p>
    </para>
    <para xml:id="S1.p2">
      <p>The notion of intuitive physics, observed even in young infants, has been foundational in cognitive science and developmental psychology <cite class="ltx_citemacro_cite">[<bibref bibrefs="spelke2007core" separator="," yyseparator=","/>]</cite>. Infants show surprise when physical events violate their expectations, indicating an understanding of fundamental physical principles <cite class="ltx_citemacro_cite">[<bibref bibrefs="baillargeon1985object" separator="," yyseparator=","/>]</cite>. Explanation-based learning has been proposed as a mechanism contributing to the development and refinement of intuitive physics understanding <cite class="ltx_citemacro_cite">[<bibref bibrefs="baillargeon2017explanation" separator="," yyseparator=","/>]</cite>. However, recent advances in this field have primarily resulted in predictive models, lacking the explanatory capacity and falling short of capturing even infant-level intuitive physics comprehension <cite class="ltx_citemacro_cite">[<bibref bibrefs="piloto2022intuitive,smith2019modeling" separator="," yyseparator=","/>]</cite>.</p>
    </para>
    <para xml:id="S1.p3">
      <p>Central to our work is the  <glossaryref inlist="acronym" key="voe" show="long"/> (<glossaryref inlist="acronym" key="voe" show="short"/>) paradigm, widely employed in psychological studies to evaluate infants’ intuitive physics understanding <cite class="ltx_citemacro_cite">[<bibref bibrefs="baillargeon1994physical,baillargeon1985object" separator="," yyseparator=","/>]</cite>. In this paradigm, participants are shown events that either follow or violate intuitive physics laws, and they exhibit surprise, indicated by prolonged attention, at the violating events. Inspired by the effectiveness of this paradigm, we adopt it to evaluate AI agents’ intuitive physics comprehension. In each trial, models encounter experiments adhering to or contravening intuitive physics laws. Models succeed in the VoE test if they display high surprise scores for physics-violating experiments and lower scores for compliant ones.</p>
    </para>
    <para xml:id="S1.p4">
      <p>Existing works within the machine learning and computer vision community have embraced the <glossaryref inlist="acronym" key="voe" show="short"/> paradigm <cite class="ltx_citemacro_cite">[<bibref bibrefs="dasgupta2021benchmark,piloto2022intuitive,riochet2020intphys,smith2019modeling,weihs2022benchmarking" separator="," yyseparator=","/>]</cite>. However, most of these efforts primarily focus on predictive abilities, disregarding the explanatory component <cite class="ltx_citemacro_cite">[<bibref bibrefs="aguiar2002developments,piloto2018probing,piloto2022intuitive,riochet2020intphys,smith2019modeling,stahl2015observing" separator="," yyseparator=","/>]</cite>. This perspective neglects the fundamental aspect of <glossaryref inlist="acronym" key="voe" show="short"/>—the act of explaining observed events. In psychological studies, human participants express surprise not at the moment a physics-violating event occurs, but upon learning of its outcome. This observation underscores the significance of explanation within <glossaryref inlist="acronym" key="voe" show="short"/>.</p>
    </para>
    <para xml:id="S1.p5">
      <p>Motivated by these insights, we introduce <text font="typewriter">X-VoE</text>, an intuitive physics evaluation dataset designed specifically to incorporate explanation within <glossaryref inlist="acronym" key="voe" show="short"/>. Distinct from previous efforts that concentrated on predictive scenarios, our dataset encompasses setups that require explaining observed events in diverse <glossaryref inlist="acronym" key="voe" show="short"/> situations. We establish three <glossaryref inlist="acronym" key="voe" show="short"/> settings for each of the four scenarios: ball collision, blocking, object permanence, and continuity (see <ref labelref="LABEL:fig:test" show="creftype~refnum"/>). Each scenario features predictive, hypothetical, and explicative setups. Notably, the three setups within the ball-blocking scenario distinguish explanatory agents from predictive and random ones.</p>
    </para>
    <para xml:id="S1.p6">
      <p>Furthermore, we propose the  <glossaryref inlist="acronym" key="method" show="long"/> (<glossaryref inlist="acronym" key="method" show="short"/>) model to emulate the explanation-based <glossaryref inlist="acronym" key="voe" show="short"/> process, inspired by findings in human studies <cite class="ltx_citemacro_cite">[<bibref bibrefs="baillargeon1994physical,baillargeon2017explanation" separator="," yyseparator=","/>]</cite>. While <glossaryref inlist="acronym" key="method" show="short"/> is adaptable to diverse deep architectures, we specifically build it upon PLATO <cite class="ltx_citemacro_cite">[<bibref bibrefs="piloto2022intuitive" separator="," yyseparator=","/>]</cite> due to its robust performance. Our model incorporates three self-supervised modules: perception for image encoding, Transformer reasoning for occluded object prediction, and dynamic reasoning for simulating physical dynamics. Importantly, our model introduces a reasoning sub-component to update representations of occluded objects, akin to infants’ explanation-based learning when confronted with unexpected outcomes <cite class="ltx_citemacro_cite">[<bibref bibrefs="baillargeon1994physical" separator="," yyseparator=","/>]</cite>.
<!--  %**** main.tex Line 50 **** --></p>
    </para>
    <para xml:id="S1.p7">
      <p>In summary, our work makes three significant contributions:</p>
      <itemize xml:id="S1.I1">
        <item xml:id="S1.I1.i1">
          <tags>
            <tag>•</tag>
            <tag role="autoref">item </tag>
            <tag role="typerefnum">1st item</tag>
          </tags>
          <para xml:id="S1.I1.i1.p1">
            <p>Introduction of <text font="typewriter">X-VoE</text>, a comprehensive intuitive physics evaluation dataset that challenges AI agents not only in predictive capabilities but also in their capacity to explain. The dataset covers four distinct scenarios, each with predictive, hypothetical, and explicative setups. This allows for a more comprehensive assessment of intuitive physics understanding within <glossaryref inlist="acronym" key="voe" show="short"/>.</p>
          </para>
        </item>
        <item xml:id="S1.I1.i2">
          <tags>
            <tag>•</tag>
            <tag role="autoref">item </tag>
            <tag role="typerefnum">2nd item</tag>
          </tags>
          <para xml:id="S1.I1.i2.p1">
            <p>Proposition of the <glossaryref inlist="acronym" key="method" show="short"/> model, enhancing existing approaches with an explanatory module that improves <glossaryref inlist="acronym" key="voe" show="short"/> evaluation. Our model comprises three modules—perception, reasoning, and dynamics learning—for holistic comprehension and simulation of physical dynamics.</p>
          </para>
        </item>
        <item xml:id="S1.I1.i3">
          <tags>
            <tag>•</tag>
            <tag role="autoref">item </tag>
            <tag role="typerefnum">3rd item</tag>
          </tags>
          <para xml:id="S1.I1.i3.p1">
            <p>Experimental demonstration of <glossaryref inlist="acronym" key="method" show="short"/>’s enhanced performance in alignment with human commonsense compared to other baselines in <text font="typewriter">X-VoE</text>. Additionally, <glossaryref inlist="acronym" key="method" show="short"/> offers insights into hidden factors, as depicted in <ref labelref="LABEL:fig:explain" show="creftype~refnum"/>.</p>
          </para>
        </item>
      </itemize>
    </para>
    <figure inlist="lof" labels="LABEL:fig:test" placement="t!" xml:id="S1.F2">
      <tags>
        <tag><text fontsize="90%">Figure 2</text></tag>
        <tag role="autoref">Figure 2</tag>
        <tag role="refnum">2</tag>
        <tag role="typerefnum">Figure 2</tag>
      </tags>
      <graphics class="ltx_centering" graphic="test" options="width=433.62pt" xml:id="S1.F2.g1"/>
      <toccaption class="ltx_centering"><tag close=" ">2</tag><text font="bold">Testing scenarios in <text font="typewriter">X-VoE</text>: ball collision, blocking, object permanence, and object continuity.</text> Within each scenario, frames in a testing video are linked by the same setup identification number (<text font="italic">e.g.</text>, S1). Black links denote non-surprising videos, while red links indicate surprising ones. Notably, certain videos require explanation to become non-surprising. For example, in the right S2 branch of the object permanence scenario, three cubes on the floor become non-surprising due to preceding observation of two cubes dropping, suggesting a hidden cube behind the wall.</toccaption>
      <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 2</text></tag><text font="bold" fontsize="90%">Testing scenarios in <text font="typewriter">X-VoE</text>: ball collision, blocking, object permanence, and object continuity.<text font="medium"> Within each scenario, frames in a testing video are linked by the same setup identification number (<text font="italic">e.g.</text>, S1). Black links denote non-surprising videos, while red links indicate surprising ones. Notably, certain videos require explanation to become non-surprising. For example, in the right S2 branch of the object permanence scenario, three cubes on the floor become non-surprising due to preceding observation of two cubes dropping, suggesting a hidden cube behind the wall.</text></text></caption>
    </figure>
  </section>
  <section inlist="toc" xml:id="S2">
    <tags>
      <tag>2</tag>
      <tag role="autoref">section 2</tag>
      <tag role="refnum">2</tag>
      <tag role="typerefnum">§2</tag>
    </tags>
    <title><tag close=" ">2</tag>Related work</title>
    <paragraph inlist="toc" xml:id="S2.SS0.SSS0.Px1">
      <title>Intuitive physics</title>
      <para xml:id="S2.SS0.SSS0.Px1.p1">
        <p>Intuitive physics forms a cornerstone of human cognition, enabling rapid and accurate predictions about moving object trajectories <cite class="ltx_citemacro_cite">[<bibref bibrefs="kubricht2017intuitive" separator="," yyseparator=","/>]</cite>. To evaluate machine understanding in this realm, benchmark datasets have emerged, often focusing on predicting future states <cite class="ltx_citemacro_cite">[<bibref bibrefs="battaglia2013simulation,chang2016compositional,lerer2016learning,wu2017learning,chen2020grounding" separator="," yyseparator=","/>]</cite> or inferring object properties <cite class="ltx_citemacro_cite">[<bibref bibrefs="liang2016inferring,liang2018tracking,sanborn2013reconciling" separator="," yyseparator=","/>]</cite>. These methods predominantly gauge model performance by comparing generated predictions to ground truth.</p>
      </para>
      <para xml:id="S2.SS0.SSS0.Px1.p2">
        <p>More recently, the  <glossaryref inlist="acronym" key="voe" show="long"/> (<glossaryref inlist="acronym" key="voe" show="short"/>) paradigm has garnered attention within the machine learning and computer vision community <cite class="ltx_citemacro_cite">[<bibref bibrefs="dasgupta2021benchmark,piloto2022intuitive,riochet2020intphys,smith2019modeling,weihs2022benchmarking" separator="," yyseparator=","/>]</cite>. Rooted in developmental psychology, the <glossaryref inlist="acronym" key="voe" show="short"/> paradigm quantifies model surprise when presented with events that challenge intuitive physics laws. This perspective provides an alternative angle for assessing intuitive physics understanding. Notably, the IntPhys dataset <cite class="ltx_citemacro_cite">[<bibref bibrefs="riochet2020intphys" separator="," yyseparator=","/>]</cite> pioneered this <glossaryref inlist="acronym" key="voe" show="short"/>-based benchmarking approach. ADEPT <cite class="ltx_citemacro_cite">[<bibref bibrefs="smith2019modeling" separator="," yyseparator=","/>]</cite> introduced a model combining re-rendering and object tracking. PLATO <cite class="ltx_citemacro_cite">[<bibref bibrefs="piloto2022intuitive" separator="," yyseparator=","/>]</cite> decomposed the learning process into perception and dynamics prediction. Differing from conventional intuitive physics learning, the <glossaryref inlist="acronym" key="voe" show="short"/> paradigm does not rely on absolute ground truth. Instead, it hinges on relative measures of surprise, akin to developmental studies that assume higher responses indicate increased surprise. This emphasizes the role of explanation in <glossaryref inlist="acronym" key="voe" show="short"/>, as demonstrated in <ref labelref="LABEL:fig:explain" show="creftype~refnum"/>. 
In contrast to prior works that often neglected this vital component, our <text font="typewriter">X-VoE</text> includes scenarios that demand both traditional prediction-based understanding and explanation-based comprehension. Additionally, we propose an explanation-enhanced physics learner, <glossaryref inlist="acronym" key="method" show="short"/>, which achieves improved performance and interpretability by incorporating explanations.</p>
      </para>
    </paragraph>
    <paragraph inlist="toc" xml:id="S2.SS0.SSS0.Px2">
      <title>Video prediction</title>
<!--  %**** main.tex Line 75 **** -->      <para xml:id="S2.SS0.SSS0.Px2.p1">
        <p>The challenge of comprehending videos and making plausible predictions of future states from current observations has been a longstanding problem within computer vision <cite class="ltx_citemacro_cite">[<bibref bibrefs="babaeizadeh2017stochastic,lotter2016deep,mathieu2015deep" separator="," yyseparator=","/>]</cite>, closely connected to the <glossaryref inlist="acronym" key="voe" show="short"/> paradigm. Solving <glossaryref inlist="acronym" key="voe" show="short"/> problems frequently involves predicting future frames for inference and evaluation. However, this prediction task is intricate due to the inherent complexity of modeling real-world dynamics and conditional image synthesis <cite class="ltx_citemacro_cite">[<bibref bibrefs="tian2021good,weissenborn2019scaling" separator="," yyseparator=","/>]</cite>. Within the computer vision community, various architectures have been explored to address these challenges and enhance the quality of generated images <cite class="ltx_citemacro_cite">[<bibref bibrefs="tian2021good,weissenborn2019scaling" separator="," yyseparator=","/>]</cite>. The task is further complicated by the need to model relationships between frames, leading to approaches that integrate spatial transformations over time <cite class="ltx_citemacro_cite">[<bibref bibrefs="finn2016unsupervised,liu2017video,reda2018sdc" separator="," yyseparator=","/>]</cite>. Disentanglement of motion and content has also been pursued <cite class="ltx_citemacro_cite">[<bibref bibrefs="denton2017unsupervised,hsieh2018learning,liu2021emergence,villegas2017decomposing" separator="," yyseparator=","/>]</cite>. More recent efforts involve learning physics-based dynamics from videos and reasoning about unknown factors <cite class="ltx_citemacro_cite">[<bibref bibrefs="guen2020disentangling" separator="," yyseparator=","/>]</cite>. Within <text font="typewriter">X-VoE</text>, we assess the performance of these video prediction models as baseline methods.</p>
      </para>
    </paragraph>
    <paragraph inlist="toc" xml:id="S2.SS0.SSS0.Px3">
      <title>Object-centric dynamics</title>
      <para xml:id="S2.SS0.SSS0.Px3.p1">
        <p>The “vision-as-inverse-graphics” framework and the versatility of physics simulation have led to models based on physics simulation, which offer notable advantages in terms of accuracy and generality <cite class="ltx_citemacro_cite">[<bibref bibrefs="chang2016compositional,riochet2020occlusion" separator="," yyseparator=","/>]</cite>. However, these models are often heavily reliant on specific physics engines, limiting their flexibility. In response, recent works have leveraged graph neural networks and object-centric representations to mitigate this dependence <cite class="ltx_citemacro_cite">[<bibref bibrefs="piloto2022intuitive,watters2017visual" separator="," yyseparator=","/>]</cite>. By abstracting irrelevant signals and focusing on objects, these models establish a tighter mapping between visual inputs and physics engines. Further, some models can directly simulate real physics engines <cite class="ltx_citemacro_cite">[<bibref bibrefs="battaglia2013simulation,ding2021dynamic,wu2017learning" separator="," yyseparator=","/>]</cite>. These object-centric dynamics models have demonstrated the ability to capture intricate dynamics. Our approach in <text font="typewriter">X-VoE</text> aligns with this framework, using object-centric representations for downstream computation and reasoning.</p>
      </para>
    </paragraph>
  </section>
  <section inlist="toc" xml:id="S3">
    <tags>
      <tag>3</tag>
      <tag role="autoref">section 3</tag>
      <tag role="refnum">3</tag>
      <tag role="typerefnum">§3</tag>
    </tags>
    <title><tag close=" ">3</tag>Generating <text font="typewriter">X-VoE</text></title>
    <para xml:id="S3.p1">
      <p>Our <text font="typewriter">X-VoE</text> dataset encompasses four distinct scenarios, covering ball collision, ball blocking, object permanence, and object continuity. To evaluate various intuitive physics principles, each scenario comprises three distinct settings: predictive, hypothetical, and explicative, as illustrated in <ref labelref="LABEL:fig:test" show="creftype~refnum"/>. Within each setting, we create 1,000 procedurally generated scene pairs using Unreal Engine 4. Importantly, <text font="typewriter">X-VoE</text> primarily serves as a test suite for evaluating intuitive physics understanding, with no constraints on model training data.</p>
    </para>
    <subsection inlist="toc" labels="LABEL:sec:test_data" xml:id="S3.SS1">
      <tags>
        <tag>3.1</tag>
        <tag role="autoref">subsection 3.1</tag>
        <tag role="refnum">3.1</tag>
        <tag role="typerefnum">§3.1</tag>
      </tags>
      <title><tag close=" ">3.1</tag>Testing data</title>
      <para xml:id="S3.SS1.p1">
        <p>We generate testing videos that span four key aspects of object dynamics: ball collision, ball blocking, object permanence, and object continuity. Refer to <ref labelref="LABEL:fig:test" show="creftype~refnum"/> for a visual overview.</p>
      </para>
      <paragraph inlist="toc" xml:id="S3.SS1.SSS0.Px1">
        <title>Collision</title>
        <para xml:id="S3.SS1.SSS0.Px1.p1">
          <p>In this scenario, a ball traverses the scene, while an occlusion wall is positioned centrally. In the predictive setting (S1), we design a scenario where a ball of differing color but identical mass stands behind a wall. The incoming ball collides with this hidden ball, resulting in the incoming ball coming to a halt and the concealed ball continuing its trajectory. To introduce <glossaryref inlist="acronym" key="voe" show="short"/> effects, we enable the incoming ball to pass through the hidden ball. In the hypothetical setting (S2), we create a scene featuring a central wall concealing objects behind it. An incoming ball enters the scene from the left and rolls behind the wall. In some cases, an additional ball appears to pass through the wall, while in others, the incoming ball does so. This distinction hinges on whether an unseen ball is situated behind the wall. The explicative setting (S3) closely mirrors the hypothetical setting, but we lift the wall to reveal the concealed scene’s contents.</p>
        </para>
        <figure inlist="lof" labels="LABEL:fig:pipeline" placement="t!" xml:id="S3.F3">
          <tags>
            <tag><text fontsize="90%">Figure 3</text></tag>
            <tag role="autoref">Figure 3</tag>
            <tag role="refnum">3</tag>
            <tag role="typerefnum">Figure 3</tag>
          </tags>
          <graphics class="ltx_centering" graphic="pipeline" options="width=433.62pt" xml:id="S3.F3.g1"/>
          <toccaption class="ltx_centering"><tag close=" ">3</tag><text font="bold">Overview of the <glossaryref inlist="acronym" key="method" show="short"/> model for explanation-based physics learning.</text> The model comprises three key modules: (i) the perception module, responsible for extracting object-centric representation from RGBD videos and segmentation masks; (ii) the reasoning module, utilizing two Transformer networks to infer representations of occluded objects; (iii) the dynamics module, which acquires intuitive physical knowledge and refines reasoning outcomes to align with intuitive physics. Additionally, the inferred object representation can be visualized using the decoder from the perception module, offering a <text font="bold">visual explanation</text> of events occurring behind the wall. Wavy curves indicate masking. Refer to the text for comprehensive details.</toccaption>
          <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 3</text></tag><text font="bold" fontsize="90%">Overview of the <glossaryref inlist="acronym" key="method" show="short"/> model for explanation-based physics learning.<text font="medium"> The model comprises three key modules: (i) the perception module, responsible for extracting object-centric representation from RGBD videos and segmentation masks; (ii) the reasoning module, utilizing two Transformer networks to infer representations of occluded objects; (iii) the dynamics module, which acquires intuitive physical knowledge and refines reasoning outcomes to align with intuitive physics. Additionally, the inferred object representation can be visualized using the decoder from the perception module, offering a </text>visual explanation<text font="medium"> of events occurring behind the wall. Wavy curves indicate masking. Refer to the text for comprehensive details.</text></text></caption>
        </figure>
<!--  %**** main.tex Line 100 **** -->      </paragraph>
      <paragraph inlist="toc" xml:id="S3.SS1.SSS0.Px2">
        <title>Blocking</title>
        <para xml:id="S3.SS1.SSS0.Px2.p1">
          <p>The blocking scenario is conceptually similar to the collision scenario, replacing the hidden ball with a stationary cube. Upon colliding with the cube, the incoming ball rebounds.</p>
        </para>
      </paragraph>
      <paragraph inlist="toc" xml:id="S3.SS1.SSS0.Px3">
        <title>Object permanence</title>
        <para xml:id="S3.SS1.SSS0.Px3.p1">
          <p>Drawing inspiration from developmental psychology literature, we recreate a scenario involving cubes falling to the ground and becoming occluded by a wall. In the predictive setting (S1), we devise a case where a wall descends to an initially vacant ground, followed by three cubes falling behind the wall. To elicit <glossaryref inlist="acronym" key="voe" show="short"/> effects, we raise the wall, revealing fewer than three objects. In the hypothetical setting (S2), the scenario begins with a wall positioned centrally, obscuring objects behind it. Three or two cubes fall behind the wall. When the wall is lifted, the scene consistently features three cubes, even when only two cubes initially fell. This reflects the possibility of one cube being hidden behind the wall from the outset.</p>
        </para>
      </paragraph>
      <paragraph inlist="toc" xml:id="S3.SS1.SSS0.Px4">
        <title>Object continuity</title>
        <para xml:id="S3.SS1.SSS0.Px4.p1">
          <p>Motivated by psychology studies <cite class="ltx_citemacro_cite">[<bibref bibrefs="aguiar2002developments" separator="," yyseparator=","/>]</cite>, we introduce a wall with a lower-half window. This setup allows a ball to traverse the scene from one side to the other. The ball becomes occluded when behind the wall, emerges through the window, disappears, and subsequently reappears from the opposite end. The three distinct settings mirror the collision and blocking scenarios. Plausible and implausible scenes differ in whether the ball remains visible upon passing through the window. In the predictive setting (S1), all relevant information is presented at the video’s outset and conclusion, ruling out the presence of hidden objects. In the hypothetical setting (S2), information is deliberately withheld at the video’s start and finish, so the model’s judgments should align with those of infants <cite class="ltx_citemacro_cite">[<bibref bibrefs="aguiar2002developments" separator="," yyseparator=","/>]</cite>; this requires explaining the event by the existence of two balls. In the explicative setting (S3), the wall is lifted, verifying the absence of an additional ball behind the wall.</p>
        </para>
      </paragraph>
    </subsection>
    <subsection inlist="toc" xml:id="S3.SS2">
      <tags>
        <tag>3.2</tag>
        <tag role="autoref">subsection 3.2</tag>
        <tag role="refnum">3.2</tag>
        <tag role="typerefnum">§3.2</tag>
      </tags>
      <title><tag close=" ">3.2</tag>Training data</title>
      <para xml:id="S3.SS2.p1">
        <p>Though we do not impose constraints on the training data, for this study, we generate data adhering to the same structure as the test scenarios but without <glossaryref inlist="acronym" key="voe" show="short"/> effects. As shown in <ref labelref="LABEL:fig:train" show="creftype~refnum"/>, the training set consists of 100,000 procedurally generated scenes, closely mirroring the scale used for training PLATO <cite class="ltx_citemacro_cite">[<bibref bibrefs="piloto2022intuitive" separator="," yyseparator=","/>]</cite>. During training, we exclusively present videos following intuitive physics laws, raising the wall at the beginning and end of each video. This approach reduces reasoning complexity, simulating the developmental process in which only non-surprising physical events are observed. Consequently, models must learn, without supervision, from video sequences depicting ordinary scenes, developing the intuitive physics understanding necessary for <glossaryref inlist="acronym" key="voe" show="short"/>. Furthermore, for the collision and blocking scenarios, we create videos depicting balls passing behind the wall without collision or obstruction, demonstrating the unimpeded path behind the wall as shown in <ref labelref="LABEL:fig:train" show="creftype~refnum"/>(a). We also generate scenes similar to the previously described settings but devoid of occlusion walls.</p>
      </para>
      <figure inlist="lof" labels="LABEL:fig:train" placement="t!" xml:id="S3.F4">
        <tags>
          <tag><text fontsize="90%">Figure 4</text></tag>
          <tag role="autoref">Figure 4</tag>
          <tag role="refnum">4</tag>
          <tag role="typerefnum">Figure 4</tag>
        </tags>
        <graphics class="ltx_centering" graphic="train" options="width=433.62pt" xml:id="S3.F4.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">4</tag><text font="bold">Training scenarios for <text font="typewriter">X-VoE</text>.</text> The timeline progresses from left to right, where each row represents the control, collision, blocking, object permanence, and object continuity groups from top to bottom. Please refer to <ref labelref="LABEL:sec_sup:train_data" show="creftype~refnum"/> for additional details.</toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 4</text></tag><text font="bold" fontsize="90%">Training scenarios for <text font="typewriter">X-VoE</text>.<text font="medium"> The timeline progresses from left to right, where each row represents the control, collision, blocking, object permanence, and object continuity groups from top to bottom. Please refer to <ref labelref="LABEL:sec_sup:train_data" show="creftype~refnum"/> for additional details.</text></text></caption>
      </figure>
    </subsection>
  </section>
  <section inlist="toc" xml:id="S4">
    <tags>
      <tag>4</tag>
      <tag role="autoref">section 4</tag>
      <tag role="refnum">4</tag>
      <tag role="typerefnum">§4</tag>
    </tags>
    <title><tag close=" ">4</tag> <glossaryref inlist="acronym" key="method" show="long"/> (<glossaryref inlist="acronym" key="method" show="short"/>)</title>
<!--  %**** main.tex Line 125 **** -->    <subsection inlist="toc" xml:id="S4.SS1">
      <tags>
        <tag>4.1</tag>
        <tag role="autoref">subsection 4.1</tag>
        <tag role="refnum">4.1</tag>
        <tag role="typerefnum">§4.1</tag>
      </tags>
      <title><tag close=" ">4.1</tag>Framework</title>
      <para xml:id="S4.SS1.p1">
        <p>Our proposed  <glossaryref inlist="acronym" key="method" show="long"/> (<glossaryref inlist="acronym" key="method" show="short"/>) model draws inspiration from developmental psychology theories concerning infancy. As depicted in <ref labelref="LABEL:fig:pipeline" show="creftype~refnum"/>, the <glossaryref inlist="acronym" key="method" show="short"/> model comprises three key components: (1) a perception module responsible for extracting object-centric representations to facilitate downstream processing, (2) a reasoning module tasked with inferring occluded object states by considering both spatial and temporal contexts, and (3) a dynamics module designed to acquire physical insights and evaluate inference outcomes for occluded objects.</p>
      </para>
      <paragraph inlist="toc" xml:id="S4.SS1.SSS0.Px1">
        <title>Perception</title>
        <para xml:id="S4.SS1.SSS0.Px1.p1">
          <p>The perception module is designed to process input RGBD video sequences, represented as <Math mode="inline" tex="\langle x^{0},x^{1},...,x^{T}\rangle" text="list@(x ^ 0, x ^ 1, ldots, x ^ T)" xml:id="S4.SS1.SSS0.Px1.p1.m1">
              <XMath>
                <XMDual>
                  <XMApp>
                    <XMTok meaning="list"/>
                    <XMRef idref="S4.SS1.SSS0.Px1.p1.m1.2"/>
                    <XMRef idref="S4.SS1.SSS0.Px1.p1.m1.3"/>
                    <XMRef idref="S4.SS1.SSS0.Px1.p1.m1.1"/>
                    <XMRef idref="S4.SS1.SSS0.Px1.p1.m1.4"/>
                  </XMApp>
                  <XMWrap>
                    <XMTok name="langle" role="OPEN" stretchy="false">⟨</XMTok>
                    <XMApp xml:id="S4.SS1.SSS0.Px1.p1.m1.2">
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" role="UNKNOWN">x</XMTok>
                      <XMTok fontsize="70%" meaning="0" role="NUMBER">0</XMTok>
                    </XMApp>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMApp xml:id="S4.SS1.SSS0.Px1.p1.m1.3">
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" role="UNKNOWN">x</XMTok>
                      <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                    </XMApp>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMTok name="ldots" role="ID" xml:id="S4.SS1.SSS0.Px1.p1.m1.1">…</XMTok>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMApp xml:id="S4.SS1.SSS0.Px1.p1.m1.4">
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" role="UNKNOWN">x</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">T</XMTok>
                    </XMApp>
                    <XMTok name="rangle" role="CLOSE" stretchy="false">⟩</XMTok>
                  </XMWrap>
                </XMDual>
              </XMath>
            </Math>, alongside their corresponding segmentation masks, denoted as <Math mode="inline" tex="\langle m^{0},m^{1},...,m^{T}\rangle" text="list@(m ^ 0, m ^ 1, ldots, m ^ T)" xml:id="S4.SS1.SSS0.Px1.p1.m2">
              <XMath>
                <XMDual>
                  <XMApp>
                    <XMTok meaning="list"/>
                    <XMRef idref="S4.SS1.SSS0.Px1.p1.m2.2"/>
                    <XMRef idref="S4.SS1.SSS0.Px1.p1.m2.3"/>
                    <XMRef idref="S4.SS1.SSS0.Px1.p1.m2.1"/>
                    <XMRef idref="S4.SS1.SSS0.Px1.p1.m2.4"/>
                  </XMApp>
                  <XMWrap>
                    <XMTok name="langle" role="OPEN" stretchy="false">⟨</XMTok>
                    <XMApp xml:id="S4.SS1.SSS0.Px1.p1.m2.2">
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" role="UNKNOWN">m</XMTok>
                      <XMTok fontsize="70%" meaning="0" role="NUMBER">0</XMTok>
                    </XMApp>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMApp xml:id="S4.SS1.SSS0.Px1.p1.m2.3">
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" role="UNKNOWN">m</XMTok>
                      <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                    </XMApp>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMTok name="ldots" role="ID" xml:id="S4.SS1.SSS0.Px1.p1.m2.1">…</XMTok>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMApp xml:id="S4.SS1.SSS0.Px1.p1.m2.4">
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" role="UNKNOWN">m</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">T</XMTok>
                    </XMApp>
                    <XMTok name="rangle" role="CLOSE" stretchy="false">⟩</XMTok>
                  </XMWrap>
                </XMDual>
              </XMath>
            </Math>. The masks are generated using a pre-trained segmentation model. Notably, the simplicity of the scenes allows for direct use of ground truth segmentation, as observed in PLATO <cite class="ltx_citemacro_cite">[<bibref bibrefs="piloto2022intuitive" separator="," yyseparator=","/>]</cite>. For each frame, the perception module employs a Component Variational Autoencoder (Component VAE) <cite class="ltx_citemacro_cite">[<bibref bibrefs="burgess2019monet" separator="," yyseparator=","/>]</cite> to transform each input image into a latent vector representation <Math mode="inline" tex="\langle z^{0}_{1:K},z^{1}_{1:K},...,z^{T}_{1:K}\rangle" text="list@((z ^ 0) _ (1 colon K), (z ^ 1) _ (1 colon K), ldots, (z ^ T) _ (1 colon K))" xml:id="S4.SS1.SSS0.Px1.p1.m3">
              <XMath>
                <XMDual>
                  <XMApp>
                    <XMTok meaning="list"/>
                    <XMRef idref="S4.SS1.SSS0.Px1.p1.m3.2"/>
                    <XMRef idref="S4.SS1.SSS0.Px1.p1.m3.3"/>
                    <XMRef idref="S4.SS1.SSS0.Px1.p1.m3.1"/>
                    <XMRef idref="S4.SS1.SSS0.Px1.p1.m3.4"/>
                  </XMApp>
                  <XMWrap>
                    <XMTok name="langle" role="OPEN" stretchy="false">⟨</XMTok>
                    <XMApp xml:id="S4.SS1.SSS0.Px1.p1.m3.2">
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMApp>
                        <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="italic" role="UNKNOWN">z</XMTok>
                        <XMTok fontsize="70%" meaning="0" role="NUMBER">0</XMTok>
                      </XMApp>
                      <XMApp>
                        <XMTok fontsize="70%" name="colon" role="METARELOP">:</XMTok>
                        <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">K</XMTok>
                      </XMApp>
                    </XMApp>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMApp xml:id="S4.SS1.SSS0.Px1.p1.m3.3">
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMApp>
                        <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="italic" role="UNKNOWN">z</XMTok>
                        <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                      </XMApp>
                      <XMApp>
                        <XMTok fontsize="70%" name="colon" role="METARELOP">:</XMTok>
                        <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">K</XMTok>
                      </XMApp>
                    </XMApp>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMTok name="ldots" role="ID" xml:id="S4.SS1.SSS0.Px1.p1.m3.1">…</XMTok>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMApp xml:id="S4.SS1.SSS0.Px1.p1.m3.4">
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMApp>
                        <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="italic" role="UNKNOWN">z</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">T</XMTok>
                      </XMApp>
                      <XMApp>
                        <XMTok fontsize="70%" name="colon" role="METARELOP">:</XMTok>
                        <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">K</XMTok>
                      </XMApp>
                    </XMApp>
                    <XMTok name="rangle" role="CLOSE" stretchy="false">⟩</XMTok>
                  </XMWrap>
                </XMDual>
              </XMath>
            </Math>, where <Math mode="inline" tex="K" text="K" xml:id="S4.SS1.SSS0.Px1.p1.m4">
              <XMath>
                <XMTok font="italic" role="UNKNOWN">K</XMTok>
              </XMath>
            </Math> represents the object count per frame.</p>
        </para>
      </paragraph>
      <paragraph inlist="toc" xml:id="S4.SS1.SSS0.Px2">
        <title>Reasoning</title>
        <para xml:id="S4.SS1.SSS0.Px2.p1">
          <p>The reasoning module takes the object embeddings obtained from the perception module as input and endeavors to enhance scene comprehension by inferring the attributes of occluded objects, whose masks are empty due to occlusion. This module employs two Transformer models to refine object embeddings and recover hidden objects. Both Transformers adopt flattened spatial-temporal embeddings and apply global attention mechanisms to contextualize information. The first Transformer refines input features of occluded objects to align with a learned dynamics module, producing <Math mode="inline" tex="\tilde{z}" text="tilde@(z)" xml:id="S4.SS1.SSS0.Px2.p1.m1">
              <XMath>
                <XMApp>
                  <XMTok name="tilde" role="OVERACCENT" stretchy="false">~</XMTok>
                  <XMTok font="italic" role="UNKNOWN">z</XMTok>
                </XMApp>
              </XMath>
            </Math>. The second Transformer is responsible for recovering objects hidden within the observation sequences of both original and refined features. Note that object recovery mirrors Masked Autoencoding <cite class="ltx_citemacro_cite">[<bibref bibrefs="he2022masked" separator="," yyseparator=","/>]</cite>, treating a random object as absent and requiring its reconstruction from contextual cues. Drawing from these observations, we train the second Transformer similarly to Masked Autoencoders (MAE).</p>
        </para>
      </paragraph>
      <paragraph inlist="toc" xml:id="S4.SS1.SSS0.Px3">
        <title>Dynamics</title>
        <para xml:id="S4.SS1.SSS0.Px3.p1">
          <p>The dynamics module predicts object embeddings <Math mode="inline" tex="\hat{z}_{1:K}^{t+1}" text="((hat@(z)) _ (1 colon K)) ^ (t + 1)" xml:id="S4.SS1.SSS0.Px3.p1.m1">
              <XMath>
                <XMApp>
                  <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMApp>
                      <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                      <XMTok font="italic" role="UNKNOWN">z</XMTok>
                    </XMApp>
                    <XMApp>
                      <XMTok fontsize="70%" name="colon" role="METARELOP">:</XMTok>
                      <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">K</XMTok>
                    </XMApp>
                  </XMApp>
                  <XMApp>
                    <XMTok fontsize="70%" meaning="plus" role="ADDOP">+</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                    <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                  </XMApp>
                </XMApp>
              </XMath>
            </Math> in the succeeding frame based on the preceding frames’ refined object embeddings <Math mode="inline" tex="\tilde{z}_{1:K}^{0:t}" text="((tilde@(z)) _ (1 colon K)) ^ (0 colon t)" xml:id="S4.SS1.SSS0.Px3.p1.m2">
              <XMath>
                <XMApp>
                  <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMApp>
                      <XMTok name="tilde" role="OVERACCENT" stretchy="false">~</XMTok>
                      <XMTok font="italic" role="UNKNOWN">z</XMTok>
                    </XMApp>
                    <XMApp>
                      <XMTok fontsize="70%" name="colon" role="METARELOP">:</XMTok>
                      <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">K</XMTok>
                    </XMApp>
                  </XMApp>
                  <XMApp>
                    <XMTok fontsize="70%" name="colon" role="METARELOP">:</XMTok>
                    <XMTok fontsize="70%" meaning="0" role="NUMBER">0</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                  </XMApp>
                </XMApp>
              </XMath>
            </Math>. This is done with the interaction dynamics module introduced in PLATO <cite class="ltx_citemacro_cite">[<bibref bibrefs="piloto2022intuitive" separator="," yyseparator=","/>]</cite>, supplemented by a residual module. Unlike PLATO, we use the object embeddings produced after the reasoning module and train the modules jointly.</p>
        </para>
        <figure inlist="lof" labels="LABEL:fig:result" placement="t!" xml:id="S4.F5">
          <tags>
            <tag><text fontsize="90%">Figure 5</text></tag>
            <tag role="autoref">Figure 5</tag>
            <tag role="refnum">5</tag>
            <tag role="typerefnum">Figure 5</tag>
          </tags>
          <graphics class="ltx_centering" graphic="result" options="width=433.62pt" xml:id="S4.F5.g1"/>
          <toccaption class="ltx_centering"><tag close=" ">5</tag><text font="bold">(a) Performance of different models on <text font="typewriter">X-VoE</text> under the holistic metric.</text> The red line denotes the ideal performance. <text font="bold">(b) PCA with or without residual connection.</text> The first ten principal components are shown. <text font="bold">(c) Results from each score component.</text></toccaption>
          <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 5</text></tag><text font="bold" fontsize="90%">(a) Performance of different models on <text font="typewriter">X-VoE</text> under the holistic metric.<text font="medium"> The red line denotes the ideal performance. </text>(b) PCA with or without residual connection.<text font="medium"> The first ten principal components are shown. </text>(c) Results from each score component.</text></caption>
        </figure>
      </paragraph>
    </subsection>
    <subsection inlist="toc" xml:id="S4.SS2">
      <tags>
        <tag>4.2</tag>
        <tag role="autoref">subsection 4.2</tag>
        <tag role="refnum">4.2</tag>
        <tag role="typerefnum">§4.2</tag>
      </tags>
      <title><tag close=" ">4.2</tag>Model training</title>
<!--  %**** main.tex Line 150 **** -->      <para xml:id="S4.SS2.p1">
        <p>Initially, we pre-train the perception module to equip the system with foundational visual capabilities. Specifically, the perception module undergoes pre-training using RGBD images and segmentation masks. Throughout this phase, we segment objects and employ masked images for VAE training. During image reconstruction, depth information assists in calculating object mask details.</p>
      </para>
      <para xml:id="S4.SS2.p2">
        <p>We then train one Transformer and the dynamics module, with latent codes frozen from the perception module, in an end-to-end manner employing the following loss:</p>
        <equationgroup class="ltx_eqn_align" xml:id="A4.EGx1">
          <equation xml:id="S4.Ex1">
            <MathFork>
              <Math tex="\displaystyle\tilde{z}=f_{\text{inf}}(z)" text="tilde@(z) = f _ [inf] * z" xml:id="S4.Ex1.m3">
                <XMath>
                  <XMApp>
                    <XMTok meaning="equals" role="RELOP">=</XMTok>
                    <XMApp>
                      <XMTok name="tilde" role="OVERACCENT" stretchy="false">~</XMTok>
                      <XMTok font="italic" role="UNKNOWN">z</XMTok>
                    </XMApp>
                    <XMApp>
                      <XMTok meaning="times" role="MULOP">⁢</XMTok>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="italic" role="UNKNOWN">f</XMTok>
                        <XMText><text fontsize="70%">inf</text></XMText>
                      </XMApp>
                      <XMDual>
                        <XMRef idref="S4.Ex1.m3.1"/>
                        <XMWrap>
                          <XMTok role="OPEN" stretchy="false">(</XMTok>
                          <XMTok font="italic" role="UNKNOWN" xml:id="S4.Ex1.m3.1">z</XMTok>
                          <XMTok role="CLOSE" stretchy="false">)</XMTok>
                        </XMWrap>
                      </XMDual>
                    </XMApp>
                  </XMApp>
                </XMath>
              </Math>
              <MathBranch>
                <td align="right"><Math mode="inline" tex="\displaystyle\tilde{z}" text="tilde@(z)" xml:id="S4.Ex1.m1">
                    <XMath>
                      <XMApp>
                        <XMTok name="tilde" role="OVERACCENT" stretchy="false">~</XMTok>
                        <XMTok font="italic" role="UNKNOWN">z</XMTok>
                      </XMApp>
                    </XMath>
                  </Math></td>
                <td align="left"><Math mode="inline" tex="\displaystyle=f_{\text{inf}}(z)" text="absent = f _ [inf] * z" xml:id="S4.Ex1.m2">
                    <XMath>
                      <XMApp>
                        <XMTok meaning="equals" role="RELOP">=</XMTok>
                        <XMTok meaning="absent"/>
                        <XMApp>
                          <XMTok meaning="times" role="MULOP">⁢</XMTok>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                            <XMTok font="italic" role="UNKNOWN">f</XMTok>
                            <XMText><text fontsize="70%">inf</text></XMText>
                          </XMApp>
                          <XMDual>
                            <XMRef idref="S4.Ex1.m2.1"/>
                            <XMWrap>
                              <XMTok role="OPEN" stretchy="false">(</XMTok>
                              <XMTok font="italic" role="UNKNOWN" xml:id="S4.Ex1.m2.1">z</XMTok>
                              <XMTok role="CLOSE" stretchy="false">)</XMTok>
                            </XMWrap>
                          </XMDual>
                        </XMApp>
                      </XMApp>
                    </XMath>
                  </Math></td>
              </MathBranch>
            </MathFork>
          </equation>
          <equation labels="LABEL:eqn:init" xml:id="S4.E1">
            <tags>
              <tag>(1)</tag>
              <tag role="autoref">Equation 1</tag>
              <tag role="refnum">1</tag>
            </tags>
            <MathFork>
              <Math tex="\displaystyle\mathcal{L}=\norm{f_{\text{dyn}}(\tilde{z}_{1:K}^{0:t})-\tilde{z}%&#10;_{1:K}^{t+1}}_{2}," text="L = [\norm] * f _ [dyn] * ((tilde@(z)) _ (1 colon K)) ^ (0 colon t) - (((tilde@(z)) _ (1 colon K)) ^ (t + 1)) _ 2" xml:id="S4.E1.m3">
                <XMath>
                  <XMDual>
                    <XMRef idref="S4.E1.m3.1"/>
                    <XMWrap>
                      <XMApp xml:id="S4.E1.m3.1">
                        <XMTok meaning="equals" role="RELOP">=</XMTok>
                        <XMTok font="caligraphic" role="UNKNOWN">L</XMTok>
                        <XMApp>
                          <XMTok meaning="minus" role="ADDOP">-</XMTok>
                          <XMApp>
                            <XMTok meaning="times" role="MULOP">⁢</XMTok>
                            <ERROR class="undefined">\norm</ERROR>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                              <XMTok font="italic" role="UNKNOWN">f</XMTok>
                              <XMText><text fontsize="70%">dyn</text></XMText>
                            </XMApp>
                            <XMDual>
                              <XMRef idref="S4.E1.m3.1.1"/>
                              <XMWrap>
                                <XMTok role="OPEN" stretchy="false">(</XMTok>
                                <XMApp xml:id="S4.E1.m3.1.1">
                                  <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                                  <XMApp>
                                    <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                                    <XMApp>
                                      <XMTok name="tilde" role="OVERACCENT" stretchy="false">~</XMTok>
                                      <XMTok font="italic" role="UNKNOWN">z</XMTok>
                                    </XMApp>
                                    <XMApp>
                                      <XMTok fontsize="70%" name="colon" role="METARELOP">:</XMTok>
                                      <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">K</XMTok>
                                    </XMApp>
                                  </XMApp>
                                  <XMApp>
                                    <XMTok fontsize="70%" name="colon" role="METARELOP">:</XMTok>
                                    <XMTok fontsize="70%" meaning="0" role="NUMBER">0</XMTok>
                                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                  </XMApp>
                                </XMApp>
                                <XMTok role="CLOSE" stretchy="false">)</XMTok>
                              </XMWrap>
                            </XMDual>
                          </XMApp>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                            <XMApp>
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                              <XMApp>
                                <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                                <XMApp>
                                  <XMTok name="tilde" role="OVERACCENT" stretchy="false">~</XMTok>
                                  <XMTok font="italic" role="UNKNOWN">z</XMTok>
                                </XMApp>
                                <XMApp>
                                  <XMTok fontsize="70%" name="colon" role="METARELOP">:</XMTok>
                                  <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">K</XMTok>
                                </XMApp>
                              </XMApp>
                              <XMApp>
                                <XMTok fontsize="70%" meaning="plus" role="ADDOP">+</XMTok>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                              </XMApp>
                            </XMApp>
                            <XMTok fontsize="70%" meaning="2" role="NUMBER">2</XMTok>
                          </XMApp>
                        </XMApp>
                      </XMApp>
                      <XMTok role="PUNCT">,</XMTok>
                    </XMWrap>
                  </XMDual>
                </XMath>
              </Math>
              <MathBranch>
                <td align="right"><Math mode="inline" tex="\displaystyle\mathcal{L}" text="L" xml:id="S4.E1.m1">
                    <XMath>
                      <XMTok font="caligraphic" role="UNKNOWN">L</XMTok>
                    </XMath>
                  </Math></td>
                <td align="left"><Math mode="inline" tex="\displaystyle=\norm{f_{\text{dyn}}(\tilde{z}_{1:K}^{0:t})-\tilde{z}_{1:K}^{t+1%&#10;}}_{2}," text="absent = [\norm] * f _ [dyn] * ((tilde@(z)) _ (1 colon K)) ^ (0 colon t) - (((tilde@(z)) _ (1 colon K)) ^ (t + 1)) _ 2" xml:id="S4.E1.m2">
                    <XMath>
                      <XMDual>
                        <XMRef idref="S4.E1.m2.1"/>
                        <XMWrap>
                          <XMApp xml:id="S4.E1.m2.1">
                            <XMTok meaning="equals" role="RELOP">=</XMTok>
                            <XMTok meaning="absent"/>
                            <XMApp>
                              <XMTok meaning="minus" role="ADDOP">-</XMTok>
                              <XMApp>
                                <XMTok meaning="times" role="MULOP">⁢</XMTok>
                                <ERROR class="undefined">\norm</ERROR>
                                <XMApp>
                                  <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                                  <XMTok font="italic" role="UNKNOWN">f</XMTok>
                                  <XMText><text fontsize="70%">dyn</text></XMText>
                                </XMApp>
                                <XMDual>
                                  <XMRef idref="S4.E1.m2.1.1"/>
                                  <XMWrap>
                                    <XMTok role="OPEN" stretchy="false">(</XMTok>
                                    <XMApp xml:id="S4.E1.m2.1.1">
                                      <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                                      <XMApp>
                                        <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                                        <XMApp>
                                          <XMTok name="tilde" role="OVERACCENT" stretchy="false">~</XMTok>
                                          <XMTok font="italic" role="UNKNOWN">z</XMTok>
                                        </XMApp>
                                        <XMApp>
                                          <XMTok fontsize="70%" name="colon" role="METARELOP">:</XMTok>
                                          <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">K</XMTok>
                                        </XMApp>
                                      </XMApp>
                                      <XMApp>
                                        <XMTok fontsize="70%" name="colon" role="METARELOP">:</XMTok>
                                        <XMTok fontsize="70%" meaning="0" role="NUMBER">0</XMTok>
                                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                      </XMApp>
                                    </XMApp>
                                    <XMTok role="CLOSE" stretchy="false">)</XMTok>
                                  </XMWrap>
                                </XMDual>
                              </XMApp>
                              <XMApp>
                                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                <XMApp>
                                  <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                                  <XMApp>
                                    <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                                    <XMApp>
                                      <XMTok name="tilde" role="OVERACCENT" stretchy="false">~</XMTok>
                                      <XMTok font="italic" role="UNKNOWN">z</XMTok>
                                    </XMApp>
                                    <XMApp>
                                      <XMTok fontsize="70%" name="colon" role="METARELOP">:</XMTok>
                                      <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">K</XMTok>
                                    </XMApp>
                                  </XMApp>
                                  <XMApp>
                                    <XMTok fontsize="70%" meaning="plus" role="ADDOP">+</XMTok>
                                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                    <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                  </XMApp>
                                </XMApp>
                                <XMTok fontsize="70%" meaning="2" role="NUMBER">2</XMTok>
                              </XMApp>
                            </XMApp>
                          </XMApp>
                          <XMTok role="PUNCT">,</XMTok>
                        </XMWrap>
                      </XMDual>
                    </XMath>
                  </Math></td>
              </MathBranch>
            </MathFork>
          </equation>
        </equationgroup>
        <p>Here, the Transformer employs the architecture featured in Aloe <cite class="ltx_citemacro_cite">[<bibref bibrefs="ding2021attention" separator="," yyseparator=","/>]</cite> (<Math mode="inline" tex="f_{\text{inf}}(\cdot)" text="f _ [inf] * cdot" xml:id="S4.SS2.p2.m1">
            <XMath>
              <XMApp>
                <XMTok meaning="times" role="MULOP">⁢</XMTok>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">f</XMTok>
                  <XMText><text fontsize="70%">inf</text></XMText>
                </XMApp>
                <XMDual>
                  <XMRef idref="S4.SS2.p2.m1.1"/>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="false">(</XMTok>
                    <XMTok name="cdot" role="MULOP" xml:id="S4.SS2.p2.m1.1">⋅</XMTok>
                    <XMTok role="CLOSE" stretchy="false">)</XMTok>
                  </XMWrap>
                </XMDual>
              </XMApp>
            </XMath>
          </Math>), while the dynamics prediction module aligns with PLATO <cite class="ltx_citemacro_cite">[<bibref bibrefs="piloto2022intuitive" separator="," yyseparator=","/>]</cite> (<Math mode="inline" tex="f_{\text{dyn}}(\cdot)" text="f _ [dyn] * cdot" xml:id="S4.SS2.p2.m2">
            <XMath>
              <XMApp>
                <XMTok meaning="times" role="MULOP">⁢</XMTok>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">f</XMTok>
                  <XMText><text fontsize="70%">dyn</text></XMText>
                </XMApp>
                <XMDual>
                  <XMRef idref="S4.SS2.p2.m2.1"/>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="false">(</XMTok>
                    <XMTok name="cdot" role="MULOP" xml:id="S4.SS2.p2.m2.1">⋅</XMTok>
                    <XMTok role="CLOSE" stretchy="false">)</XMTok>
                  </XMWrap>
                </XMDual>
              </XMApp>
            </XMath>
          </Math>). The second Transformer is trained independently using MAE.</p>
      </para>
    </subsection>
  </section>
  <section inlist="toc" xml:id="S5">
    <tags>
      <tag>5</tag>
      <tag role="autoref">section 5</tag>
      <tag role="refnum">5</tag>
      <tag role="typerefnum">§5</tag>
    </tags>
    <title><tag close=" ">5</tag>Experiments</title>
    <para xml:id="S5.p1">
      <p>In this section, we thoroughly evaluate the performance of <glossaryref inlist="acronym" key="method" show="short"/> using our <text font="typewriter">X-VoE</text> dataset across different experimental configurations: predicting future phenomena (predictive setup), interpreting existing phenomena (hypothetical setup), and understanding past occurrences given future conditions (explicative setup). We compare <glossaryref inlist="acronym" key="method" show="short"/> against PhyDNet <cite class="ltx_citemacro_cite">[<bibref bibrefs="guen2020disentangling" separator="," yyseparator=","/>]</cite>, a video prediction model, and PLATO <cite class="ltx_citemacro_cite">[<bibref bibrefs="piloto2022intuitive" separator="," yyseparator=","/>]</cite> on our <text font="typewriter">X-VoE</text> dataset. These models are evaluated under two different metrics.</p>
    </para>
    <subsection inlist="toc" xml:id="S5.SS1">
      <tags>
        <tag>5.1</tag>
        <tag role="autoref">subsection 5.1</tag>
        <tag role="refnum">5.1</tag>
        <tag role="typerefnum">§5.1</tag>
      </tags>
      <title><tag close=" ">5.1</tag>Defining accuracy and surprise</title>
      <para xml:id="S5.SS1.p1">
        <p>Before delving into different evaluative configurations, we first introduce how accuracy and surprise are formally defined.</p>
      </para>
      <para xml:id="S5.SS1.p2">
        <p>In developmental psychology experiments on <glossaryref inlist="acronym" key="voe" show="short"/>, surprise is defined by comparing infants’ responses to normal scenes with their responses to scenes that violate expectations. Following existing work <cite class="ltx_citemacro_cite">[<bibref bibrefs="smith2019modeling" separator="," yyseparator=","/>]</cite>, we borrow this idea and define model accuracy by comparing the scores of two videos, one that violates intuitive physics laws and another that does not:</p>
        <equation labels="LABEL:eqn:accuracy" xml:id="S5.E2">
          <tags>
            <tag>(2)</tag>
            <tag role="autoref">Equation 2</tag>
            <tag role="refnum">2</tag>
          </tags>
          <Math mode="display" tex="\text{Accuracy}=\frac{1}{N}\sum\mathbb{1}[s_{\text{nor}}&lt;s_{\text{sur}}]," xml:id="S5.E2.m1">
            <XMath>
              <XMText>Accuracy</XMText>
              <XMTok meaning="equals" role="RELOP">=</XMTok>
              <XMApp>
                <XMTok mathstyle="display" meaning="divide" role="FRACOP"/>
                <XMTok meaning="1" role="NUMBER">1</XMTok>
                <XMTok font="italic" role="UNKNOWN">N</XMTok>
              </XMApp>
              <XMTok mathstyle="display" meaning="sum" role="SUMOP" scriptpos="mid">∑</XMTok>
              <XMTok font="blackboard" meaning="1" role="NUMBER">1</XMTok>
              <XMWrap>
                <XMTok role="OPEN" stretchy="false">[</XMTok>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">s</XMTok>
                  <XMText><text fontsize="70%">nor</text></XMText>
                </XMApp>
                <XMTok meaning="less-than" role="RELOP">&lt;</XMTok>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">s</XMTok>
                  <XMText><text fontsize="70%">sur</text></XMText>
                </XMApp>
                <XMTok role="CLOSE" stretchy="false">]</XMTok>
              </XMWrap>
              <XMTok role="PUNCT">,</XMTok>
            </XMath>
          </Math>
        </equation>
        <p>where <Math mode="inline" tex="N" text="N" xml:id="S5.SS1.p2.m1">
            <XMath>
              <XMTok font="italic" role="UNKNOWN">N</XMTok>
            </XMath>
          </Math> denotes the total number of such pairs, and <Math mode="inline" tex="s_{\text{nor}}" text="s _ [nor]" xml:id="S5.SS1.p2.m2">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">s</XMTok>
                <XMText><text fontsize="70%">nor</text></XMText>
              </XMApp>
            </XMath>
          </Math> and <Math mode="inline" tex="s_{\text{sur}}" text="s _ [sur]" xml:id="S5.SS1.p2.m3">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">s</XMTok>
                <XMText><text fontsize="70%">sur</text></XMText>
              </XMApp>
            </XMath>
          </Math> are the scores of a physically normal video and of one that violates physics, respectively. Each score is computed as the sum of the differences between the results inferred from the observation and those predicted by the dynamics module, <text font="italic">i.e.</text>,</p>
        <equation labels="LABEL:eqn:total_score" xml:id="S5.E3">
          <tags>
            <tag>(3)</tag>
            <tag role="autoref">Equation 3</tag>
            <tag role="refnum">3</tag>
          </tags>
          <Math mode="display" tex="s=s_{\text{img}}+s_{\text{dyn}}," text="s = s _ [img] + s _ [dyn]" xml:id="S5.E3.m1">
            <XMath>
              <XMDual>
                <XMRef idref="S5.E3.m1.1"/>
                <XMWrap>
                  <XMApp xml:id="S5.E3.m1.1">
                    <XMTok meaning="equals" role="RELOP">=</XMTok>
<!--  %**** main.tex Line 175 **** -->                    <XMTok font="italic" role="UNKNOWN">s</XMTok>
                    <XMApp>
                      <XMTok meaning="plus" role="ADDOP">+</XMTok>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="italic" role="UNKNOWN">s</XMTok>
                        <XMText><text fontsize="70%">img</text></XMText>
                      </XMApp>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="italic" role="UNKNOWN">s</XMTok>
                        <XMText><text fontsize="70%">dyn</text></XMText>
                      </XMApp>
                    </XMApp>
                  </XMApp>
                  <XMTok role="PUNCT">,</XMTok>
                </XMWrap>
              </XMDual>
            </XMath>
          </Math>
        </equation>
        <p>where</p>
        <equation xml:id="S5.E4">
          <tags>
            <tag>(4)</tag>
            <tag role="autoref">Equation 4</tag>
            <tag role="refnum">4</tag>
          </tags>
          <Math mode="display" tex="s_{\text{img}}=\sum_{t=1}^{T}\ell(\text{I}_{t},\sum_{i}f_{\text{dec}}(\tilde{z%&#10;}^{t}_{i}))," text="s _ [img] = ((sum _ (t = 1)) ^ T)@(ell * open-interval@([I] _ t, (sum _ i)@(f _ [dec] * ((tilde@(z)) ^ t) _ i)))" xml:id="S5.E4.m1">
            <XMath>
              <XMDual>
                <XMRef idref="S5.E4.m1.1"/>
                <XMWrap>
                  <XMApp xml:id="S5.E4.m1.1">
                    <XMTok meaning="equals" role="RELOP">=</XMTok>
                    <XMApp>
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" role="UNKNOWN">s</XMTok>
                      <XMText><text fontsize="70%">img</text></XMText>
                    </XMApp>
                    <XMApp>
                      <XMApp scriptpos="mid">
                        <XMTok role="SUPERSCRIPTOP" scriptpos="mid1"/>
                        <XMApp scriptpos="mid">
                          <XMTok role="SUBSCRIPTOP" scriptpos="mid1"/>
                          <XMTok mathstyle="display" meaning="sum" role="SUMOP" scriptpos="mid">∑</XMTok>
                          <XMApp>
                            <XMTok fontsize="70%" meaning="equals" role="RELOP">=</XMTok>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                          </XMApp>
                        </XMApp>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">T</XMTok>
                      </XMApp>
                      <XMApp>
                        <XMTok meaning="times" role="MULOP">⁢</XMTok>
                        <XMTok name="ell" role="UNKNOWN">ℓ</XMTok>
                        <XMDual>
                          <XMApp>
                            <XMTok meaning="open-interval"/>
                            <XMRef idref="S5.E4.m1.1.1"/>
                            <XMRef idref="S5.E4.m1.1.2"/>
                          </XMApp>
                          <XMWrap>
                            <XMTok role="OPEN" stretchy="false">(</XMTok>
                            <XMApp xml:id="S5.E4.m1.1.1">
                              <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                              <XMText>I</XMText>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMTok role="PUNCT">,</XMTok>
                            <XMApp xml:id="S5.E4.m1.1.2">
                              <XMApp scriptpos="mid">
                                <XMTok role="SUBSCRIPTOP" scriptpos="mid1"/>
                                <XMTok mathstyle="display" meaning="sum" role="SUMOP" scriptpos="mid">∑</XMTok>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                              </XMApp>
                              <XMApp>
                                <XMTok meaning="times" role="MULOP">⁢</XMTok>
                                <XMApp>
                                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                  <XMTok font="italic" role="UNKNOWN">f</XMTok>
                                  <XMText><text fontsize="70%">dec</text></XMText>
                                </XMApp>
                                <XMDual>
                                  <XMRef idref="S5.E4.m1.1.2.1"/>
                                  <XMWrap>
                                    <XMTok role="OPEN" stretchy="false">(</XMTok>
                                    <XMApp xml:id="S5.E4.m1.1.2.1">
                                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                      <XMApp>
                                        <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                        <XMApp>
                                          <XMTok name="tilde" role="OVERACCENT" stretchy="false">~</XMTok>
                                          <XMTok font="italic" role="UNKNOWN">z</XMTok>
                                        </XMApp>
                                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                      </XMApp>
                                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                    </XMApp>
                                    <XMTok role="CLOSE" stretchy="false">)</XMTok>
                                  </XMWrap>
                                </XMDual>
                              </XMApp>
                            </XMApp>
                            <XMTok role="CLOSE" stretchy="false">)</XMTok>
                          </XMWrap>
                        </XMDual>
                      </XMApp>
                    </XMApp>
                  </XMApp>
                  <XMTok role="PUNCT">,</XMTok>
                </XMWrap>
              </XMDual>
            </XMath>
          </Math>
        </equation>
        <p>and</p>
        <equation xml:id="S5.E5">
          <tags>
            <tag>(5)</tag>
            <tag role="autoref">Equation 5</tag>
            <tag role="refnum">5</tag>
          </tags>
          <Math mode="display" tex="s_{\text{dyn}}=\sum_{t=2}^{T}\ell(\sum_{i}f_{\text{dec}}(\tilde{z}^{t}_{i}),f_%&#10;{\text{dec}}(f_{\text{dyn}}(\tilde{z}^{0:t-1}_{1:K})))." xml:id="S5.E5.m1">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">s</XMTok>
                <XMText><text fontsize="70%">dyn</text></XMText>
              </XMApp>
              <XMTok meaning="equals" role="RELOP">=</XMTok>
              <XMApp scriptpos="mid">
                <XMTok role="SUPERSCRIPTOP" scriptpos="mid1"/>
                <XMApp scriptpos="mid">
                  <XMTok role="SUBSCRIPTOP" scriptpos="mid1"/>
                  <XMTok mathstyle="display" meaning="sum" role="SUMOP" scriptpos="mid">∑</XMTok>
                  <XMApp>
                    <XMTok fontsize="70%" meaning="equals" role="RELOP">=</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                    <XMTok fontsize="70%" meaning="2" role="NUMBER">2</XMTok>
                  </XMApp>
                </XMApp>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">T</XMTok>
              </XMApp>
              <XMTok name="ell" role="UNKNOWN">ℓ</XMTok>
              <XMWrap>
                <XMTok role="OPEN" stretchy="false">(</XMTok>
                <XMApp scriptpos="mid">
                  <XMTok role="SUBSCRIPTOP" scriptpos="mid1"/>
                  <XMTok mathstyle="display" meaning="sum" role="SUMOP" scriptpos="mid">∑</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                </XMApp>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">f</XMTok>
                  <XMText><text fontsize="70%">dec</text></XMText>
                </XMApp>
                <XMWrap>
                  <XMTok role="OPEN" stretchy="false">(</XMTok>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMApp>
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMApp>
                        <XMTok name="tilde" role="OVERACCENT" stretchy="false">~</XMTok>
                        <XMTok font="italic" role="UNKNOWN">z</XMTok>
                      </XMApp>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                    </XMApp>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                  </XMApp>
                  <XMTok role="CLOSE" stretchy="false">)</XMTok>
                </XMWrap>
                <XMTok role="PUNCT">,</XMTok>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">f</XMTok>
                  <XMText><text fontsize="70%">dec</text></XMText>
                </XMApp>
                <XMWrap>
                  <XMTok role="OPEN" stretchy="false">(</XMTok>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="italic" role="UNKNOWN">f</XMTok>
                    <XMText><text fontsize="70%">dyn</text></XMText>
                  </XMApp>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="false">(</XMTok>
                    <XMApp>
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMApp>
                        <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                        <XMApp>
                          <XMTok name="tilde" role="OVERACCENT" stretchy="false">~</XMTok>
                          <XMTok font="italic" role="UNKNOWN">z</XMTok>
                        </XMApp>
                        <XMApp>
                          <XMTok fontsize="70%" name="colon" role="METARELOP">:</XMTok>
                          <XMTok fontsize="70%" meaning="0" role="NUMBER">0</XMTok>
                          <XMApp>
                            <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                          </XMApp>
                        </XMApp>
                      </XMApp>
                      <XMApp>
                        <XMTok fontsize="70%" name="colon" role="METARELOP">:</XMTok>
                        <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">K</XMTok>
                      </XMApp>
                    </XMApp>
                    <XMTok role="CLOSE" stretchy="false">)</XMTok>
                  </XMWrap>
                  <XMTok role="CLOSE" stretchy="false">)</XMTok>
                </XMWrap>
                <XMTok role="CLOSE" stretchy="false">)</XMTok>
              </XMWrap>
              <XMTok role="PERIOD">.</XMTok>
            </XMath>
          </Math>
        </equation>
        <p>Here, <Math mode="inline" tex="f_{\text{dec}}(\cdot)" text="f _ [dec] * cdot" xml:id="S5.SS1.p2.m4">
            <XMath>
              <XMApp>
                <XMTok meaning="times" role="MULOP">⁢</XMTok>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">f</XMTok>
                  <XMText><text fontsize="70%">dec</text></XMText>
                </XMApp>
                <XMDual>
                  <XMRef idref="S5.SS1.p2.m4.1"/>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="false">(</XMTok>
                    <XMTok name="cdot" role="MULOP" xml:id="S5.SS1.p2.m4.1">⋅</XMTok>
                    <XMTok role="CLOSE" stretchy="false">)</XMTok>
                  </XMWrap>
                </XMDual>
              </XMApp>
            </XMath>
          </Math> denotes the learned decoder in our VAE, and we use MSE loss for <Math mode="inline" tex="\ell(\cdot)" text="ell * cdot" xml:id="S5.SS1.p2.m5">
            <XMath>
              <XMApp>
                <XMTok meaning="times" role="MULOP">⁢</XMTok>
                <XMTok name="ell" role="UNKNOWN">ℓ</XMTok>
                <XMDual>
                  <XMRef idref="S5.SS1.p2.m5.1"/>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="false">(</XMTok>
                    <XMTok name="cdot" role="MULOP" xml:id="S5.SS1.p2.m5.1">⋅</XMTok>
                    <XMTok role="CLOSE" stretchy="false">)</XMTok>
                  </XMWrap>
                </XMDual>
              </XMApp>
            </XMath>
          </Math>.</p>
      </para>
      <figure inlist="lof" labels="LABEL:fig:comparison" placement="t!" xml:id="S5.F6">
        <tags>
          <tag><text fontsize="90%">Figure 6</text></tag>
          <tag role="autoref">Figure 6</tag>
          <tag role="refnum">6</tag>
          <tag role="typerefnum">Figure 6</tag>
        </tags>
        <graphics class="ltx_centering" graphic="comparison" options="width=433.62pt" xml:id="S5.F6.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">6</tag><text font="bold">Performance of different models on <text font="typewriter">X-VoE</text> under the comparative metric.</text> The red line denotes the ideal performance. The top part shows the absolute comparative values and the bottom part shows the difference from the ideal.</toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 6</text></tag><text font="bold" fontsize="90%">Performance of different models on <text font="typewriter">X-VoE</text> under the comparative metric.<text font="medium"> The red line denotes the ideal performance. The top part shows the absolute comparative values and the bottom part shows the difference from the ideal.</text></text></caption>
      </figure>
    </subsection>
    <subsection inlist="toc" xml:id="S5.SS2">
      <tags>
        <tag>5.2</tag>
        <tag role="autoref">subsection 5.2</tag>
        <tag role="refnum">5.2</tag>
        <tag role="typerefnum">§5.2</tag>
      </tags>
      <title><tag close=" ">5.2</tag>The holistic metric</title>
      <para xml:id="S5.SS2.p1">
        <p>Similar to Smith <text font="italic">et al.</text> <cite class="ltx_citemacro_cite">[<bibref bibrefs="smith2019modeling" separator="," yyseparator=","/>]</cite>, we adopt the holistic metric to evaluate <glossaryref inlist="acronym" key="voe" show="short"/> effects in all pairs of unexpected and normal event videos. Ideally, an intuitive physics model should produce higher surprise scores for unexpected events. Formally, the holistic metric is defined as</p>
        <equation labels="LABEL:eqn:holistic_metric" xml:id="S5.E6">
          <tags>
            <tag>(6)</tag>
            <tag role="autoref">Equation 6</tag>
            <tag role="refnum">6</tag>
          </tags>
          <Math mode="display" tex="\frac{1}{n_{s}n_{c}}\sum_{i,j}\mathbf{1}[s(x_{i}^{+})&gt;s(x_{j}^{-})]," xml:id="S5.E6.m1">
            <XMath>
              <XMApp>
                <XMTok mathstyle="display" meaning="divide" role="FRACOP"/>
                <XMTok meaning="1" role="NUMBER">1</XMTok>
                <XMApp>
                  <XMTok meaning="times" role="MULOP">⁢</XMTok>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                    <XMTok font="italic" role="UNKNOWN">n</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">s</XMTok>
                  </XMApp>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                    <XMTok font="italic" role="UNKNOWN">n</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">c</XMTok>
                  </XMApp>
                </XMApp>
              </XMApp>
              <XMApp scriptpos="mid">
                <XMTok role="SUBSCRIPTOP" scriptpos="mid1"/>
                <XMTok mathstyle="display" meaning="sum" role="SUMOP" scriptpos="mid">∑</XMTok>
                <XMDual>
                  <XMApp>
                    <XMTok meaning="list"/>
                    <XMRef idref="S5.E6.m1.1"/>
                    <XMRef idref="S5.E6.m1.2"/>
                  </XMApp>
                  <XMWrap>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN" xml:id="S5.E6.m1.1">i</XMTok>
                    <XMTok fontsize="70%" role="PUNCT">,</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN" xml:id="S5.E6.m1.2">j</XMTok>
                  </XMWrap>
                </XMDual>
              </XMApp>
              <XMTok font="bold" meaning="1" role="NUMBER">1</XMTok>
              <XMWrap>
                <XMTok role="OPEN" stretchy="false">[</XMTok>
                <XMTok font="italic" role="UNKNOWN">s</XMTok>
                <XMWrap>
                  <XMTok role="OPEN" stretchy="false">(</XMTok>
                  <XMApp>
                    <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                    <XMApp>
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" role="UNKNOWN">x</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                    </XMApp>
                    <XMTok fontsize="70%" meaning="plus" role="ADDOP">+</XMTok>
                  </XMApp>
                  <XMTok role="CLOSE" stretchy="false">)</XMTok>
                </XMWrap>
                <XMTok meaning="greater-than" role="RELOP">&gt;</XMTok>
                <XMTok font="italic" role="UNKNOWN">s</XMTok>
                <XMWrap>
                  <XMTok role="OPEN" stretchy="false">(</XMTok>
                  <XMApp>
                    <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                    <XMApp>
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" role="UNKNOWN">x</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">j</XMTok>
                    </XMApp>
                    <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                  </XMApp>
                  <XMTok role="CLOSE" stretchy="false">)</XMTok>
                </XMWrap>
                <XMTok role="CLOSE" stretchy="false">]</XMTok>
              </XMWrap>
              <XMTok role="PUNCT">,</XMTok>
<!--  %**** main.tex Line 200 **** 
 %**** main.tex Line 200 **** -->            </XMath>
          </Math>
        </equation>
        <p>where <Math mode="inline" tex="x_{i}^{+}" text="(x _ i) ^ +" xml:id="S5.SS2.p1.m1">
            <XMath>
              <XMApp>
                <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">x</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                </XMApp>
                <XMTok fontsize="70%" meaning="plus" role="ADDOP">+</XMTok>
              </XMApp>
            </XMath>
          </Math> and <Math mode="inline" tex="x_{j}^{-}" text="(x _ j) ^ -" xml:id="S5.SS2.p1.m2">
            <XMath>
              <XMApp>
                <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">x</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">j</XMTok>
                </XMApp>
                <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
              </XMApp>
            </XMath>
          </Math> denote the unexpected and normal videos and <Math mode="inline" tex="n_{s}" text="n _ s" xml:id="S5.SS2.p1.m3">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">n</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">s</XMTok>
              </XMApp>
            </XMath>
          </Math> and <Math mode="inline" tex="n_{c}" text="n _ c" xml:id="S5.SS2.p1.m4">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">n</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">c</XMTok>
              </XMApp>
            </XMath>
          </Math> are the numbers of unexpected and normal videos, respectively. This metric aggregates results from all confounding factors, including interference from colors, shapes, scene complexity, <text font="italic">etc</text>. Therefore, it provides a holistic view of models’ understanding of intuitive physics events; models need to judge the unexpectedness of outcomes from the intuitive physics perspective, disentangling all other confounding factors.</p>
      </para>
      <para xml:id="S5.SS2.p2">
        <p>As shown in <ref labelref="LABEL:fig:result" show="creftype~refnum"/> (a), we measure the holistic value on different models on <text font="typewriter">X-VoE</text>. Both <glossaryref inlist="acronym" key="method" show="short"/> and PLATO show better performance in all four testing scenarios, though with a notable gap from perfection. <glossaryref inlist="acronym" key="method" show="short"/> is significantly better than PLATO in the collision, blocking, and permanence scenarios, but less so in continuity. We also compare different dynamics modules, with or without residual, in <glossaryref inlist="acronym" key="method" show="short"/>. The results show that the residual connection in the dynamics module plays a critical role in our system, as evidenced by results for collision and blocking. An in-depth analysis from Principal Component Analysis (PCA) in <ref labelref="LABEL:fig:result" show="creftype~refnum"/> (b) shows that after adding the residual connection, the standard deviation in different principal components is particularly reduced, making learning easier.</p>
      </para>
      <para xml:id="S5.SS2.p3">
        <p>To investigate the contribution of each of the two surprise components in <ref labelref="LABEL:eqn:total_score" show="creftype~refnum"/>, we compute the holistic metric from each of them separately. As shown in <ref labelref="LABEL:fig:result" show="creftype~refnum"/> (c), the performance of <Math mode="inline" tex="s_{\text{dyn}}" text="s _ [dyn]" xml:id="S5.SS2.p3.m1">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">s</XMTok>
                <XMText><text fontsize="70%">dyn</text></XMText>
              </XMApp>
            </XMath>
          </Math> is superior to that of <Math mode="inline" tex="s_{\text{img}}" text="s _ [img]" xml:id="S5.SS2.p3.m2">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">s</XMTok>
                <XMText><text fontsize="70%">img</text></XMText>
              </XMApp>
            </XMath>
          </Math> in the collision and blocking scenarios, whereas the performance of <Math mode="inline" tex="s_{\text{img}}" text="s _ [img]" xml:id="S5.SS2.p3.m3">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">s</XMTok>
                <XMText><text fontsize="70%">img</text></XMText>
              </XMApp>
            </XMath>
          </Math> is better in permanence and continuity. This result implies that the violation of physical knowledge plays a more important role in collision and blocking. In contrast, the mismatch from the observation is a more crucial factor for permanence and continuity. Thus, the residuals in <glossaryref inlist="acronym" key="method" show="short"/>, explicitly taking earlier information into computation, could exert a greater influence on the dynamic module and its impact in the collision and blocking scenarios as shown in <ref labelref="LABEL:fig:result" show="creftype~refnum"/> (a).</p>
      </para>
      <para xml:id="S5.SS2.p4">
        <p>The holistic metric only provides a global view of how a model understands intuitive physics. To paint a more complete landscape of a model, we look deeper into the comparative metric in the next section.</p>
      </para>
    </subsection>
    <subsection inlist="toc" xml:id="S5.SS3">
      <tags>
        <tag>5.3</tag>
        <tag role="autoref">subsection 5.3</tag>
        <tag role="refnum">5.3</tag>
        <tag role="typerefnum">§5.3</tag>
      </tags>
      <title><tag close=" ">5.3</tag>The comparative metric</title>
      <para xml:id="S5.SS3.p1">
        <p>The comparative metric, similar to ones proposed in literature <cite class="ltx_citemacro_cite">[<bibref bibrefs="riochet2020intphys,weihs2022benchmarking" separator="," yyseparator=","/>]</cite>, is calculated over pairs of unexpected and normal events within one specific setting in each scenario,</p>
        <equation xml:id="S5.E7">
          <tags>
            <tag>(7)</tag>
            <tag role="autoref">Equation 7</tag>
            <tag role="refnum">7</tag>
          </tags>
          <Math mode="display" tex="\frac{1}{n}\sum_{i}\mathbf{1}[s(x_{i}^{+})&gt;s(x_{i}^{-})]," xml:id="S5.E7.m1">
            <XMath>
              <XMApp>
                <XMTok mathstyle="display" meaning="divide" role="FRACOP"/>
                <XMTok meaning="1" role="NUMBER">1</XMTok>
                <XMTok font="italic" role="UNKNOWN">n</XMTok>
              </XMApp>
              <XMApp scriptpos="mid">
                <XMTok role="SUBSCRIPTOP" scriptpos="mid1"/>
                <XMTok mathstyle="display" meaning="sum" role="SUMOP" scriptpos="mid">∑</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
              </XMApp>
              <XMTok font="bold" meaning="1" role="NUMBER">1</XMTok>
              <XMWrap>
                <XMTok role="OPEN" stretchy="false">[</XMTok>
                <XMTok font="italic" role="UNKNOWN">s</XMTok>
                <XMWrap>
                  <XMTok role="OPEN" stretchy="false">(</XMTok>
                  <XMApp>
                    <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                    <XMApp>
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" role="UNKNOWN">x</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                    </XMApp>
                    <XMTok fontsize="70%" meaning="plus" role="ADDOP">+</XMTok>
                  </XMApp>
                  <XMTok role="CLOSE" stretchy="false">)</XMTok>
                </XMWrap>
                <XMTok meaning="greater-than" role="RELOP">&gt;</XMTok>
                <XMTok font="italic" role="UNKNOWN">s</XMTok>
                <XMWrap>
                  <XMTok role="OPEN" stretchy="false">(</XMTok>
                  <XMApp>
                    <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                    <XMApp>
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" role="UNKNOWN">x</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                    </XMApp>
                    <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                  </XMApp>
                  <XMTok role="CLOSE" stretchy="false">)</XMTok>
                </XMWrap>
                <XMTok role="CLOSE" stretchy="false">]</XMTok>
              </XMWrap>
              <XMTok role="PUNCT">,</XMTok>
            </XMath>
          </Math>
        </equation>
        <p>where <Math mode="inline" tex="x_{i}^{+}" text="(x _ i) ^ +" xml:id="S5.SS3.p1.m1">
            <XMath>
              <XMApp>
                <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">x</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                </XMApp>
                <XMTok fontsize="70%" meaning="plus" role="ADDOP">+</XMTok>
              </XMApp>
            </XMath>
          </Math> and <Math mode="inline" tex="x_{i}^{-}" text="(x _ i) ^ -" xml:id="S5.SS3.p1.m2">
            <XMath>
              <XMApp>
                <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">x</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                </XMApp>
                <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
              </XMApp>
            </XMath>
          </Math> are the two paired videos in each setting and <Math mode="inline" tex="n" text="n" xml:id="S5.SS3.p1.m3">
            <XMath>
              <XMTok font="italic" role="UNKNOWN">n</XMTok>
            </XMath>
          </Math> is the number of such pairs. The comparative metric is also the one most commonly used in evaluating infants’ intuitive physics knowledge in developmental psychology <cite class="ltx_citemacro_cite">[<bibref bibrefs="baillargeon1985object,lin2020infants" separator="," yyseparator=","/>]</cite>.</p>
      </para>
      <para xml:id="S5.SS3.p2">
        <p>Whereas the holistic metric describes whether an observation sequence is absolutely surprising from a holistic perspective, the comparative metric assesses whether one observation sequence is more surprising than another from a comparative perspective. Although the holistic metric provides an overall perspective, it lacks the detailed results of the three specific cases the comparative metric provides; see <ref labelref="LABEL:fig:explain" show="creftype~refnum"/>. In each scenario in <text font="typewriter">X-VoE</text>, the two videos in the hypothetical setting are likely to occur, while only one of the two videos in the predictive and explicative settings is likely to occur. Therefore, the comparative metric in the hypothetical setting should be ideally 50%, while the metric in the predictive and explicative settings should be ideally 100%.</p>
      </para>
      <para xml:id="S5.SS3.p3">
        <p><ref labelref="LABEL:fig:comparison" show="creftype~refnum"/> shows the comparative values of different models. The results in the predictive setting indicate that current AI systems, even as simple as general video prediction, can easily predict future outcomes accurately for such a simple task. However, when it comes to the setting that requires reasoning and explanation (<text font="italic">i.e.</text>, explicative), only <glossaryref inlist="acronym" key="method" show="short"/> can consistently achieve over 50%. Whereas common predictive models can only predict future occurrences based on past conditions, <glossaryref inlist="acronym" key="method" show="short"/> can reason about the past conditions that lead to the observation, a critical ability necessary for successfully solving the explicative setting.</p>
      </para>
      <figure inlist="lof" labels="LABEL:fig::train_viz" placement="t!" xml:id="S5.F7">
        <tags>
          <tag><text fontsize="90%">Figure 7</text></tag>
          <tag role="autoref">Figure 7</tag>
          <tag role="refnum">7</tag>
          <tag role="typerefnum">Figure 7</tag>
        </tags>
        <graphics class="ltx_centering" graphic="train_viz" options="width=433.62pt" xml:id="S5.F7.g1"/>
<!--  %**** main.tex Line 225 **** -->        <toccaption class="ltx_centering"><tag close=" ">7</tag><text font="bold">Training: Visualization of the internal representation in PLATO and <glossaryref inlist="acronym" key="method" show="short"/> during training.</text></toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 7</text></tag><text font="bold" fontsize="90%">Training: Visualization of the internal representation in PLATO and <glossaryref inlist="acronym" key="method" show="short"/> during training.</text></caption>
      </figure>
      <para xml:id="S5.SS3.p4">
        <p>Of these, the hypothetical setting is where we notice the most performance volatility. For the hypothetical setting, both a random-answering human subject and an ideal human subject with perfect understanding would reach 50% accuracy. However, this is exactly why this problem is intriguing for psychologists. From this perspective, a model achieving 50% could mean it is either the worst or best. While in the hypothetical setup, PhyDNet achieves nearly 50%, it can only reach random-level performance in the explicative setting, showing that the model does not understand different possibilities behind the wall.
This is why the explicative setting is so important. The explicative setting provides more new information in the video follow-up than the hypothetical setting. As shown in <ref labelref="LABEL:fig:explain" show="creftype~refnum"/>, the new information can turn a scene that is possible in the hypothetical setting into an impossible one. The metric gap between the hypothetical and explicative settings shows the power of explanatory abilities. <glossaryref inlist="acronym" key="method" show="short"/> demonstrates this property on both the collision and blocking scenarios, especially on the collision scenario, where this gap reaches close to 90%.</p>
      </para>
      <para xml:id="S5.SS3.p5">
        <p>Although the <glossaryref inlist="acronym" key="method" show="short"/> variants with and without the residual module both have the reasoning module, they still have different explanatory abilities in the hypothetical and explicative settings. In the collision and blocking tasks, the presence of residuals improves performance in the explicative but not the hypothetical setting. The residual module enhances the connection between two consecutive frames, allowing the reasoning module to better infer the previous state based on the subsequent state. The main difference between the hypothetical and explicative settings is the inclusion of follow-up information. In the explicative setting, the presence of follow-up information enhances the performance of the reasoning module (with residual module) due to more subsequent state information. However, in the hypothetical setting, the absence of follow-up information negatively impacts the module’s performance.</p>
      </para>
      <para xml:id="S5.SS3.p6">
        <p>Overall, <glossaryref inlist="acronym" key="method" show="short"/> improves over previous state-of-the-art but still fares worse on collision and continuity. While developmental psychology experiments have found the ability in infants <cite class="ltx_citemacro_cite">[<bibref bibrefs="aguiar2002developments" separator="," yyseparator=","/>]</cite>, it remains a challenge for AI systems.</p>
      </para>
    </subsection>
    <subsection inlist="toc" xml:id="S5.SS4">
      <tags>
        <tag>5.4</tag>
        <tag role="autoref">subsection 5.4</tag>
        <tag role="refnum">5.4</tag>
        <tag role="typerefnum">§5.4</tag>
      </tags>
      <title><tag close=" ">5.4</tag>Visualization results</title>
      <figure inlist="lof" labels="LABEL:fig:test_viz" placement="t!" xml:id="S5.F8">
        <tags>
          <tag><text fontsize="90%">Figure 8</text></tag>
          <tag role="autoref">Figure 8</tag>
          <tag role="refnum">8</tag>
          <tag role="typerefnum">Figure 8</tag>
        </tags>
        <graphics class="ltx_centering" graphic="test_viz" options="width=433.62pt" xml:id="S5.F8.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">8</tag><text font="bold">Testing: Visualization of the inferred internal representation in <glossaryref inlist="acronym" key="method" show="short"/> during testing.</text> This example corresponds to the settings in <ref labelref="LABEL:fig:explain" show="creftypecap~refnum"/>.</toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 8</text></tag><text font="bold" fontsize="90%">Testing: Visualization of the inferred internal representation in <glossaryref inlist="acronym" key="method" show="short"/> during testing.<text font="medium"> This example corresponds to the settings in <ref labelref="LABEL:fig:explain" show="creftypecap~refnum"/>.</text></text></caption>
      </figure>
      <para xml:id="S5.SS4.p1">
        <p>The challenge of visual occlusion persists in computer vision. Unless the ground-truth value is given directly, it is difficult to characterize occluded objects by vision alone, especially in the case of complete occlusion. However, humans can deduce occluded objects and corresponding physical phenomena intuitively, even under complete occlusion. We investigate whether <glossaryref inlist="acronym" key="method" show="short"/> can reason about occluded objects through visualization.</p>
      </para>
      <para xml:id="S5.SS4.p2">
        <p>We visualize occluded objects within the learned representation. Specifically, we mask the token associated with the wall and decode the resulting features to assess the model’s ability to reconstruct hidden objects. Training visualization results are presented in <ref labelref="LABEL:fig::train_viz" show="creftype~refnum"/>. Notably, PLATO lacks a dedicated reasoning module for occluded objects, resulting in an inability to recover occluded factors. Conversely, <glossaryref inlist="acronym" key="method" show="short"/> gradually learns to infer the presence of occluded objects behind the wall to explain observations. Crucially, we never provide ground-truth occluded object representations during training, emphasizing the importance of synchronized training of the inference and dynamic modules. This approach allows <glossaryref inlist="acronym" key="method" show="short"/> to achieve improved occluded object restoration, though it still falls short of ground-truth results (<ref labelref="LABEL:fig::train_viz" show="creftype~refnum"/>).</p>
      </para>
      <para xml:id="S5.SS4.p3">
        <p>For test visualization, detailed results corresponding to <ref labelref="LABEL:fig:explain" show="creftype~refnum"/> are showcased in <ref labelref="LABEL:fig:test_viz" show="creftype~refnum"/>. The predictive setting demonstrates <glossaryref inlist="acronym" key="method" show="short"/>’s accurate reconstruction of observed objects. In the hypothetical setting, <glossaryref inlist="acronym" key="method" show="short"/> provides coherent explanations involving hidden object interactions. In the explicative setting, the occluder is lifted toward the end of the videos, resulting in surprising outcomes.
<!--  %**** main.tex Line 250 **** --></p>
      </para>
      <para xml:id="S5.SS4.p4">
        <p>To conclude, <glossaryref inlist="acronym" key="method" show="short"/> proficiently reconstructs occluded objects and provides visual explanations for various events, underscoring its capacity to reason about hidden factors in the context of intuitive physics.</p>
      </para>
    </subsection>
  </section>
  <section inlist="toc" xml:id="S6">
    <tags>
      <tag>6</tag>
      <tag role="autoref">section 6</tag>
      <tag role="refnum">6</tag>
      <tag role="typerefnum">§6</tag>
    </tags>
    <title><tag close=" ">6</tag>Conclusion and discussion</title>
    <para xml:id="S6.p1">
      <p>In this paper, we introduced <text font="typewriter">X-VoE</text>, a novel explanation-based  <glossaryref inlist="acronym" key="voe" show="long"/> (<glossaryref inlist="acronym" key="voe" show="short"/>) dataset consisting of four distinct scenarios, each encompassing three unique settings: predictive, hypothetical, and explicative. While the predictive setting aligns with conventional <glossaryref inlist="acronym" key="voe" show="short"/> tasks, the other two settings focus on evaluating a model’s explanatory capacity. Our proposed <glossaryref inlist="acronym" key="method" show="short"/> combines reasoning and explanation processes to address occluded objects, offering enhanced performance within the <text font="typewriter">X-VoE</text> settings. Our experiments revealed that <glossaryref inlist="acronym" key="method" show="short"/> excels in scenarios requiring explicit explanations for occluded objects, positioning it ahead of other methodologies. Notably, the decoded representation from <glossaryref inlist="acronym" key="method" show="short"/> offers visual explanations for occluded events, highlighting its ability to reason about hidden factors.</p>
    </para>
    <para xml:id="S6.p2">
      <p>Our work underscores the pivotal role of explanations in <glossaryref inlist="acronym" key="voe" show="short"/> tasks, particularly concerning occluded objects and their contribution to video comprehension. Even when objects are obscured by walls, the possibility of underlying physical events remains, and a model equipped with explanation capabilities performs more adeptly in such situations. The capacity to reason about occluded objects extends the model’s scope beyond mere video prediction, enabling it to capture intuitive physics principles more effectively.</p>
    </para>
    <para xml:id="S6.p3">
      <p>However, certain challenges persist. Notably, <glossaryref inlist="acronym" key="method" show="short"/> encounters difficulties in scenarios that demand high-level explanations, such as the hypothetical setting in collision or continuity (<ref labelref="LABEL:fig:comparison" show="creftype~refnum"/>). These limitations underscore the need for further advancements in the reasoning aspect of our model, paving the way for future research. The ability to handle complex interactions and provide meaningful explanations remains a challenging aspect that requires careful consideration in model design.</p>
    </para>
    <para xml:id="S6.p4">
      <p>In conclusion, while our model’s reasoning capabilities are still a work in progress, our study sheds light on the integration of explanations into <glossaryref inlist="acronym" key="voe" show="short"/> tasks, aiming to develop models with a level of intuitive physics comprehension akin to infants. The focus on occluded objects and their explanatory potential broadens the scope of <glossaryref inlist="acronym" key="voe" show="short"/> tasks and encourages the development of AI systems with deeper understanding.</p>
    </para>
    <subsection inlist="toc" xml:id="S6.SS1">
      <tags>
        <tag>6.1</tag>
        <tag role="autoref">subsection 6.1</tag>
        <tag role="refnum">6.1</tag>
        <tag role="typerefnum">§6.1</tag>
      </tags>
      <title><tag close=" ">6.1</tag>Limitations</title>
      <paragraph inlist="toc" xml:id="S6.SS1.SSS0.Px1">
        <title>Method</title>
        <para xml:id="S6.SS1.SSS0.Px1.p1">
          <p>Despite its strengths, <glossaryref inlist="acronym" key="method" show="short"/> faces certain limitations. It struggles in some experiments, particularly the hypothetical setting in collision or continuity (<ref labelref="LABEL:fig:comparison" show="creftype~refnum"/>), where its performance falls short of human-like comprehension. Furthermore, our explanation process employs a basic Transformer module, lacking physics-related inductive biases that could enhance performance. A promising direction for future research lies in incorporating domain-specific inductive biases that exploit physical principles to improve reasoning and explanatory capabilities.</p>
        </para>
      </paragraph>
      <paragraph inlist="toc" xml:id="S6.SS1.SSS0.Px2">
        <title>Accuracy metric</title>
        <para xml:id="S6.SS1.SSS0.Px2.p1">
          <p>Although our accuracy metrics draw inspiration from developmental psychology experiments and prior works, they rely on video comparisons to evaluate violations of intuitive physics. This approach, while effective, assumes that one of the videos violates intuitive physics laws, even if the difference in surprise values is marginal. As a result, the method might struggle to achieve the desired metrics in scenarios like the hypothetical setting. Exploring metrics that focus on higher-level concepts and the detection of fundamental violations could yield insights into the underlying mechanisms that drive these evaluations.</p>
        </para>
      </paragraph>
      <paragraph inlist="toc" xml:id="S6.SS1.SSS0.Px3">
        <title>Dataset</title>
        <para xml:id="S6.SS1.SSS0.Px3.p1">
          <p><text font="typewriter">X-VoE</text> pioneers the evaluation of physical explanatory abilities in <glossaryref inlist="acronym" key="voe" show="short"/> tasks. However, our test scenarios could be more diverse and comprehensive. Future efforts will expand and diversify these scenarios to create a more robust framework for testing intuitive physics understanding in <glossaryref inlist="acronym" key="voe" show="short"/>. By incorporating a wider range of physical phenomena and interactions, future datasets can challenge AI systems with greater complexity.</p>
        </para>
      </paragraph>
    </subsection>
    <subsection inlist="toc" xml:id="S6.SS2">
      <tags>
        <tag>6.2</tag>
        <tag role="autoref">subsection 6.2</tag>
        <tag role="refnum">6.2</tag>
        <tag role="typerefnum">§6.2</tag>
      </tags>
      <title><tag close=" ">6.2</tag>Future Directions</title>
      <para xml:id="S6.SS2.p1">
        <p>Future research should focus on refining <glossaryref inlist="acronym" key="method" show="short"/>’s reasoning capabilities, enhancing its performance in scenarios demanding higher-order explanations. Introducing more sophisticated physics-based inductive biases could contribute to better occluded object reasoning. Additionally, exploring hybrid approaches that combine neural networks with symbolic reasoning could lead to more advanced models with enhanced explanatory capabilities.</p>
      </para>
<!--  %**** main.tex Line 275 **** -->      <para xml:id="S6.SS2.p2">
        <p>Additionally, <text font="typewriter">X-VoE</text> can serve as a stepping stone for designing more intricate and varied <glossaryref inlist="acronym" key="voe" show="short"/> scenarios. Incorporating more complex physical interactions, occlusions, and multiple objects would lead to a richer and more challenging testbed for evaluating AI systems’ intuitive physics comprehension. Diverse scenarios can provide comprehensive evaluation of models’ understanding across a wide range of intuitive physics principles.</p>
      </para>
      <para xml:id="S6.SS2.p3">
        <p>In summary, our study provides insights into the integration of explanations in <glossaryref inlist="acronym" key="voe" show="short"/> tasks and sets the stage for future advancements in both model design and dataset development. The intersection of explanations and intuitive physics comprehension holds promise for creating AI systems that not only predict events but also understand the underlying physical principles that govern them.</p>
      </para>
    </subsection>
  </section>
  <section xml:id="Sx1">
    <title>Acknowledgment</title>
    <para xml:id="Sx1.p1">
      <p>The authors would like to thank four anonymous reviewers for constructive feedback, Huiyin Li (BIGAI) for designing the figures, and NVIDIA for their generous support of GPUs and hardware. This work is supported in part by the National Key R&amp;D Program of China (2022ZD0114900) and the Beijing Nova Program.</p>
    </para>
  </section>
  <bibliography bibstyle="config/ieee_fullname" citestyle="numbers" files="reference_header,reference" xml:id="bib">
    <title fontsize="90%">References</title>
  </bibliography>
  <pagination role="newpage"/>
<!--  %**** main.tex Line 300 **** -->  <appendix inlist="toc" xml:id="A1">
    <tags>
      <tag>Appendix A</tag>
      <tag role="autoref">Appendix A</tag>
      <tag role="refnum">A</tag>
      <tag role="typerefnum">Appendix A</tag>
    </tags>
    <title><tag close=" ">Appendix A</tag>Dataset</title>
    <toctitle><tag close=" ">A</tag>Dataset</toctitle>
    <subsection inlist="toc" xml:id="A1.SS1">
      <tags>
        <tag>A.1</tag>
        <tag role="autoref">subsection A.1</tag>
        <tag role="refnum">A.1</tag>
        <tag role="typerefnum">§A.1</tag>
      </tags>
      <title><tag close=" ">A.1</tag>Test data</title>
      <para xml:id="A1.SS1.p1">
        <p>For the VoE task, we divided the four scenarios into 11 groups, each with two comparison cases. The setups in the testing data are very similar to the ones in the training data except for the behavior of the wall. All scenarios except Permanence contain predictive, hypothetical, and explicative settings. The predictive and explicative settings contain both plausible and implausible events, while the hypothetical setting contains two plausible events. In the predictive setting, the wall is moved away at the beginning and end of the video, so all information is shown at the beginning and end of the video. In the hypothetical setting, the wall always stays in the middle of the scene. In the explicative setting, the wall is moved away only at the end of the video, so new information is shown to the model at the end of the video.</p>
      </para>
      <paragraph inlist="toc" xml:id="A1.SS1.SSS0.Px1">
        <title>Collision</title>
        <para xml:id="A1.SS1.SSS0.Px1.p1">
          <p>The Collision scenario is shown in <ref labelref="LABEL:fig_supp:test_collision" show="creftype~refnum"/>. Collision contains predictive, hypothetical, and explicative settings. In the predictive setting, the wall is moved away at the beginning and end of the video, so two balls are visible to the model. We can easily tell from intuitive physics that the case in the first row is possible while the case in the second row is not, because the red ball cannot pass through the blue ball without collision. In the hypothetical setting, the wall always stays in the middle of the scene, so we can not tell how many balls there are in the scene. As we can not infer if a blue ball is hidden behind the wall at the beginning of the video, both cases in the setting are possible. In the explicative setting, the wall is moved away at the end of the video, so additional information is given. We can infer that a blue ball must be hidden behind the wall, so the case in the first row is possible, while the case in the second row is not.</p>
        </para>
      </paragraph>
      <paragraph inlist="toc" xml:id="A1.SS1.SSS0.Px2">
        <title>Blocking</title>
        <para xml:id="A1.SS1.SSS0.Px2.p1">
          <p>The Blocking scenario is shown in <ref labelref="LABEL:fig_supp:test_block" show="creftype~refnum"/>. The Blocking scenarios are similar to the Collision scenarios, except that the ball hidden behind the wall is replaced by a fixed cube. In the predictive setting, the wall is moved away at the beginning and end of the video, so the cube is visible to the model. Similar to Collision, we can easily tell that the case in the first row is possible while the case in the second row is not, because the blue ball can not pass through the green cube without collision. In the hypothetical setting, the wall always stays in the middle of the scene, so we can not tell if there is a cube behind the wall. Therefore, both cases in the setting are possible. In the explicative setting, the wall is moved away at the end of the video, so we can infer that a cube must be hidden behind the wall. Furthermore, we can tell that the case in the first row is possible while the case in the second row is not.</p>
        </para>
      </paragraph>
      <paragraph inlist="toc" xml:id="A1.SS1.SSS0.Px3">
        <title>Permanence</title>
        <para xml:id="A1.SS1.SSS0.Px3.p1">
          <p>The Permanence scenario is shown in <ref labelref="LABEL:fig_supp:test_permanence" show="creftype~refnum"/>. In the Permanence scenarios, three cubes are randomly divided into two groups (allowing empty groups), where cubes in the first group are dropped to the ground and those in the second group rest on the floor. We do not have an explicative setting for this scenario, as there is no new evidence at the end of the video. In the predictive setting, the wall is moved away at the beginning of the video, so we can infer that there is no object on the ground at the beginning. So the case in the second row is impossible, while the case in the first row is possible. In the hypothetical setting, the wall stays in the middle of the scene at the beginning, so we cannot tell if there are cubes on the ground at the beginning, so both cases are possible.</p>
        </para>
      </paragraph>
      <paragraph inlist="toc" xml:id="A1.SS1.SSS0.Px4">
        <title>Continuity</title>
        <para xml:id="A1.SS1.SSS0.Px4.p1">
          <p>The Continuity scenario is shown in <ref labelref="LABEL:fig_supp:test_continuity" show="creftype~refnum"/>. In the Continuity scenarios, we create a window on the lower half of the wall. In the case with the wall, the ball rolls across the scene. When the ball passes through the wall, it can be seen going from one side to the other. In the predictive setting, the wall is moved away at the beginning of the video, so we can infer that only one ball is in the scene. We can tell that the case in the second row is impossible while the case in the first row is possible. In the hypothetical setting, the wall always stays in the middle of the scene, and we can easily infer that the case in the first row is possible. Considering the case in the second row, we cannot tell if there are two balls with the same appearance in the scene, one of which is visible at the beginning and the other one is hidden by the right part of the wall. If that is true, the case in the second row is also possible. So both cases are possible. In the explicative setting, the wall is moved away at the end of the video, so we can infer that there is only one ball in the scene. Thus we can tell that the case in the first row is possible while the case in the second row is not.</p>
        </para>
      </paragraph>
    </subsection>
    <subsection inlist="toc" labels="LABEL:sec_sup:train_data" xml:id="A1.SS2">
      <tags>
        <tag>A.2</tag>
        <tag role="autoref">subsection A.2</tag>
        <tag role="refnum">A.2</tag>
        <tag role="typerefnum">§A.2</tag>
      </tags>
      <title><tag close=" ">A.2</tag>Train data</title>
<!--  %**** main.tex Line 325 **** -->      <para xml:id="A1.SS2.p1">
        <p>For four scenarios, we created 5 groups for training. Each of Permanence and Continuity contains 1 group, while Collision and Blocking in total contain 3 groups. Each group contains 2 kinds of cases: cases with a wall and ones without a wall. In the case with a wall, a movable wall stands in the middle of the scene and will be moved away at the beginning and the end of the video. In the case without the wall, everything stays the same except that the wall does not exist, showing that the wall won’t interact with other objects physically. Each row in <ref labelref="LABEL:fig:train" show="creftype~refnum"/> corresponds to one sampled video in a specific case. See <ref labelref="LABEL:fig:train" show="creftype~refnum"/> for all training groups.</p>
      </para>
      <paragraph inlist="toc" xml:id="A1.SS2.SSS0.Px1">
        <title>Control group</title>
        <para xml:id="A1.SS2.SSS0.Px1.p1">
          <p>In the control group, a ball rolls across the scene without interacting with other objects, indicating that the environment follows basic physics.</p>
        </para>
      </paragraph>
      <paragraph inlist="toc" xml:id="A1.SS2.SSS0.Px2">
        <title>Collision group</title>
        <para xml:id="A1.SS2.SSS0.Px2.p1">
          <p>A ball rolls across the scene in the Collision scenario with the wall. Another ball with the same mass but a different color is hidden behind the wall and will collide with the incoming ball, causing the first ball to stop and itself to pass through. In a setting without a wall, the second ball will always be visible.</p>
        </para>
      </paragraph>
      <paragraph inlist="toc" xml:id="A1.SS2.SSS0.Px3">
        <title>Blocking group</title>
        <para xml:id="A1.SS2.SSS0.Px3.p1">
          <p>The Blocking scenarios are similar to the Collision scenario, except that the ball hidden behind the wall is replaced by a fixed cube. A ball rolls across the scene in the blocking setting with the wall. A fixed cube is hidden behind the wall and will collide with the incoming ball, causing the incoming ball to turn around. In the setting without a wall, everything stays the same except that the wall doesn’t exist, and the cube will always be visible.</p>
        </para>
      </paragraph>
      <paragraph inlist="toc" xml:id="A1.SS2.SSS0.Px4">
        <title>Permanence group</title>
        <para xml:id="A1.SS2.SSS0.Px4.p1">
          <p>In the Permanence scenario, three cubes are randomly divided into two groups (allowing empty groups), where cubes in the first group are dropped to the ground and those in the second group rest on the floor. In the setting with the wall, the wall will be moved away at the end of the video, showing that all of the cubes still exist. In the setting without the wall, the cubes will always be visible.</p>
        </para>
      </paragraph>
      <paragraph inlist="toc" xml:id="A1.SS2.SSS0.Px5">
        <title>Continuity group</title>
        <para xml:id="A1.SS2.SSS0.Px5.p1">
          <p>In the Continuity scenario, we create a window on the lower half of the wall. In the setting with the wall, the ball rolls across the scene. When the ball passes through the wall, it can be seen going from one side to the other, especially visible from the window. In the setting without the wall, the ball will always be visible.</p>
        </para>
      </paragraph>
    </subsection>
    <subsection inlist="toc" xml:id="A1.SS3">
      <tags>
        <tag>A.3</tag>
        <tag role="autoref">subsection A.3</tag>
        <tag role="refnum">A.3</tag>
        <tag role="typerefnum">§A.3</tag>
      </tags>
      <title><tag close=" ">A.3</tag>Environment</title>
      <para xml:id="A1.SS3.p1">
        <p>Our <text font="typewriter">X-VoE</text> dataset comprises 22K+100K procedurally generated scenes using Unreal Engine 4. In addition to the floors and the backgrounds, there are four different object types: balls, cubes, walls, and windowed walls. In all videos, the sizes of the ball and the cube are the same, while the sizes of the walls, with or without windows, vary randomly. The positions of objects are randomly set in the videos, except for the walls in the Permanence scenes, in which the wall is placed in the middle. All objects, including the floor and the background, are randomly set in different colors.
<!--  %**** main.tex Line 350 **** --></p>
      </para>
      <table inlist="lot" labels="LABEL:tab_supp:decoder" placement="ht!" xml:id="A1.T1">
        <tags>
          <tag><text fontsize="90%">Table A1</text></tag>
          <tag role="autoref"><text fontsize="90%">Table A1</text></tag>
          <tag role="refnum"><text fontsize="90%">A1</text></tag>
          <tag role="typerefnum"><text fontsize="90%">Table A1</text></tag>
        </tags>
        <toccaption class="ltx_centering"><tag close=" "><text fontsize="90%">A1</text></tag><text fontsize="90%">Spatial broadcast decoder architecture (from top to bottom).</text></toccaption>
        <caption class="ltx_centering" fontsize="90%"><tag close=": ">Table A1</tag>Spatial broadcast decoder architecture (from top to bottom).</caption>
        <tabular class="ltx_centering" vattach="middle">
          <tr>
            <td align="left" border="tt"><text fontsize="90%">Type</text></td>
            <td align="left" border="tt"><text fontsize="90%">Size</text></td>
            <td align="left" border="tt"><text fontsize="90%">Activation</text></td>
            <td align="left" border="tt"><text fontsize="90%">Comment</text></td>
          </tr>
          <tr>
            <td align="left" border="t"><text fontsize="90%">Spatial Broadcast</text></td>
            <td align="left" border="t"><text fontsize="90%">8 × 8</text></td>
            <td align="left" border="t"><text fontsize="90%">-</text></td>
            <td align="left" border="t"><text fontsize="90%">-</text></td>
          </tr>
          <tr>
            <td align="left"><text fontsize="90%">Position Embedding</text></td>
            <td align="left"><text fontsize="90%">-</text></td>
            <td align="left"><text fontsize="90%">-</text></td>
            <td align="left"><text fontsize="90%">-</text></td>
          </tr>
          <tr>
            <td align="left"><text fontsize="90%">Conv 5 × 5</text></td>
            <td align="left"><text fontsize="90%">64</text></td>
            <td align="left"><text fontsize="90%">ReLU</text></td>
            <td align="left"><text fontsize="90%">stride: 2</text></td>
          </tr>
          <tr>
            <td align="left"><text fontsize="90%">Conv 5 × 5</text></td>
            <td align="left"><text fontsize="90%">64</text></td>
            <td align="left"><text fontsize="90%">ReLU</text></td>
            <td align="left"><text fontsize="90%">stride: 2</text></td>
          </tr>
          <tr>
            <td align="left"><text fontsize="90%">Conv 5 × 5</text></td>
            <td align="left"><text fontsize="90%">64</text></td>
            <td align="left"><text fontsize="90%">ReLU</text></td>
            <td align="left"><text fontsize="90%">stride: 2</text></td>
          </tr>
          <tr>
            <td align="left"><text fontsize="90%">Conv 5 × 5</text></td>
            <td align="left"><text fontsize="90%">64</text></td>
            <td align="left"><text fontsize="90%">ReLU</text></td>
            <td align="left"><text fontsize="90%">stride: 2</text></td>
          </tr>
          <tr>
            <td align="left"><text fontsize="90%">Conv 5 × 5</text></td>
            <td align="left"><text fontsize="90%">64</text></td>
            <td align="left"><text fontsize="90%">ReLU</text></td>
            <td align="left"><text fontsize="90%">stride: 1</text></td>
          </tr>
          <tr>
            <td align="left"><text fontsize="90%">Conv 3 × 3</text></td>
            <td align="left"><text fontsize="90%">4</text></td>
            <td align="left"><text fontsize="90%">-</text></td>
            <td align="left"><text fontsize="90%">stride: 1</text></td>
          </tr>
          <tr>
            <td align="left" border="bb"><text fontsize="90%">Channels</text></td>
            <td align="left" border="bb"><text fontsize="90%">RGBD(4)</text></td>
            <td align="left" border="bb"><text fontsize="90%">Softmax (on depth channel)</text></td>
            <td align="left" border="bb"><text fontsize="90%">softmax(depth × abs(</text><Math mode="inline" tex="\theta" text="theta" xml:id="A1.T1.m1">
                <XMath>
                  <XMTok font="italic" fontsize="90%" name="theta" role="UNKNOWN">θ</XMTok>
                </XMath>
              </Math><text fontsize="90%">) × -1000.0)</text></td>
          </tr>
        </tabular>
      </table>
      <table inlist="lot" labels="LABEL:tab_supp:fast_reasoning" placement="ht!" xml:id="A1.T2">
        <tags>
          <tag><text fontsize="90%">Table A2</text></tag>
          <tag role="autoref"><text fontsize="90%">Table A2</text></tag>
          <tag role="refnum"><text fontsize="90%">A2</text></tag>
          <tag role="typerefnum"><text fontsize="90%">Table A2</text></tag>
        </tags>
<!--  %**** main.tex Line 375 **** -->        <toccaption class="ltx_centering"><tag close=" "><text fontsize="90%">A2</text></tag><text fontsize="90%">The Transformer architecture (from top to bottom). The [M] is a learnable mask token for Transformer.</text></toccaption>
        <caption class="ltx_centering" fontsize="90%"><tag close=": ">Table A2</tag>The Transformer architecture (from top to bottom). The [M] is a learnable mask token for Transformer.</caption>
        <tabular class="ltx_centering" vattach="middle">
          <tr>
            <td align="left" border="tt"><text fontsize="90%">Type</text></td>
            <td align="left" border="tt"><text fontsize="90%">Size</text></td>
            <td align="left" border="tt"><text fontsize="90%">Activation</text></td>
            <td align="left" border="tt"><text fontsize="90%">Comment</text></td>
          </tr>
          <tr>
            <td align="left" border="t"><text fontsize="90%">LP (1)</text></td>
            <td align="left" border="t"><text fontsize="90%">256</text></td>
            <td align="left" border="t"><text fontsize="90%">-</text></td>
            <td align="left" border="t"><text fontsize="90%">-</text></td>
          </tr>
          <tr>
            <td align="left"><text fontsize="90%">Mask (1)</text></td>
            <td align="left"><text fontsize="90%">-</text></td>
            <td align="left"><text fontsize="90%">× mask + [M] × (1-mask)</text></td>
            <td align="left"><text fontsize="90%">mask : (size F × N × 1), (value 0 or 1)</text></td>
          </tr>
          <tr>
            <td align="left"><text fontsize="90%">Position Embedding</text></td>
            <td align="left"><text fontsize="90%">-</text></td>
            <td align="left"><text fontsize="90%">-</text></td>
            <td align="left"><text fontsize="90%">-</text></td>
          </tr>
          <tr>
            <td align="left"><text fontsize="90%">Transformer</text></td>
            <td align="left"><text fontsize="90%">256, 256 (MLP)</text></td>
            <td align="left"><text fontsize="90%">ReLU (MLP)</text></td>
            <td align="left"><text fontsize="90%">head=8,key=32,layers=6</text></td>
          </tr>
          <tr>
            <td align="left"><text fontsize="90%">LP (2)</text></td>
            <td align="left"><text fontsize="90%">256</text></td>
            <td align="left"><text fontsize="90%">-</text></td>
            <td align="left"><text fontsize="90%">-</text></td>
          </tr>
          <tr>
            <td align="left" border="bb"><text fontsize="90%">Mask (2)</text></td>
            <td align="left" border="bb"><text fontsize="90%">-</text></td>
            <td align="left" border="bb"><text fontsize="90%">× (1-mask) + inputs × mask</text></td>
            <td align="left" border="bb"><text fontsize="90%">mask : (size F × N × 1), (value 0 or 1)</text></td>
          </tr>
        </tabular>
      </table>
    </subsection>
  </appendix>
  <appendix inlist="toc" xml:id="A2">
    <tags>
      <tag>Appendix B</tag>
      <tag role="autoref">Appendix B</tag>
      <tag role="refnum">B</tag>
      <tag role="typerefnum">Appendix B</tag>
    </tags>
    <title><tag close=" ">Appendix B</tag>Model</title>
    <toctitle><tag close=" ">B</tag>Model</toctitle>
    <subsection inlist="toc" xml:id="A2.SS1">
      <tags>
        <tag>B.1</tag>
        <tag role="autoref">subsection B.1</tag>
        <tag role="refnum">B.1</tag>
        <tag role="typerefnum">§B.1</tag>
      </tags>
      <title><tag close=" ">B.1</tag>Perception</title>
      <para xml:id="A2.SS1.p1">
        <p>The perception module in <glossaryref inlist="acronym" key="method" show="short"/> is similar to that of Component Variational Autoencoder (ComponentVAE) in the PLATO model <cite class="ltx_citemacro_cite">[<bibref bibrefs="piloto2022intuitive" separator="," yyseparator=","/>]</cite>. For each object <Math mode="inline" tex="k" text="k" xml:id="A2.SS1.p1.m1">
            <XMath>
              <XMTok font="italic" role="UNKNOWN">k</XMTok>
            </XMath>
          </Math> in an image, we take as input a 128 × 128 RGBD (0-255 for each channel) image <Math mode="inline" tex="x_{k}" text="x _ k" xml:id="A2.SS1.p1.m2">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">x</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
              </XMApp>
            </XMath>
          </Math> that is masked except around the object. Then we use a Vision Transformer <cite class="ltx_citemacro_cite">[<bibref bibrefs="dosovitskiy2020image" separator="," yyseparator=","/>]</cite> encoder <Math mode="inline" tex="\Phi" text="Phi" xml:id="A2.SS1.p1.m3">
            <XMath>
              <XMTok name="Phi" role="UNKNOWN">Φ</XMTok>
            </XMath>
          </Math> to encode the image with only one object into a 32-dimensional Gaussian posterior distribution <Math mode="inline" tex="q_{\Phi}(z_{k}|x_{k})" text="q _ Phi * conditional@(z _ k, x _ k)" xml:id="A2.SS1.p1.m4">
            <XMath>
              <XMApp>
                <XMTok meaning="times" role="MULOP">⁢</XMTok>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">q</XMTok>
                  <XMTok fontsize="70%" name="Phi" role="UNKNOWN">Φ</XMTok>
                </XMApp>
                <XMDual>
                  <XMRef idref="A2.SS1.p1.m4.1"/>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="false">(</XMTok>
                    <XMApp xml:id="A2.SS1.p1.m4.1">
                      <XMTok meaning="conditional" role="MODIFIEROP" stretchy="false">|</XMTok>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="italic" role="UNKNOWN">z</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                      </XMApp>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="italic" role="UNKNOWN">x</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                      </XMApp>
                    </XMApp>
                    <XMTok role="CLOSE" stretchy="false">)</XMTok>
                  </XMWrap>
                </XMDual>
              </XMApp>
            </XMath>
          </Math>. The sample from this distribution, <Math mode="inline" tex="z_{k}" text="z _ k" xml:id="A2.SS1.p1.m5">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">z</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
              </XMApp>
            </XMath>
          </Math>, is decoded by a spatial broadcast decoder <cite class="ltx_citemacro_cite">[<bibref bibrefs="watters2019spatial" separator="," yyseparator=","/>]</cite> to an RGBD image. To address occlusion, we use the depth of the decoder image to combine all objects in the image by multiplying them with softmaxed depth values. We first pretrained the perception module by optimizing the variational objective defined in <cite class="ltx_citemacro_cite">[<bibref bibrefs="burgess2019monet" separator="," yyseparator=","/>]</cite>. We set <Math mode="inline" tex="\sigma" text="sigma" xml:id="A2.SS1.p1.m6">
            <XMath>
              <XMTok font="italic" name="sigma" role="UNKNOWN">σ</XMTok>
            </XMath>
          </Math> to 0.05, <Math mode="inline" tex="\beta" text="beta" xml:id="A2.SS1.p1.m7">
            <XMath>
              <XMTok font="italic" name="beta" role="UNKNOWN">β</XMTok>
            </XMath>
          </Math> to 0.5, and <Math mode="inline" tex="\gamma" text="gamma" xml:id="A2.SS1.p1.m8">
            <XMath>
              <XMTok font="italic" name="gamma" role="UNKNOWN">γ</XMTok>
            </XMath>
          </Math> to 0 to ensure that the model reconstructs object masks without segmentation information in the loss function.</p>
      </para>
      <paragraph inlist="toc" xml:id="A2.SS1.SSS0.Px1">
        <title>ViT encoder</title>
<!--  %**** main.tex Line 400 **** -->        <para xml:id="A2.SS1.SSS0.Px1.p1">
          <p>We first reshape the 128 × 128 × 4 images into a sequence of flattened 16 × 16 × 256 patches, followed by a linear layer with 256 dimensions. Next, we add 2D position embeddings and learnable embeddings, flatten, and send them to a Transformer. We use 8 attention heads, 32 key dimensions, 1024 MLP layer dimensions, and 6 Transformer layers for the Transformer model <cite class="ltx_citemacro_cite">[<bibref bibrefs="vaswani2017attention" separator="," yyseparator=","/>]</cite>. Finally, we apply an MLP with layer sizes [512, 64] and a leaky-ReLU activation function to the Transformer output and obtain 32-dimensional Gaussian posterior distributions for each object.</p>
        </para>
      </paragraph>
      <paragraph inlist="toc" xml:id="A2.SS1.SSS0.Px2">
        <title>Spatial broadcast decoder</title>
        <para xml:id="A2.SS1.SSS0.Px2.p1">
          <p>Our spatial broadcast decoder is similar to that in <cite class="ltx_citemacro_cite">[<bibref bibrefs="locatello2020object" separator="," yyseparator=","/>]</cite>. As shown in <ref labelref="LABEL:tab_supp:decoder" show="creftype~refnum"/>, we use position embeddings and a CNN model to decode the object embeddings, where the parameter <Math mode="inline" tex="\theta" text="theta" xml:id="A2.SS1.SSS0.Px2.p1.m1">
              <XMath>
                <XMTok font="italic" name="theta" role="UNKNOWN">θ</XMTok>
              </XMath>
            </Math> in the softmax layer is learnable, thus representing the mask in terms of depth.</p>
        </para>
      </paragraph>
    </subsection>
    <subsection inlist="toc" xml:id="A2.SS2">
      <tags>
        <tag>B.2</tag>
        <tag role="autoref">subsection B.2</tag>
        <tag role="refnum">B.2</tag>
        <tag role="typerefnum">§B.2</tag>
      </tags>
      <title><tag close=" ">B.2</tag>Reasoning</title>
      <para xml:id="A2.SS2.p1">
        <p>In the reasoning module, we use two Transformer modules to reason about the hidden object, which is occluded in some or all of the frames. All objects in a video can be reshaped as F × N × D embeddings, where F is 15 frames, N is 8 objects, and D is 32 dimensions in our work. As shown in <ref labelref="LABEL:tab_supp:fast_reasoning" show="creftype~refnum"/>, we use a Transformer model to reason about the masked objects in the video, similar to the self-supervised learning module in Aloe <cite class="ltx_citemacro_cite">[<bibref bibrefs="ding2021attention" separator="," yyseparator=","/>]</cite>; the parameter [M] in the Mask (1) part is learnable.</p>
      </para>
      <paragraph inlist="toc" xml:id="A2.SS2.SSS0.Px1">
        <title>First Transformer</title>
        <para xml:id="A2.SS2.SSS0.Px1.p1">
          <p>We set the mask to 0 for objects that are temporarily occluded in some frames, and 1 for others. As shown in <ref labelref="LABEL:tab_supp:fast_reasoning" show="creftype~refnum"/>, we can use the Transformer model to infer new embeddings for the objects whose mask equals 0. We use it in both the training and testing steps to obtain better object embeddings for the whole video.</p>
        </para>
      </paragraph>
      <paragraph inlist="toc" xml:id="A2.SS2.SSS0.Px2">
        <title>Second Transformer</title>
        <para xml:id="A2.SS2.SSS0.Px2.p1">
          <p>In our test dataset, there may be cases where an object is obscured in all frames. So in the training step, we set the mask to 0 for one random object (including an empty object) in all frames. Then we can train the second Transformer model in a self-supervised manner. In the test step, we set the mask to 0 for one object that is not visible in any frame. Then we can reason about the occluded object to explain the whole video.</p>
        </para>
      </paragraph>
    </subsection>
    <subsection inlist="toc" xml:id="A2.SS3">
      <tags>
        <tag>B.3</tag>
        <tag role="autoref">subsection B.3</tag>
        <tag role="refnum">B.3</tag>
        <tag role="typerefnum">§B.3</tag>
      </tags>
      <title><tag close=" ">B.3</tag>Dynamics</title>
      <para xml:id="A2.SS3.p1">
        <p>In fact, the occluded objects are never directly seen by the Transformer model. After the first reasoning module, we obtain reasonable video object embeddings based on experience. In the dynamics module, we predict the value of the incremental change of the object embeddings in the time step by using the same dynamics module from PLATO <cite class="ltx_citemacro_cite">[<bibref bibrefs="piloto2022intuitive" separator="," yyseparator=","/>]</cite> with the only difference in object dimension used (from 16 to 32). We refer the readers to <cite class="ltx_citemacro_cite">[<bibref bibrefs="piloto2022intuitive" separator="," yyseparator=","/>]</cite> for architectural details.</p>
      </para>
      <table inlist="lot" labels="LABEL:tab_supp:trainpara" placement="ht!" xml:id="A2.T3">
        <tags>
          <tag><text fontsize="90%">Table A3</text></tag>
          <tag role="autoref">Table A3</tag>
          <tag role="refnum">A3</tag>
          <tag role="typerefnum">Table A3</tag>
        </tags>
        <toccaption class="ltx_centering"><tag close=" ">A3</tag>Training parameters. The pre-processed video features are calculated by the Perception module, which is pre-trained.</toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Table A3</text></tag><text fontsize="90%">Training parameters. The pre-processed video features are calculated by the Perception module, which is pre-trained.</text></caption>
        <block align="center" depth="0.0pt" width="433.6pt">
          <tabular vattach="middle">
            <tr>
              <td align="left" border="tt">Model</td>
              <td align="left" border="tt">batch size</td>
              <td align="left" border="tt">training step</td>
              <td align="left" border="tt">optimizer</td>
              <td align="left" border="tt">learning rate</td>
              <td align="left" border="tt">warm step</td>
              <td align="left" border="tt">delay step</td>
            </tr>
            <tr>
              <td align="left" border="t">Perception module (in <glossaryref inlist="acronym" key="method" show="short"/>, PLATO)</td>
              <td align="left" border="t">300 (images)</td>
              <td align="left" border="t">472000</td>
              <td align="left" border="t">Adam</td>
              <td align="left" border="t">0.0004</td>
              <td align="left" border="t">2000</td>
              <td align="left" border="t">100000</td>
            </tr>
            <tr>
              <td align="left"><glossaryref inlist="acronym" key="method" show="short"/></td>
              <td align="left">500 (pre-processed video features)</td>
              <td align="left">32000</td>
              <td align="left">Adam</td>
              <td align="left">0.0004</td>
              <td align="left">1000</td>
              <td align="left">10000</td>
            </tr>
            <tr>
              <td align="left">PLATO</td>
              <td align="left">500 (pre-processed video features)</td>
              <td align="left">32000</td>
              <td align="left">Adam</td>
              <td align="left">0.0004</td>
              <td align="left">1000</td>
              <td align="left">10000</td>
            </tr>
            <tr>
              <td align="left" border="bb">PhyDNet</td>
              <td align="left" border="bb">100 (videos)</td>
              <td align="left" border="bb">70000</td>
              <td align="left" border="bb">Adam</td>
              <td align="left" border="bb">0.001</td>
              <td align="left" border="bb">-</td>
              <td align="left" border="bb">-</td>
            </tr>
          </tabular>
        </block>
      </table>
    </subsection>
  </appendix>
  <appendix inlist="toc" xml:id="A3">
    <tags>
      <tag>Appendix C</tag>
      <tag role="autoref">Appendix C</tag>
      <tag role="refnum">C</tag>
      <tag role="typerefnum">Appendix C</tag>
    </tags>
    <title><tag close=" ">Appendix C</tag>Training</title>
    <toctitle><tag close=" ">C</tag>Training</toctitle>
    <subsection inlist="toc" xml:id="A3.SS1">
      <tags>
        <tag>C.1</tag>
        <tag role="autoref">subsection C.1</tag>
        <tag role="refnum">C.1</tag>
        <tag role="typerefnum">§C.1</tag>
      </tags>
      <title><tag close=" ">C.1</tag>Training detail</title>
      <para xml:id="A3.SS1.p1">
        <p>In a scene with occlusion, we cannot get the representation of the occluded object directly by observation. Therefore, we first use the dynamics loss on the object embeddings after the first Transformer to train our first Transformer and dynamics model. Then, we use the object embeddings after the first Transformer to train our second Transformer model. We randomly mask an object throughout the video frames and use the model to predict representations of the objects throughout the video, enabling the model to infer whether there is a fully hidden object in the test task.</p>
      </para>
    </subsection>
    <subsection inlist="toc" xml:id="A3.SS2">
      <tags>
        <tag>C.2</tag>
        <tag role="autoref">subsection C.2</tag>
        <tag role="refnum">C.2</tag>
        <tag role="typerefnum">§C.2</tag>
      </tags>
      <title><tag close=" ">C.2</tag>Training parameters</title>
      <para xml:id="A3.SS2.p1">
        <p>We first pre-train the perception module and use it for both PLATO and <glossaryref inlist="acronym" key="method" show="short"/>. Then we train our model, <glossaryref inlist="acronym" key="method" show="short"/>, as well as PLATO and PhyDNet, with the parameters shown in <ref labelref="LABEL:tab_supp:trainpara" show="creftype~refnum"/>.</p>
      </para>
<!--  %**** main.tex Line 450 **** -->    </subsection>
    <subsection inlist="toc" xml:id="A3.SS3">
      <tags>
        <tag>C.3</tag>
        <tag role="autoref">subsection C.3</tag>
        <tag role="refnum">C.3</tag>
        <tag role="typerefnum">§C.3</tag>
      </tags>
      <title><tag close=" ">C.3</tag>Training steps</title>
      <para xml:id="A3.SS3.p1">
        <p>During the development of the model, we explored how the size of the training dataset impacted the pixel loss of the dynamics module. We use the expected videos in the predictive setting of all scenarios as the test dataset to calculate the average pixel loss. <ref labelref="LABEL:fig_supp:loss_wrt_size" show="creftype~refnum"/> shows that more training data improves the performance of the dynamics module.</p>
      </para>
    </subsection>
  </appendix>
  <appendix inlist="toc" xml:id="A4">
    <tags>
      <tag>Appendix D</tag>
      <tag role="autoref">Appendix D</tag>
      <tag role="refnum">D</tag>
      <tag role="typerefnum">Appendix D</tag>
    </tags>
    <title><tag close=" ">Appendix D</tag>Supplementary visualizations</title>
    <toctitle><tag close=" ">D</tag>Supplementary visualizations</toctitle>
    <para xml:id="A4.p1">
      <p>In the main text, we visualize the reasoning results of our <glossaryref inlist="acronym" key="method" show="short"/> model in the Blocking scenario. Here, we visualize the reasoning results for the rest of the scenarios.</p>
    </para>
    <subsection inlist="toc" xml:id="A4.SS1">
      <tags>
        <tag>D.1</tag>
        <tag role="autoref">subsection D.1</tag>
        <tag role="refnum">D.1</tag>
        <tag role="typerefnum">§D.1</tag>
      </tags>
      <title><tag close=" ">D.1</tag>Collision</title>
      <para xml:id="A4.SS1.p1">
        <p>As shown in <ref labelref="LABEL:fig_supp:collision_viz" show="creftype~refnum"/>, in the predictive setting, <glossaryref inlist="acronym" key="method" show="short"/> has no problem accurately reconstructing the objects, and the surprise video can be found directly. In the hypothetical setting, the possible explanation for the first video is that another ball collides with the incoming ball. In contrast, no such ball is present in the second video, so both cases can be explained. This result also shows a limitation of our <glossaryref inlist="acronym" key="method" show="short"/>, as the incoming ball does not stop behind the wall. In the explicative setting, the occluder is only moved away at the end of the videos. Unlike in the hypothetical setting, when a hidden ball is shown behind the occluder, it is impossible for the incoming ball to pass through, causing surprise.</p>
      </para>
    </subsection>
    <subsection inlist="toc" xml:id="A4.SS2">
      <tags>
        <tag>D.2</tag>
        <tag role="autoref">subsection D.2</tag>
        <tag role="refnum">D.2</tag>
        <tag role="typerefnum">§D.2</tag>
      </tags>
      <title><tag close=" ">D.2</tag>Permanence</title>
      <para xml:id="A4.SS2.p1">
        <p>As shown in <ref labelref="LABEL:fig_supp:permanence_viz" show="creftype~refnum"/>, in the predictive setting, <glossaryref inlist="acronym" key="method" show="short"/> can reconstruct the objects behind the wall, and the surprise video can be found by comparing it with the original image. The visual quality of the reconstructed objects is not very good, which is still a limitation of our <glossaryref inlist="acronym" key="method" show="short"/>. In the hypothetical setting, the possible explanation for the second video is that there exists another object behind the wall, and our <glossaryref inlist="acronym" key="method" show="short"/> can reason about the object.</p>
      </para>
    </subsection>
    <subsection inlist="toc" xml:id="A4.SS3">
      <tags>
        <tag>D.3</tag>
        <tag role="autoref">subsection D.3</tag>
        <tag role="refnum">D.3</tag>
        <tag role="typerefnum">§D.3</tag>
      </tags>
      <title><tag close=" ">D.3</tag>Continuity</title>
      <para xml:id="A4.SS3.p1">
        <p>As shown in <ref labelref="LABEL:fig_supp:continuity_viz" show="creftype~refnum"/>, the visualization results of our <glossaryref inlist="acronym" key="method" show="short"/> are the same in all settings. Even though the visualization results can show surprise in predictive and explicative settings by comparing with the original videos, our <glossaryref inlist="acronym" key="method" show="short"/> still cannot deal with the hypothetical setting due to the limitation discussed in the main text. Our <glossaryref inlist="acronym" key="method" show="short"/> requires given masks and identification of objects. Therefore, it cannot reason about the hypothetical setting in continuity by changing the identification of objects and suggesting that there are two identical objects, as infants do <cite class="ltx_citemacro_cite">[<bibref bibrefs="aguiar2002developments" separator="," yyseparator=","/>]</cite>.</p>
      </para>
      <figure inlist="lof" labels="LABEL:fig_supp:test_collision" placement="t!" xml:id="A4.F1">
        <tags>
          <tag><text fontsize="90%">Figure A1</text></tag>
          <tag role="autoref">Figure A1</tag>
          <tag role="refnum">A1</tag>
          <tag role="typerefnum">Figure A1</tag>
        </tags>
        <graphics class="ltx_centering" graphic="test_collision" options="width=433.62pt" xml:id="A4.F1.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">A1</tag>Collision test groups.</toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure A1</text></tag><text fontsize="90%">Collision test groups.</text></caption>
<!--  %**** main.tex Line 475 **** -->      </figure>
      <figure inlist="lof" labels="LABEL:fig_supp:test_block" placement="t!" xml:id="A4.F2">
        <tags>
          <tag><text fontsize="90%">Figure A2</text></tag>
          <tag role="autoref">Figure A2</tag>
          <tag role="refnum">A2</tag>
          <tag role="typerefnum">Figure A2</tag>
        </tags>
        <graphics class="ltx_centering" graphic="test_block" options="width=433.62pt" xml:id="A4.F2.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">A2</tag>Blocking test groups.</toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure A2</text></tag><text fontsize="90%">Blocking test groups.</text></caption>
      </figure>
      <figure inlist="lof" labels="LABEL:fig_supp:test_permanence" placement="t!" xml:id="A4.F3">
        <tags>
          <tag><text fontsize="90%">Figure A3</text></tag>
          <tag role="autoref">Figure A3</tag>
          <tag role="refnum">A3</tag>
          <tag role="typerefnum">Figure A3</tag>
        </tags>
        <graphics class="ltx_centering" graphic="test_permanence" options="width=433.62pt" xml:id="A4.F3.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">A3</tag>Permanence test groups.</toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure A3</text></tag><text fontsize="90%">Permanence test groups.</text></caption>
      </figure>
      <figure inlist="lof" labels="LABEL:fig_supp:test_continuity" placement="t!" xml:id="A4.F4">
        <tags>
          <tag><text fontsize="90%">Figure A4</text></tag>
          <tag role="autoref">Figure A4</tag>
          <tag role="refnum">A4</tag>
          <tag role="typerefnum">Figure A4</tag>
        </tags>
        <graphics class="ltx_centering" graphic="test_continuity" options="width=433.62pt" xml:id="A4.F4.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">A4</tag>Continuity test groups.</toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure A4</text></tag><text fontsize="90%">Continuity test groups.</text></caption>
      </figure>
      <figure inlist="lof" labels="LABEL:fig_supp:loss_wrt_size" placement="t!" xml:id="A4.F5">
        <tags>
          <tag><text fontsize="90%">Figure A5</text></tag>
          <tag role="autoref">Figure A5</tag>
          <tag role="refnum">A5</tag>
          <tag role="typerefnum">Figure A5</tag>
        </tags>
<!--  %**** main.tex Line 500 **** -->        <graphics class="ltx_centering" graphic="loss_wrt_size" options="width=346.896pt" xml:id="A4.F5.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">A5</tag>Average pixel loss of test data for different sizes of training data.</toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure A5</text></tag><text fontsize="90%">Average pixel loss of test data for different sizes of training data.</text></caption>
      </figure>
      <figure inlist="lof" labels="LABEL:fig_supp:collision_viz" placement="t!" xml:id="A4.F6">
        <tags>
          <tag><text fontsize="90%">Figure A6</text></tag>
          <tag role="autoref">Figure A6</tag>
          <tag role="refnum">A6</tag>
          <tag role="typerefnum">Figure A6</tag>
        </tags>
        <graphics class="ltx_centering" graphic="collision_viz" options="width=346.896pt" xml:id="A4.F6.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">A6</tag>Visualization of the inferred internal representation in <glossaryref inlist="acronym" key="method" show="short"/> during testing in collision scenarios.</toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure A6</text></tag><text fontsize="90%">Visualization of the inferred internal representation in <glossaryref inlist="acronym" key="method" show="short"/> during testing in collision scenarios.</text></caption>
      </figure>
      <figure inlist="lof" labels="LABEL:fig_supp:permanence_viz" placement="t!" xml:id="A4.F7">
        <tags>
          <tag><text fontsize="90%">Figure A7</text></tag>
          <tag role="autoref">Figure A7</tag>
          <tag role="refnum">A7</tag>
          <tag role="typerefnum">Figure A7</tag>
        </tags>
        <graphics class="ltx_centering" graphic="permanence_viz" options="width=346.896pt" xml:id="A4.F7.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">A7</tag>Visualization of the inferred internal representation in <glossaryref inlist="acronym" key="method" show="short"/> during testing in permanence scenarios.</toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure A7</text></tag><text fontsize="90%">Visualization of the inferred internal representation in <glossaryref inlist="acronym" key="method" show="short"/> during testing in permanence scenarios.</text></caption>
      </figure>
      <figure inlist="lof" labels="LABEL:fig_supp:continuity_viz" placement="t!" xml:id="A4.F8">
        <tags>
          <tag><text fontsize="90%">Figure A8</text></tag>
          <tag role="autoref">Figure A8</tag>
          <tag role="refnum">A8</tag>
          <tag role="typerefnum">Figure A8</tag>
        </tags>
        <graphics class="ltx_centering" graphic="continuity_viz" options="width=346.896pt" xml:id="A4.F8.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">A8</tag>Visualization of the inferred internal representation in <glossaryref inlist="acronym" key="method" show="short"/> during testing in continuity scenarios.</toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure A8</text></tag><text fontsize="90%">Visualization of the inferred internal representation in <glossaryref inlist="acronym" key="method" show="short"/> during testing in continuity scenarios.</text></caption>
      </figure>
<!--  %**** main.tex Line 525 **** -->    </subsection>
  </appendix>
</document>
