<?xml version="1.0" encoding="UTF-8"?>
<?latexml searchpaths="/home/japhy/scienceReplication.artiswrong.com/paper_files/arxiv/2209.12344/latex_extracted"?>
<?latexml class="article"?>
<!--  %if you need to pass options to natbib, use, e.g.: --><!--  %\PassOptionsToPackage{numbers, compress}{natbib} --><!--  %before loading svrhm_2022 --><!--  %ready for submission --><!--  %\usepackage{svrhm_2022} --><!--  %to compile a preprint version, e.g., for submission to arXiv, add the --><!--  %[preprint] option: --><!--  %\usepackage[preprint]{svrhm_2022} --><!--  %to compile a camera-ready version, add the [final] option, e.g.: --><?latexml package="svrhm_2022" options="preprint"?>
<!--  %other imports --><?latexml package="inputenc" options="utf8"?>
<?latexml package="fontenc" options="T1"?>
<?latexml package="hyperref"?>
<?latexml package="url"?>
<?latexml package="booktabs"?>
<?latexml package="amsfonts"?>
<?latexml package="nicefrac"?>
<?latexml package="microtype"?>
<?latexml package="xcolor"?>
<!--  %%%%%␣NEW␣MATH␣DEFINITIONS␣%%%%% --><?latexml package="amsmath,amsfonts,bm"?>
<!--  %Mark␣sections␣of␣captions␣for␣referring␣to␣divisions␣of␣figures --><!--  %Highlight␣a␣newly␣defined␣term --><!--  %Figure␣reference,␣lower-case. --><!--  %Figure␣reference,␣capital.␣For␣start␣of␣sentence --><!--  %****␣math_commands.tex␣Line␣25␣**** --><!--  %Section␣reference,␣lower-case. --><!--  %Section␣reference,␣capital. --><!--  %Reference␣to␣two␣sections. --><!--  %Reference␣to␣three␣sections. --><!--  %Reference␣to␣an␣equation,␣lower-case. --><!--  %Reference␣to␣an␣equation,␣upper␣case --><!--  %A␣raw␣reference␣to␣an␣equation__avoid␣using␣if␣possible --><!--  %Reference␣to␣a␣chapter,␣lower-case. --><!--  %Reference␣to␣an␣equation,␣upper␣case. --><!--  %Reference␣to␣a␣range␣of␣chapters --><!--  %Reference␣to␣an␣algorithm,␣lower-case. --><!--  %Reference␣to␣an␣algorithm,␣upper␣case. --><!--  %****␣math_commands.tex␣Line␣50␣**** --><!--  %Reference␣to␣a␣part,␣lower␣case --><!--  %Reference␣to␣a␣part,␣upper␣case --><!--  %Random␣variables --><!--  %****␣math_commands.tex␣Line␣75␣**** --><!--  %rm␣is␣already␣a␣command,␣just␣don’t␣name␣any␣random␣variables␣m --><!--  %Random␣vectors --><!--  %****␣math_commands.tex␣Line␣100␣**** --><!--  %****␣math_commands.tex␣Line␣125␣**** --><!--  %Elements␣of␣random␣vectors --><!--  %****␣math_commands.tex␣Line␣150␣**** --><!--  %Random␣matrices --><!--  %****␣math_commands.tex␣Line␣175␣**** --><!--  %Elements␣of␣random␣matrices --><!--  %****␣math_commands.tex␣Line␣200␣**** --><!--  %Vectors --><!--  %****␣math_commands.tex␣Line␣225␣**** --><!--  %Elements␣of␣vectors --><!--  %****␣math_commands.tex␣Line␣250␣**** --><!--  %****␣math_commands.tex␣Line␣275␣**** --><!--  %Matrix --><!--  %****␣math_commands.tex␣Line␣300␣**** --><!--  %Tensor --><!--  %****␣math_commands.tex␣Line␣325␣**** --><!--  %Graph --><!--  %****␣math_commands.tex␣Line␣350␣**** --><!--  %Sets --><!--  %****␣math_commands.tex␣Line␣375␣**** --><!--  %Don’t␣use␣a␣set␣called␣E,␣because␣this␣would␣be␣the␣same␣as␣our␣symbol --><!--  %for␣expectation. 
--><!--  %****␣math_commands.tex␣Line␣400␣**** --><!--  %Entries␣of␣a␣matrix --><!--  %****␣math_commands.tex␣Line␣425␣**** --><!--  %entries␣of␣a␣tensor --><!--  %Same␣font␣as␣tensor,␣without␣\bm␣wrapper --><!--  %****␣math_commands.tex␣Line␣450␣**** --><!--  %The␣true␣underlying␣data␣generating␣distribution --><!--  %The␣empirical␣distribution␣defined␣by␣the␣training␣set --><!--  %The␣model␣distribution --><!--  %Stochastic␣autoencoder␣distributions --><!--  %****␣math_commands.tex␣Line␣475␣**** --><!--  %Laplace␣distribution --><!--  %Wolfram␣Mathworld␣says␣$L^2$␣is␣for␣function␣spaces␣and␣$\ell^2$␣is␣for␣vectors --><!--  %But␣then␣they␣seem␣to␣use␣$L^2$␣for␣vectors␣throughout␣the␣site,␣and␣so␣does --><!--  %wikipedia. --><!--  %****␣math_commands.tex␣Line␣500␣**** --><!--  %See␣usage␣in␣notation.tex.␣Chosen␣to␣match␣Daphne’s␣book. --><!--  %for␣formulas --><?latexml package="graphicx"?>
<?latexml package="dirtytalk"?>
<!--  %\author{ --><!--  %Luca␣M.␣Schulze␣Buschoff␣\thanks{Corresponding␣author:␣luca.schulze-buschoff@tuebingen.mpg.de} --><!--  %\And --><!--  %Eric␣Schulz --><!--  %\And --><!--  %Marcel␣Binz --><!--  %\AND --><!--  %{\normalfont --><!--  %MPRG␣Computational␣Principles␣of␣Intelligence␣\\ --><!--  %Max␣Planck␣Institute␣for␣Biological␣Cybernetics␣\\ --><!--  %Tübingen,␣Germany} --><!--  %} --><!--  %****␣svrhm_2022.tex␣Line␣50␣**** --><?latexml RelaxNGSchema="LaTeXML"?>
<document xmlns="http://dlmf.nist.gov/LaTeXML" class="ltx_authors_1line">
  <resource src="LaTeXML.css" type="text/css"/>
  <resource src="ltx-article.css" type="text/css"/>
  <title>Stochastic Gradient Descent Captures <break/>How Children Learn About Physics</title>
  <creator role="author">
    <personname>
Luca M. Schulze Buschoff,
    Eric Schulz,
    Marcel Binz
<break/><break/>MPRG Computational Principles of Intelligence <break/>Max Planck Institute for Biological Cybernetics <break/>Tübingen, Germany</personname>
    <contact role="thanks">Corresponding author: luca.schulze-buschoff@tuebingen.mpg.de</contact>
  </creator>
  <abstract name="Abstract">
    <p>As children grow older, they develop an intuitive understanding of the physical processes around them. They move along developmental trajectories, which have been mapped out extensively in previous empirical research. We investigate how children’s developmental trajectories compare to the learning trajectories of artificial systems. Specifically, we examine the idea that cognitive development results from some form of stochastic optimization procedure. For this purpose, we train a modern generative neural network model using stochastic gradient descent. We then use methods from the developmental psychology literature to probe the physical understanding of this model at different degrees of optimization. We find that the model’s learning trajectory captures the developmental trajectories of children, thereby providing support to the idea of development as stochastic optimization.</p>
  </abstract>
  <section inlist="toc" xml:id="S1">
    <tags>
      <tag>1</tag>
      <tag role="autoref">section 1</tag>
      <tag role="refnum">1</tag>
      <tag role="typerefnum">§1</tag>
    </tags>
    <title><tag close=" ">1</tag>Introduction</title>
    <para xml:id="S1.p1">
      <p>More than 70 years ago, <cite class="ltx_citemacro_citet"><bibref bibrefs="turing1950computing" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
            <bibrefphrase>(</bibrefphrase>
            <bibrefphrase>)</bibrefphrase>
          </bibref></cite> famously suggested that “instead of trying to produce a programme to simulate the adult mind, why not rather try to produce one which simulates the child’s? If this were then subjected to an appropriate course of education one would obtain the adult brain.” If we want to take Turing’s proposal seriously, we have to ask ourselves: how do children learn?</p>
    </para>
    <para xml:id="S1.p2">
      <p>The physical laws of nature are among the earliest things that children learn <cite class="ltx_citemacro_citep">(<bibref bibrefs="spelke2007core,lake2017building" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. Therefore, they can serve as an ideal testbed for investigating this question. There has already been a substantial amount of work across different research areas to understand and reproduce the human ability for physical reasoning. On the one hand, empirical work in developmental psychology has provided us with a precise understanding of the different developmental stages that children undergo during their cognitive development <cite class="ltx_citemacro_citep">(<bibref bibrefs="baillargeon1996infants,baillargeon2004infants" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. On the other hand, machine learning researchers have started to successfully apply tools from deep learning to build models that mimic the intuitive physical understanding of people <cite class="ltx_citemacro_citep">(<bibref bibrefs="battaglia2013simulation,lerer2016learning,zhang2016comparative,piloto2022learn,smith2019modeling,smith2020fine" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>.</p>
    </para>
<!--  %****␣svrhm_2022.tex␣Line␣75␣**** -->    <para xml:id="S1.p3">
      <p>Even though there are a number of models for physical reasoning, they typically focus on reproducing adult-level performance. The goal of the present paper is to instead – in the spirit of Turing – compare the learning trajectories of artificial systems to the developmental trajectories of children. We are particularly interested in examining the idea of <emph font="italic">development as stochastic optimization</emph>, which states that cognitive development results from some form of stochastic optimization procedure <cite class="ltx_citemacro_citep">(<bibref bibrefs="gopnik2017changes,ullman2020bayesian,giron2022developmental" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>.</p>
    </para>
    <pagination role="newpage"/>
    <figure inlist="lof" labels="LABEL:fig:model_graph" placement="!h" xml:id="S1.F1">
      <tags>
        <tag>Figure 1</tag>
        <tag role="autoref">Figure 1</tag>
        <tag role="refnum">1</tag>
        <tag role="typerefnum">Figure 1</tag>
      </tags>
      <graphics candidates="figures/model_graph_2.pdf" class="ltx_centering" graphic="figures/model_graph_2.pdf" options="width=433.62pt" xml:id="S1.F1.g1"/>
      <toccaption class="ltx_centering"><tag close=" ">1</tag>A: Developmental trajectory for support events outlined by <cite class="ltx_citemacro_citep">(<bibref bibrefs="baillargeon1996infants" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. The illustrations are taken from <cite class="ltx_citemacro_citet"><bibref bibrefs="baillargeon1996infants" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
            <bibrefphrase>(</bibrefphrase>
            <bibrefphrase>)</bibrefphrase>
          </bibref></cite> and they show the physical rules acquired at the respective ages. B: Illustration of our generative video prediction model.</toccaption>
      <caption class="ltx_centering"><tag close=": ">Figure 1</tag>A: Developmental trajectory for support events outlined by <cite class="ltx_citemacro_citep">(<bibref bibrefs="baillargeon1996infants" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. The illustrations are taken from <cite class="ltx_citemacro_citet"><bibref bibrefs="baillargeon1996infants" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
            <bibrefphrase>(</bibrefphrase>
            <bibrefphrase>)</bibrefphrase>
          </bibref></cite> and they show the physical rules acquired at the respective ages. B: Illustration of our generative video prediction model.</caption>
    </figure>
    <para xml:id="S1.p4">
      <p>To test this hypothesis, we train a deep generative model on video sequences of a physical reasoning task. We then probe the knowledge of this model at different training epochs using violation-of-expectation methods <cite class="ltx_citemacro_citep">(<bibref bibrefs="baillargeon2004infants,piloto2018probing,smith2019modeling" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> and compare it to the knowledge of children at different ages. We find that the acquisition order of concepts in this model aligns with that of children, thereby providing support to the idea of development as stochastic optimization.</p>
    </para>
  </section>
  <section inlist="toc" xml:id="S2">
    <tags>
      <tag>2</tag>
      <tag role="autoref">section 2</tag>
      <tag role="refnum">2</tag>
      <tag role="typerefnum">§2</tag>
    </tags>
    <title><tag close=" ">2</tag>Methods</title>
    <subsection inlist="toc" labels="LABEL:sec:stimuli" xml:id="S2.SS1">
      <tags>
        <tag>2.1</tag>
        <tag role="autoref">subsection 2.1</tag>
        <tag role="refnum">2.1</tag>
        <tag role="typerefnum">§2.1</tag>
      </tags>
      <title><tag close=" ">2.1</tag>Support events</title>
<!--  %Describe␣developmental␣trajectories␣of␣children -->      <para xml:id="S2.SS1.p1">
        <p>Infants’ physical reasoning abilities have been investigated in many different domains. Here, we use support events (such as the configurations of block stacks shown in Figure <ref labelref="LABEL:fig:model_graph"/>A) as a representative physical reasoning task for comparing the learning trajectories of artificial systems to the developmental trajectories of children.</p>
      </para>
      <para xml:id="S2.SS1.p2">
        <p><cite class="ltx_citemacro_citet"><bibref bibrefs="baillargeon1996infants" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
              <bibrefphrase>(</bibrefphrase>
              <bibrefphrase>)</bibrefphrase>
            </bibref></cite> has shown that, as infants grow older, they make use of increasingly complex rules to decide whether a specific configuration of blocks is stable or not (see also <cite class="ltx_citemacro_citep"><bibref bibrefs="baillargeon2002acquisition,baillargeon2004infants" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref></cite>). At <Math mode="inline" tex="3" text="3" xml:id="S2.SS1.p2.m1">
            <XMath>
              <XMTok meaning="3" role="NUMBER">3</XMTok>
            </XMath>
          </Math> months, infants decide based on a simple contact/no-contact rule. According to this rule, a block configuration is considered to be stable if the two blocks touch each other. At around <Math mode="inline" tex="5" text="5" xml:id="S2.SS1.p2.m2">
            <XMath>
              <XMTok meaning="5" role="NUMBER">5</XMTok>
            </XMath>
          </Math> months, infants understand that the type of contact matters. Now, only configurations with blocks stacked on top of each other are judged as stable. At <Math mode="inline" tex="6.5" text="6.5" xml:id="S2.SS1.p2.m3">
            <XMath>
              <XMTok meaning="6.5" role="NUMBER">6.5</XMTok>
            </XMath>
          </Math> months, they additionally consider the overlap to determine the stability of a block configuration. Finally, at <Math mode="inline" tex="12.5" text="12.5" xml:id="S2.SS1.p2.m4">
            <XMath>
              <XMTok meaning="12.5" role="NUMBER">12.5</XMTok>
            </XMath>
          </Math> months they are able to incorporate the block shapes into their judgement, relying not only on the amount of contact but also on how the mass is distributed for each block.</p>
      </para>
<!--  %Describe␣our␣training␣data -->      <para xml:id="S2.SS1.p3">
        <p>To assess the learning trajectories of artificial systems, we generated a data set containing <Math mode="inline" tex="100{,}000" text="100000" xml:id="S2.SS1.p3.m1">
            <XMath>
              <XMTok meaning="100000" role="NUMBER">100,000</XMTok>
            </XMath>
          </Math> video sequences of support events using the Unity game engine <cite class="ltx_citemacro_citep">(<bibref bibrefs="unity" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>. The video sequences show stacks of two coloured blocks in a gray room and they consist of <Math mode="inline" tex="20" text="20" xml:id="S2.SS1.p3.m2">
            <XMath>
              <XMTok meaning="20" role="NUMBER">20</XMTok>
            </XMath>
          </Math> frames with a size of <Math mode="inline" tex="64\times 64" text="64 * 64" xml:id="S2.SS1.p3.m3">
            <XMath>
              <XMApp>
                <XMTok meaning="times" role="MULOP">×</XMTok>
                <XMTok meaning="64" role="NUMBER">64</XMTok>
                <XMTok meaning="64" role="NUMBER">64</XMTok>
              </XMApp>
            </XMath>
          </Math> pixels (see Figure <ref labelref="LABEL:fig:kl_over_frames"/>). We randomly varied a number of properties to ensure sufficient variability in the training data (see Appendix <ref labelref="LABEL:sec:stim_var"/> for further details).
<!--  %****␣svrhm_2022.tex␣Line␣100␣**** --></p>
      </para>
<!--  %Prove␣models␣work:␣reconstructions␣and␣KL␣over␣time -->      <figure inlist="lof" labels="LABEL:fig:kl_over_frames" placement="!h" xml:id="S2.F2">
        <tags>
          <tag>Figure 2</tag>
          <tag role="autoref">Figure 2</tag>
          <tag role="refnum">2</tag>
          <tag role="typerefnum">Figure 2</tag>
        </tags>
        <graphics candidates="figures/RSSMAlt_Cubes_seed_17_beta_1.0_state_20_hidden_200_nll_False_kloverframes_cond_2_rm_3_open.pdf" class="ltx_centering" graphic="figures/RSSMAlt_Cubes_seed_17_beta_1.0_state_20_hidden_200_nll_False_kloverframes_cond_2_rm_3_open.pdf" options="width=433.62pt" xml:id="S2.F2.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">2</tag>The first row shows the surprise for the expected and violated test sequences of the overlap rule. The second row shows the expected test sequence. The third row shows the violated test sequence. The last row shows the open-loop predictions from the model, given the first two frames of the violated test sequence. It is important to note that the first three frames were removed for this plot, as the uncertainty is very high when the model is first given the sequence.</toccaption>
        <caption class="ltx_centering"><tag close=": ">Figure 2</tag>The first row shows the surprise for the expected and violated test sequences of the overlap rule. The second row shows the expected test sequence. The third row shows the violated test sequence. The last row shows the open-loop predictions from the model, given the first two frames of the violated test sequence. It is important to note that the first three frames were removed for this plot, as the uncertainty is very high when the model is first given the sequence.</caption>
      </figure>
    </subsection>
    <subsection inlist="toc" labels="LABEL:eq:rssm_loss LABEL:sec:model" xml:id="S2.SS2">
      <tags>
        <tag>2.2</tag>
        <tag role="autoref">subsection 2.2</tag>
        <tag role="refnum">2.2</tag>
        <tag role="typerefnum">§2.2</tag>
      </tags>
      <title><tag close=" ">2.2</tag>Modeling developmental trajectories</title>
<!--  %How␣do␣we␣implement␣stochastic␣optimization? -->      <para xml:id="S2.SS2.p1">
        <p>We investigate the development-as-stochastic-optimization hypothesis by training a generative video prediction model using stochastic gradient descent. To obtain a learning trajectory of this model, we evaluate a snapshot of it at every epoch.</p>
      </para>
      <para xml:id="S2.SS2.p2">
        <p>We use the recurrent state space model (RSSM) <cite class="ltx_citemacro_citep">(<bibref bibrefs="hafner2019learning,saxena2021clockwork" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite> as a representative model for our analysis. The RSSM can be seen as a sequential version of a variational autoencoder (VAE). It maintains a latent state at each time step, which is composed of a deterministic component <Math mode="inline" tex="h_{t}" text="h _ t" xml:id="S2.SS2.p2.m1">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">h</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
              </XMApp>
            </XMath>
          </Math> and a stochastic component <Math mode="inline" tex="s_{t}" text="s _ t" xml:id="S2.SS2.p2.m2">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">s</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
              </XMApp>
            </XMath>
          </Math>. These components depend on the previous time steps through a function <Math mode="inline" tex="f(h_{t-1},s_{t-1})" text="f * open-interval@(h _ (t - 1), s _ (t - 1))" xml:id="S2.SS2.p2.m3">
            <XMath>
              <XMApp>
                <XMTok meaning="times" role="MULOP">⁢</XMTok>
                <XMTok font="italic" role="UNKNOWN">f</XMTok>
                <XMDual>
                  <XMApp>
                    <XMTok meaning="open-interval"/>
                    <XMRef idref="S2.SS2.p2.m3.1"/>
                    <XMRef idref="S2.SS2.p2.m3.2"/>
                  </XMApp>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="false">(</XMTok>
                    <XMApp xml:id="S2.SS2.p2.m3.1">
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" role="UNKNOWN">h</XMTok>
                      <XMApp>
                        <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                        <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                      </XMApp>
                    </XMApp>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMApp xml:id="S2.SS2.p2.m3.2">
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" role="UNKNOWN">s</XMTok>
                      <XMApp>
                        <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                        <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                      </XMApp>
                    </XMApp>
                    <XMTok role="CLOSE" stretchy="false">)</XMTok>
                  </XMWrap>
                </XMDual>
              </XMApp>
            </XMath>
          </Math>, which is implemented as a gated recurrent neural network. It is trained by optimizing the following evidence lower bound:</p>
        <equationgroup class="ltx_eqn_align" xml:id="A1.EGx1">
          <equation xml:id="S2.E1">
            <tags>
              <tag>(1)</tag>
              <tag role="autoref">Equation 1</tag>
              <tag role="refnum">1</tag>
            </tags>
            <MathFork>
              <Math tex="\displaystyle-\sum_{t=1}^{T}\mathbb{E}_{q(s_{t}\mid o_{\leq t})}[\textrm{ln }p%&#10;(o_{t}\mid s_{t})]+\mathbb{E}_{q(s_{t-1}\mid o_{\leq t-1})}\big{[}\textrm{KL}(%&#10;q(s_{t}\mid o_{\leq t})\mid\mid p(s_{t}\mid s_{t-1}))\big{]}" xml:id="S2.E1.m2">
                <XMath>
                  <XMTok meaning="minus" role="ADDOP">-</XMTok>
                  <XMApp scriptpos="mid">
                    <XMTok role="SUPERSCRIPTOP" scriptpos="mid1"/>
                    <XMApp scriptpos="mid">
                      <XMTok role="SUBSCRIPTOP" scriptpos="mid1"/>
                      <XMTok mathstyle="display" meaning="sum" role="SUMOP" scriptpos="mid">∑</XMTok>
                      <XMApp>
                        <XMTok fontsize="70%" meaning="equals" role="RELOP">=</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                        <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                      </XMApp>
                    </XMApp>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">T</XMTok>
                  </XMApp>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="blackboard" role="UNKNOWN">E</XMTok>
                    <XMApp>
                      <XMTok meaning="times" role="MULOP">⁢</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">q</XMTok>
                      <XMDual>
                        <XMRef idref="S2.E1.m2.1"/>
                        <XMWrap>
                          <XMTok fontsize="70%" role="OPEN" stretchy="false">(</XMTok>
                          <XMApp xml:id="S2.E1.m2.1">
                            <XMTok fontsize="70%" meaning="conditional" name="mid" role="MODIFIEROP">∣</XMTok>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">s</XMTok>
                              <XMTok font="italic" fontsize="50%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">o</XMTok>
                              <XMApp>
                                <XMTok fontsize="50%" meaning="less-than-or-equals" name="leq" role="RELOP">≤</XMTok>
                                <XMTok meaning="absent"/>
                                <XMTok font="italic" fontsize="50%" role="UNKNOWN">t</XMTok>
                              </XMApp>
                            </XMApp>
                          </XMApp>
                          <XMTok fontsize="70%" role="CLOSE" stretchy="false">)</XMTok>
                        </XMWrap>
                      </XMDual>
                    </XMApp>
                  </XMApp>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="false">[</XMTok>
                    <XMText>ln </XMText>
                    <XMTok font="italic" role="UNKNOWN">p</XMTok>
                    <XMWrap>
                      <XMTok role="OPEN" stretchy="false">(</XMTok>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="italic" role="UNKNOWN">o</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                      </XMApp>
                      <XMTok name="mid" role="VERTBAR">∣</XMTok>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="italic" role="UNKNOWN">s</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                      </XMApp>
                      <XMTok role="CLOSE" stretchy="false">)</XMTok>
                    </XMWrap>
                    <XMTok role="CLOSE" stretchy="false">]</XMTok>
                  </XMWrap>
                  <XMTok meaning="plus" role="ADDOP">+</XMTok>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="blackboard" role="UNKNOWN">E</XMTok>
                    <XMApp>
                      <XMTok meaning="times" role="MULOP">⁢</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">q</XMTok>
                      <XMDual>
                        <XMRef idref="S2.E1.m2.2"/>
                        <XMWrap>
                          <XMTok fontsize="70%" role="OPEN" stretchy="false">(</XMTok>
                          <XMApp xml:id="S2.E1.m2.2">
                            <XMTok fontsize="70%" meaning="conditional" name="mid" role="MODIFIEROP">∣</XMTok>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">s</XMTok>
                              <XMApp>
                                <XMTok fontsize="50%" meaning="minus" role="ADDOP">-</XMTok>
                                <XMTok font="italic" fontsize="50%" role="UNKNOWN">t</XMTok>
                                <XMTok fontsize="50%" meaning="1" role="NUMBER">1</XMTok>
                              </XMApp>
                            </XMApp>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">o</XMTok>
                              <XMApp>
                                <XMTok fontsize="50%" meaning="less-than-or-equals" name="leq" role="RELOP">≤</XMTok>
                                <XMTok meaning="absent"/>
                                <XMApp>
                                  <XMTok fontsize="50%" meaning="minus" role="ADDOP">-</XMTok>
                                  <XMTok font="italic" fontsize="50%" role="UNKNOWN">t</XMTok>
                                  <XMTok fontsize="50%" meaning="1" role="NUMBER">1</XMTok>
                                </XMApp>
                              </XMApp>
                            </XMApp>
                          </XMApp>
                          <XMTok fontsize="70%" role="CLOSE" stretchy="false">)</XMTok>
                        </XMWrap>
                      </XMDual>
                    </XMApp>
                  </XMApp>
                  <XMWrap>
                    <XMTok fontsize="120%" role="OPEN" stretchy="false">[</XMTok>
                    <XMText>KL</XMText>
                    <XMWrap>
                      <XMTok role="OPEN" stretchy="false">(</XMTok>
                      <XMTok font="italic" role="UNKNOWN">q</XMTok>
                      <XMWrap>
                        <XMTok role="OPEN" stretchy="false">(</XMTok>
                        <XMApp>
                          <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                          <XMTok font="italic" role="UNKNOWN">s</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                        </XMApp>
                        <XMTok name="mid" role="VERTBAR">∣</XMTok>
                        <XMApp>
                          <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                          <XMTok font="italic" role="UNKNOWN">o</XMTok>
                          <XMApp>
                            <XMTok fontsize="70%" meaning="less-than-or-equals" name="leq" role="RELOP">≤</XMTok>
                            <XMTok meaning="absent"/>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                          </XMApp>
                        </XMApp>
                        <XMTok role="CLOSE" stretchy="false">)</XMTok>
                      </XMWrap>
                      <XMTok name="mid" role="VERTBAR">∣</XMTok>
                      <XMTok name="mid" role="VERTBAR">∣</XMTok>
                      <XMTok font="italic" role="UNKNOWN">p</XMTok>
                      <XMWrap>
                        <XMTok role="OPEN" stretchy="false">(</XMTok>
                        <XMApp>
                          <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                          <XMTok font="italic" role="UNKNOWN">s</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                        </XMApp>
                        <XMTok name="mid" role="VERTBAR">∣</XMTok>
                        <XMApp>
                          <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                          <XMTok font="italic" role="UNKNOWN">s</XMTok>
                          <XMApp>
                            <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                          </XMApp>
                        </XMApp>
                        <XMTok role="CLOSE" stretchy="false">)</XMTok>
                      </XMWrap>
                      <XMTok role="CLOSE" stretchy="false">)</XMTok>
                    </XMWrap>
                    <XMTok fontsize="120%" role="CLOSE" stretchy="false">]</XMTok>
                  </XMWrap>
                </XMath>
              </Math>
              <MathBranch>
                <td align="right"><Math mode="inline" tex="\displaystyle-\sum_{t=1}^{T}\mathbb{E}_{q(s_{t}\mid o_{\leq t})}[\textrm{ln }p%&#10;(o_{t}\mid s_{t})]+\mathbb{E}_{q(s_{t-1}\mid o_{\leq t-1})}\big{[}\textrm{KL}(%&#10;q(s_{t}\mid o_{\leq t})\mid\mid p(s_{t}\mid s_{t-1}))\big{]}" xml:id="S2.E1.m1">
                    <XMath>
                      <XMTok meaning="minus" role="ADDOP">-</XMTok>
                      <XMApp scriptpos="mid">
                        <XMTok role="SUPERSCRIPTOP" scriptpos="mid1"/>
                        <XMApp scriptpos="mid">
                          <XMTok role="SUBSCRIPTOP" scriptpos="mid1"/>
                          <XMTok mathstyle="display" meaning="sum" role="SUMOP" scriptpos="mid">∑</XMTok>
                          <XMApp>
                            <XMTok fontsize="70%" meaning="equals" role="RELOP">=</XMTok>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                          </XMApp>
                        </XMApp>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">T</XMTok>
                      </XMApp>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="blackboard" role="UNKNOWN">E</XMTok>
                        <XMApp>
                          <XMTok meaning="times" role="MULOP">⁢</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">q</XMTok>
                          <XMDual>
                            <XMRef idref="S2.E1.m1.1"/>
                            <XMWrap>
                              <XMTok fontsize="70%" role="OPEN" stretchy="false">(</XMTok>
                              <XMApp xml:id="S2.E1.m1.1">
                                <XMTok fontsize="70%" meaning="conditional" name="mid" role="MODIFIEROP">∣</XMTok>
                                <XMApp>
                                  <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">s</XMTok>
                                  <XMTok font="italic" fontsize="50%" role="UNKNOWN">t</XMTok>
                                </XMApp>
                                <XMApp>
                                  <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">o</XMTok>
                                  <XMApp>
                                    <XMTok fontsize="50%" meaning="less-than-or-equals" name="leq" role="RELOP">≤</XMTok>
                                    <XMTok meaning="absent"/>
                                    <XMTok font="italic" fontsize="50%" role="UNKNOWN">t</XMTok>
                                  </XMApp>
                                </XMApp>
                              </XMApp>
                              <XMTok fontsize="70%" role="CLOSE" stretchy="false">)</XMTok>
                            </XMWrap>
                          </XMDual>
                        </XMApp>
                      </XMApp>
                      <XMWrap>
                        <XMTok role="OPEN" stretchy="false">[</XMTok>
                        <XMText>ln </XMText>
                        <XMTok font="italic" role="UNKNOWN">p</XMTok>
                        <XMWrap>
                          <XMTok role="OPEN" stretchy="false">(</XMTok>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                            <XMTok font="italic" role="UNKNOWN">o</XMTok>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                          </XMApp>
                          <XMTok name="mid" role="VERTBAR">∣</XMTok>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                            <XMTok font="italic" role="UNKNOWN">s</XMTok>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                          </XMApp>
                          <XMTok role="CLOSE" stretchy="false">)</XMTok>
                        </XMWrap>
                        <XMTok role="CLOSE" stretchy="false">]</XMTok>
                      </XMWrap>
                      <XMTok meaning="plus" role="ADDOP">+</XMTok>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="blackboard" role="UNKNOWN">E</XMTok>
                        <XMApp>
                          <XMTok meaning="times" role="MULOP">⁢</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">q</XMTok>
                          <XMDual>
                            <XMRef idref="S2.E1.m1.2"/>
                            <XMWrap>
                              <XMTok fontsize="70%" role="OPEN" stretchy="false">(</XMTok>
                              <XMApp xml:id="S2.E1.m1.2">
                                <XMTok fontsize="70%" meaning="conditional" name="mid" role="MODIFIEROP">∣</XMTok>
                                <XMApp>
                                  <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">s</XMTok>
                                  <XMApp>
                                    <XMTok fontsize="50%" meaning="minus" role="ADDOP">-</XMTok>
                                    <XMTok font="italic" fontsize="50%" role="UNKNOWN">t</XMTok>
                                    <XMTok fontsize="50%" meaning="1" role="NUMBER">1</XMTok>
                                  </XMApp>
                                </XMApp>
                                <XMApp>
                                  <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">o</XMTok>
                                  <XMApp>
                                    <XMTok fontsize="50%" meaning="less-than-or-equals" name="leq" role="RELOP">≤</XMTok>
                                    <XMTok meaning="absent"/>
                                    <XMApp>
                                      <XMTok fontsize="50%" meaning="minus" role="ADDOP">-</XMTok>
                                      <XMTok font="italic" fontsize="50%" role="UNKNOWN">t</XMTok>
                                      <XMTok fontsize="50%" meaning="1" role="NUMBER">1</XMTok>
                                    </XMApp>
                                  </XMApp>
                                </XMApp>
                              </XMApp>
                              <XMTok fontsize="70%" role="CLOSE" stretchy="false">)</XMTok>
                            </XMWrap>
                          </XMDual>
                        </XMApp>
                      </XMApp>
                      <XMWrap>
                        <XMTok fontsize="120%" role="OPEN" stretchy="false">[</XMTok>
                        <XMText>KL</XMText>
                        <XMWrap>
                          <XMTok role="OPEN" stretchy="false">(</XMTok>
                          <XMTok font="italic" role="UNKNOWN">q</XMTok>
                          <XMWrap>
                            <XMTok role="OPEN" stretchy="false">(</XMTok>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                              <XMTok font="italic" role="UNKNOWN">s</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMTok name="mid" role="VERTBAR">∣</XMTok>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                              <XMTok font="italic" role="UNKNOWN">o</XMTok>
                              <XMApp>
                                <XMTok fontsize="70%" meaning="less-than-or-equals" name="leq" role="RELOP">≤</XMTok>
                                <XMTok meaning="absent"/>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                              </XMApp>
                            </XMApp>
                            <XMTok role="CLOSE" stretchy="false">)</XMTok>
                          </XMWrap>
                          <XMTok name="mid" role="VERTBAR">∣</XMTok>
                          <XMTok name="mid" role="VERTBAR">∣</XMTok>
                          <XMTok font="italic" role="UNKNOWN">p</XMTok>
                          <XMWrap>
                            <XMTok role="OPEN" stretchy="false">(</XMTok>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                              <XMTok font="italic" role="UNKNOWN">s</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMTok name="mid" role="VERTBAR">∣</XMTok>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                              <XMTok font="italic" role="UNKNOWN">s</XMTok>
                              <XMApp>
                                <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                              </XMApp>
                            </XMApp>
                            <XMTok role="CLOSE" stretchy="false">)</XMTok>
                          </XMWrap>
                          <XMTok role="CLOSE" stretchy="false">)</XMTok>
                        </XMWrap>
                        <XMTok fontsize="120%" role="CLOSE" stretchy="false">]</XMTok>
                      </XMWrap>
                    </XMath>
                  </Math></td>
              </MathBranch>
            </MathFork>
          </equation>
        </equationgroup>
      </para>
      <para xml:id="S2.SS2.p3">
        <p>After training, the RSSM can be used to generate open-loop predictions. For this, the model processes a number of initial observations to infer an approximate posterior <Math mode="inline" tex="q(s_{t-1}\mid o_{\leq t-1})" text="q * conditional@(s _ (t - 1), o _ (absent less= t - 1))" xml:id="S2.SS2.p3.m1">
            <XMath>
              <XMApp>
                <XMTok meaning="times" role="MULOP">⁢</XMTok>
                <XMTok font="italic" role="UNKNOWN">q</XMTok>
                <XMDual>
                  <XMRef idref="S2.SS2.p3.m1.1"/>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="false">(</XMTok>
                    <XMApp xml:id="S2.SS2.p3.m1.1">
                      <XMTok meaning="conditional" name="mid" role="MODIFIEROP">∣</XMTok>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="italic" role="UNKNOWN">s</XMTok>
                        <XMApp>
                          <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                          <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                        </XMApp>
                      </XMApp>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="italic" role="UNKNOWN">o</XMTok>
                        <XMApp>
                          <XMTok fontsize="70%" meaning="less-than-or-equals" name="leq" role="RELOP">≤</XMTok>
                          <XMTok meaning="absent"/>
                          <XMApp>
                            <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                          </XMApp>
                        </XMApp>
                      </XMApp>
                    </XMApp>
                    <XMTok role="CLOSE" stretchy="false">)</XMTok>
                  </XMWrap>
                </XMDual>
              </XMApp>
            </XMath>
          </Math>. The subsequent open-loop predictions are then obtained by decoding latent representations sampled from the prior <Math mode="inline" tex="p(s_{t}\mid s_{t-1})" text="p * conditional@(s _ t, s _ (t - 1))" xml:id="S2.SS2.p3.m2">
            <XMath>
              <XMApp>
                <XMTok meaning="times" role="MULOP">⁢</XMTok>
                <XMTok font="italic" role="UNKNOWN">p</XMTok>
                <XMDual>
                  <XMRef idref="S2.SS2.p3.m2.1"/>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="false">(</XMTok>
                    <XMApp xml:id="S2.SS2.p3.m2.1">
                      <XMTok meaning="conditional" name="mid" role="MODIFIEROP">∣</XMTok>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="italic" role="UNKNOWN">s</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                      </XMApp>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="italic" role="UNKNOWN">s</XMTok>
                        <XMApp>
                          <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                          <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                        </XMApp>
                      </XMApp>
                    </XMApp>
                    <XMTok role="CLOSE" stretchy="false">)</XMTok>
                  </XMWrap>
                </XMDual>
              </XMApp>
            </XMath>
          </Math>. We refer the reader to Appendix <ref labelref="LABEL:sec:model_imp"/> for further details about the model architecture and training procedure.</p>
      </para>
    </subsection>
    <subsection inlist="toc" labels="LABEL:sec:measuring_surprise" xml:id="S2.SS3">
      <tags>
        <tag>2.3</tag>
        <tag role="autoref">subsection 2.3</tag>
        <tag role="refnum">2.3</tag>
        <tag role="typerefnum">§2.3</tag>
      </tags>
      <title><tag close=" ">2.3</tag>Measuring surprise</title>
<!--  %****␣svrhm_2022.tex␣Line␣125␣**** 
     %How␣do␣we␣assess␣the␣development?-->      <para xml:id="S2.SS3.p1">
        <p>In order to assess whether our model has understood a specific rule, we use the violation of expectation paradigm <cite class="ltx_citemacro_citep">(<bibref bibrefs="piloto2018probing,piloto2022learn" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>. The model is presented with two video sequences: a violated sequence, which violates the rule, and an expected sequence, which is consistent with the rule. If the model has successfully learned a specific rule, it should show a larger degree of surprise for the violated than for the expected sequence. Following <cite class="ltx_citemacro_citet"><bibref bibrefs="piloto2018probing" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
              <bibrefphrase>(</bibrefphrase>
              <bibrefphrase>)</bibrefphrase>
            </bibref></cite>, we measure the model’s surprise using the Kullback–Leibler (KL) divergence between the prior and posterior over the latent representation <cite class="ltx_citemacro_citep">(<bibref bibrefs="baldi2010bits" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>. This approach closely resembles how developmental psychologists assess children’s understanding of physical rules.</p>
      </para>
      <para xml:id="S2.SS3.p2">
        <p>We constructed a violated and an expected test sequence with identical image statistics for each of the four rules for support events outlined by <cite class="ltx_citemacro_citet"><bibref bibrefs="baillargeon1996infants" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
              <bibrefphrase>(</bibrefphrase>
              <bibrefphrase>)</bibrefphrase>
            </bibref></cite>. For example, according to the overlap rule, a block configuration should only be stable if the blocks are stacked on top of each other with enough overlap. The two test sequences for this rule therefore show two blocks that only slightly overlap (see Figure <ref labelref="LABEL:fig:kl_over_frames"/>). In the violated test sequence, the block configuration nonetheless appears stable, with the top block remaining on top of the bottom block. In contrast and consistent with the rule, the expected test sequence shows the top block falling.</p>
      </para>
<!--  %Show␣results␣that␣relate␣to␣hypothesis:␣delta␣KL␣over␣epochs -->      <figure inlist="lof" labels="LABEL:fig:kl_over_epochs" placement="!t" xml:id="S2.F3">
        <tags>
          <tag>Figure 3</tag>
          <tag role="autoref">Figure 3</tag>
          <tag role="refnum">3</tag>
          <tag role="typerefnum">Figure 3</tag>
        </tags>
        <graphics candidates="figures/RSSMAlt_Cubes_seed_17_beta_1.0_state_20_hidden_200_nll_False_stochastic_optimisation-2.pdf" class="ltx_centering" graphic="figures/RSSMAlt_Cubes_seed_17_beta_1.0_state_20_hidden_200_nll_False_stochastic_optimisation-2.pdf" options="width=433.62pt" xml:id="S2.F3.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">3</tag>Linear regression fits for the difference between the surprise for the expected and the violated sequence at every epoch and for each of the four physical principles separately, as well as a 95% confidence interval around the regression line. Additionally, the individual data points are binned in bins of size 10 and displayed with a 95% confidence interval.</toccaption>
        <caption class="ltx_centering"><tag close=": ">Figure 3</tag>Linear regression fits for the difference between the surprise for the expected and the violated sequence at every epoch and for each of the four physical principles separately, as well as a 95% confidence interval around the regression line. Additionally, the individual data points are binned in bins of size 10 and displayed with a 95% confidence interval.</caption>
      </figure>
    </subsection>
  </section>
  <section inlist="toc" labels="LABEL:sec:results" xml:id="S3">
    <tags>
      <tag>3</tag>
      <tag role="autoref">section 3</tag>
      <tag role="refnum">3</tag>
      <tag role="typerefnum">§3</tag>
    </tags>
    <title><tag close=" ">3</tag>Results</title>
<!--  %Prove␣that␣reconstructions␣are␣sensible -->    <para xml:id="S3.p1">
      <p>Before investigating the learning trajectory of our model, we verified that it is able to predict the given scenes accurately into the future. For this purpose, we plotted the open-loop predictions of the fully trained model for all test sequences. Figure <ref labelref="LABEL:fig:kl_over_frames"/> illustrates the result for the overlap test sequences. We see that the predictions of the fully trained model closely match the expected sequence, indicating that the model has learned the overlap rule by the end of its training. This result also holds for the three other physical rules; see Appendix <ref labelref="LABEL:sec:figures"/> for the corresponding plots.</p>
    </para>
<!--  %Prove␣that␣model␣is␣surprised␣when␣it␣sees␣violations -->    <para xml:id="S3.p2">
      <p>Next, we validated that the fully trained model is surprised when it observes a violation of a physical rule it has acquired. Figure <ref labelref="LABEL:fig:kl_over_frames"/> shows the KL divergence over the frames for the overlap test sequences. We see that the KL divergence for the violated sequence exceeds that for the expected sequence from around the fifth frame, which is when the two sequences diverge.</p>
    </para>
<!--  %Relate␣results␣to␣developmental␣trajectories -->    <para xml:id="S3.p3">
      <p>To quantitatively assess whether the model has learned the four physical rules, we computed the difference in KL divergence between the violated and expected sequences (violated minus expected), summed over all frames. Figure <ref labelref="LABEL:fig:kl_over_epochs"/> shows this measure for each of the four physical rules over the course of training. We find that the difference in KL divergence is positive for all four physical rules in the fully trained model, confirming that it has learned all of them by the end of training.
<!--  %****␣svrhm_2022.tex␣Line␣150␣**** --></p>
    </para>
    <para xml:id="S3.p4">
      <p>Finally, we investigated whether the stochastic optimization hypothesis can capture the developmental trajectories of children. For this purpose, we fitted a simple linear regression for each of the rules, using the difference in KL divergence as the dependent and training epochs as the independent variable. This directly relates to the stochastic optimization hypothesis, as the model becomes increasingly optimized over the epochs. The resulting regression coefficients were <Math mode="inline" tex="0.390\pm 0.024" text="0.390 plus-or-minus 0.024" xml:id="S3.p4.m1">
          <XMath>
            <XMApp>
              <XMTok meaning="plus-or-minus" name="pm" role="ADDOP">±</XMTok>
              <XMTok meaning="0.390" role="NUMBER">0.390</XMTok>
              <XMTok meaning="0.024" role="NUMBER">0.024</XMTok>
            </XMApp>
          </XMath>
        </Math>, <Math mode="inline" tex="0.261\pm 0.017" text="0.261 plus-or-minus 0.017" xml:id="S3.p4.m2">
          <XMath>
            <XMApp>
              <XMTok meaning="plus-or-minus" name="pm" role="ADDOP">±</XMTok>
              <XMTok meaning="0.261" role="NUMBER">0.261</XMTok>
              <XMTok meaning="0.017" role="NUMBER">0.017</XMTok>
            </XMApp>
          </XMath>
        </Math>, <Math mode="inline" tex="0.170\pm 0.011" text="0.170 plus-or-minus 0.011" xml:id="S3.p4.m3">
          <XMath>
            <XMApp>
              <XMTok meaning="plus-or-minus" name="pm" role="ADDOP">±</XMTok>
              <XMTok meaning="0.170" role="NUMBER">0.170</XMTok>
              <XMTok meaning="0.011" role="NUMBER">0.011</XMTok>
            </XMApp>
          </XMath>
        </Math>, and <Math mode="inline" tex="0.059\pm 0.007" text="0.059 plus-or-minus 0.007" xml:id="S3.p4.m4">
          <XMath>
            <XMApp>
              <XMTok meaning="plus-or-minus" name="pm" role="ADDOP">±</XMTok>
              <XMTok meaning="0.059" role="NUMBER">0.059</XMTok>
              <XMTok meaning="0.007" role="NUMBER">0.007</XMTok>
            </XMApp>
          </XMath>
        </Math> for the contact or no contact, type of contact, overlap, and shape rules, respectively (all <Math mode="inline" tex="p&lt;0.001" text="p less 0.001" xml:id="S3.p4.m5">
          <XMath>
            <XMApp>
              <XMTok meaning="less-than" role="RELOP">&lt;</XMTok>
              <XMTok font="italic" role="UNKNOWN">p</XMTok>
              <XMTok meaning="0.001" role="NUMBER">0.001</XMTok>
            </XMApp>
          </XMath>
        </Math>). We can see that the coefficients become smaller as the rules increase in complexity. The model first acquires the contact or no contact, followed by the type of contact, then the overlap, and finally the shape rule. This order of acquisition matches how children acquire these rules, thereby providing support to the idea of development as stochastic optimization.</p>
    </para>
  </section>
  <section inlist="toc" labels="LABEL:sec:discussion" xml:id="S4">
    <tags>
      <tag>4</tag>
      <tag role="autoref">section 4</tag>
      <tag role="refnum">4</tag>
      <tag role="typerefnum">§4</tag>
    </tags>
    <title><tag close=" ">4</tag>Discussion</title>
<!--  %Summary -->    <para xml:id="S4.p1">
      <p>We have compared the learning trajectories of an artificial system to the developmental trajectories of children in the domain of physical reasoning. More specifically, we examined the idea of development as stochastic optimization. For this purpose, we used the violation of expectation paradigm to probe the knowledge of a modern deep generative neural network at different stages of its training process. We found that the model’s learning trajectory resembled the trajectories of children during their cognitive development.</p>
    </para>
<!--  %strength -->    <para xml:id="S4.p2">
      <p>In contrast to previous work in this domain <cite class="ltx_citemacro_citep">(<bibref bibrefs="binz2019emulating,giron2022developmental" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, our modeling approach employs high-dimensional visual stimuli (i.e., video sequences) and solely relies on an unsupervised training objective. It, therefore, more closely mirrors the actual learning processes of children in the real world. Furthermore, the conducted experiments build on a well-established paradigm from developmental psychology. Together, these factors lend high validity to our results.</p>
    </para>
<!--  %development␣through␣data␣aspects -->    <para xml:id="S4.p3">
      <p>Presently, the biggest mismatch between learning in our models and learning in children is that the latter do not observe a large number of support events. Instead, they simply witness the real world and generalize their acquired knowledge to the given experimental setting. To capture this process, we should ideally train our models in a similar way. This could, for example, be accomplished by utilizing the SAYCam data set, which contains a large number of longitudinal video recordings from infants’ perspectives <cite class="ltx_citemacro_citep">(<bibref bibrefs="sullivan2021saycam" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. Additionally, this data set includes time stamps indicating when a child has encountered a particular scene, which could be used to investigate how the nature of the training data influences development.</p>
    </para>
<!--  %development␣as␣complexity␣increase -->    <para xml:id="S4.p4">
      <p>It also seems plausible that factors beyond stochastic optimization drive human development. There is, for example, evidence suggesting that children gain access to additional computational resources during their development, allowing them to apply more complex strategies <cite class="ltx_citemacro_citep">(<bibref bibrefs="binz2022exploration" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. This idea of <emph font="italic">development as complexity increase</emph> could be readily incorporated in our framework by replacing the evidence lower bound from Equation <ref labelref="LABEL:eq:rssm_loss"/> with a <Math mode="inline" tex="\beta" text="beta" xml:id="S4.p4.m1">
          <XMath>
            <XMTok font="italic" name="beta" role="UNKNOWN">β</XMTok>
          </XMath>
        </Math>-VAE objective <cite class="ltx_citemacro_citep">(<bibref bibrefs="higgins2016beta,burgess2018understanding" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>.</p>
    </para>
<!--  %conclusion -->    <para xml:id="S4.p5">
      <p>To fully disentangle these hypotheses, a single paradigm will likely not be sufficient. Instead, we will need to go beyond support events and investigate developmental trajectories across a variety of different tasks. Findings that hold across domains would further increase the reliability of our results and have potential implications for cognitive psychology and artificial intelligence alike.</p>
    </para>
    <para xml:id="S4.p6">
      <p><text font="bold">Acknowledgments.</text> This work was funded by the Max Planck Society and the Volkswagen Foundation.</p>
    </para>
<!--  %****␣svrhm_2022.tex␣Line␣175␣**** 
     %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%-->    <pagination role="newpage"/>
  </section>
  <bibliography citestyle="authoryear" files="svrhm_2022" xml:id="bib">
    <title>References</title>
  </bibliography>
<!--  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -->  <pagination role="newpage"/>
  <section xml:id="Sx1">
    <title>Checklist</title>
<!--  %\answerYes{See␣Section~\ref{gen_inst}.} 
     %\answerNo{The␣code␣and␣the␣data␣are␣proprietary.}
     %\answerNA{}-->    <para xml:id="Sx1.p1">
      <enumerate xml:id="Sx1.I1">
        <item xml:id="Sx1.I1.i1">
          <tags>
            <tag>1.</tag>
            <tag role="autoref">item 1</tag>
            <tag role="refnum">1</tag>
            <tag role="typerefnum">item 1</tag>
          </tags>
          <para xml:id="Sx1.I1.i1.p1">
            <p>For all authors…</p>
            <enumerate xml:id="Sx1.I1.I1">
              <item xml:id="Sx1.I1.i1.i1">
                <tags>
                  <tag>(a)</tag>
                  <tag role="autoref">item a</tag>
                  <tag role="refnum">1a</tag>
                  <tag role="typerefnum">item 1a</tag>
                </tags>
                <para xml:id="Sx1.I1.i1.i1.p1">
                  <p>Do the main claims made in the abstract and introduction accurately reflect the paper’s contributions and scope?
[Yes] Our main claims accurately reflect the paper’s contributions.</p>
                </para>
              </item>
              <item xml:id="Sx1.I1.i1.i2">
                <tags>
                  <tag>(b)</tag>
                  <tag role="autoref">item b</tag>
                  <tag role="refnum">1b</tag>
                  <tag role="typerefnum">item 1b</tag>
                </tags>
                <para xml:id="Sx1.I1.i1.i2.p1">
                  <p>Did you describe the limitations of your work?
[Yes] We have described the limitations of our work; see Section <ref labelref="LABEL:sec:discussion"/>.</p>
                </para>
              </item>
              <item xml:id="Sx1.I1.i1.i3">
                <tags>
                  <tag>(c)</tag>
                  <tag role="autoref">item c</tag>
                  <tag role="refnum">1c</tag>
                  <tag role="typerefnum">item 1c</tag>
                </tags>
                <para xml:id="Sx1.I1.i1.i3.p1">
                  <p>Did you discuss any potential negative societal impacts of your work?
[No] We do not think that there are negative societal impacts as a result of this work.</p>
                </para>
              </item>
              <item xml:id="Sx1.I1.i1.i4">
                <tags>
                  <tag>(d)</tag>
                  <tag role="autoref">item d</tag>
                  <tag role="refnum">1d</tag>
                  <tag role="typerefnum">item 1d</tag>
                </tags>
                <para xml:id="Sx1.I1.i1.i4.p1">
                  <p>Have you read the ethics review guidelines and ensured that your paper conforms to them?
<!--  %****␣svrhm_2022.tex␣Line␣200␣**** -->[Yes] We have read the ethics review guidelines.</p>
                </para>
              </item>
            </enumerate>
          </para>
        </item>
        <item xml:id="Sx1.I1.i2">
          <tags>
            <tag>2.</tag>
            <tag role="autoref">item 2</tag>
            <tag role="refnum">2</tag>
            <tag role="typerefnum">item 2</tag>
          </tags>
          <para xml:id="Sx1.I1.i2.p1">
            <p>If you are including theoretical results…</p>
            <enumerate xml:id="Sx1.I1.I2">
              <item xml:id="Sx1.I1.i2.i1">
                <tags>
                  <tag>(a)</tag>
                  <tag role="autoref">item a</tag>
                  <tag role="refnum">2a</tag>
                  <tag role="typerefnum">item 2a</tag>
                </tags>
                <para xml:id="Sx1.I1.i2.i1.p1">
                  <p>Did you state the full set of assumptions of all theoretical results?
[N/A] We do not include theoretical results.</p>
                </para>
              </item>
              <item xml:id="Sx1.I1.i2.i2">
                <tags>
                  <tag>(b)</tag>
                  <tag role="autoref">item b</tag>
                  <tag role="refnum">2b</tag>
                  <tag role="typerefnum">item 2b</tag>
                </tags>
                <para xml:id="Sx1.I1.i2.i2.p1">
                  <p>Did you include complete proofs of all theoretical results?
[N/A] We do not include theoretical results.</p>
                </para>
              </item>
            </enumerate>
          </para>
        </item>
        <item xml:id="Sx1.I1.i3">
          <tags>
            <tag>3.</tag>
            <tag role="autoref">item 3</tag>
            <tag role="refnum">3</tag>
            <tag role="typerefnum">item 3</tag>
          </tags>
          <para xml:id="Sx1.I1.i3.p1">
            <p>If you ran experiments…</p>
            <enumerate xml:id="Sx1.I1.I3">
              <item xml:id="Sx1.I1.i3.i1">
                <tags>
                  <tag>(a)</tag>
                  <tag role="autoref">item a</tag>
                  <tag role="refnum">3a</tag>
                  <tag role="typerefnum">item 3a</tag>
                </tags>
                <para xml:id="Sx1.I1.i3.i1.p1">
                  <p>Did you include the code, data, and instructions needed to reproduce the main experimental results (either in the supplemental material or as a URL)?
[No] However, the code, data, and instructions are available upon request.</p>
                </para>
              </item>
              <item xml:id="Sx1.I1.i3.i2">
                <tags>
                  <tag>(b)</tag>
                  <tag role="autoref">item b</tag>
                  <tag role="refnum">3b</tag>
                  <tag role="typerefnum">item 3b</tag>
                </tags>
                <para xml:id="Sx1.I1.i3.i2.p1">
                  <p>Did you specify all the training details (e.g., data splits, hyperparameters, how they were chosen)?
[Yes] We have specified the training details, see sections <ref labelref="LABEL:sec:model"/> and <ref labelref="LABEL:sec:model_imp"/>.</p>
                </para>
              </item>
              <item xml:id="Sx1.I1.i3.i3">
                <tags>
                  <tag>(c)</tag>
                  <tag role="autoref">item c</tag>
                  <tag role="refnum">3c</tag>
                  <tag role="typerefnum">item 3c</tag>
                </tags>
                <para xml:id="Sx1.I1.i3.i3.p1">
                  <p>Did you report error bars (e.g., with respect to the random seed after running experiments multiple times)?
[Yes] We report error bars for the regression analysis (see section <ref labelref="LABEL:sec:results"/>) and in figure <ref labelref="LABEL:fig:kl_over_epochs"/>.</p>
                </para>
              </item>
              <item xml:id="Sx1.I1.i3.i4">
                <tags>
                  <tag>(d)</tag>
                  <tag role="autoref">item d</tag>
                  <tag role="refnum">3d</tag>
                  <tag role="typerefnum">item 3d</tag>
                </tags>
                <para xml:id="Sx1.I1.i3.i4.p1">
                  <p>Did you include the total amount of compute and the type of resources used (e.g., type of GPUs, internal cluster, or cloud provider)?
[No] We did not track the total amount of compute used for testing. However, we have mentioned the training time and GPU used, see section <ref labelref="LABEL:sec:model_imp"/>.</p>
                </para>
              </item>
            </enumerate>
          </para>
<!--  %****␣svrhm_2022.tex␣Line␣225␣**** -->        </item>
        <item xml:id="Sx1.I1.i4">
          <tags>
            <tag>4.</tag>
            <tag role="autoref">item 4</tag>
            <tag role="refnum">4</tag>
            <tag role="typerefnum">item 4</tag>
          </tags>
          <para xml:id="Sx1.I1.i4.p1">
            <p>If you are using existing assets (e.g., code, data, models) or curating/releasing new assets…</p>
            <enumerate xml:id="Sx1.I1.I4">
              <item xml:id="Sx1.I1.i4.i1">
                <tags>
                  <tag>(a)</tag>
                  <tag role="autoref">item a</tag>
                  <tag role="refnum">4a</tag>
                  <tag role="typerefnum">item 4a</tag>
                </tags>
                <para xml:id="Sx1.I1.i4.i1.p1">
                  <p>If your work uses existing assets, did you cite the creators?
[Yes] Our model implementation builds on a previous implementation, which we cited in section <ref labelref="LABEL:sec:model_imp"/>.</p>
                </para>
              </item>
              <item xml:id="Sx1.I1.i4.i2">
                <tags>
                  <tag>(b)</tag>
                  <tag role="autoref">item b</tag>
                  <tag role="refnum">4b</tag>
                  <tag role="typerefnum">item 4b</tag>
                </tags>
                <para xml:id="Sx1.I1.i4.i2.p1">
                  <p>Did you mention the license of the assets?
[No] We did not mention the license in the manuscript; however, the previous implementation is licensed under the MIT License.</p>
                </para>
              </item>
              <item xml:id="Sx1.I1.i4.i3">
                <tags>
                  <tag>(c)</tag>
                  <tag role="autoref">item c</tag>
                  <tag role="refnum">4c</tag>
                  <tag role="typerefnum">item 4c</tag>
                </tags>
                <para xml:id="Sx1.I1.i4.i3.p1">
                  <p>Did you include any new assets either in the supplemental material or as a URL?
[No] However, the data set we created is available upon request.</p>
                </para>
              </item>
              <item xml:id="Sx1.I1.i4.i4">
                <tags>
                  <tag>(d)</tag>
                  <tag role="autoref">item d</tag>
                  <tag role="refnum">4d</tag>
                  <tag role="typerefnum">item 4d</tag>
                </tags>
                <para xml:id="Sx1.I1.i4.i4.p1">
                  <p>Did you discuss whether and how consent was obtained from people whose data you’re using/curating?
[N/A] We did not use or curate data from other people.</p>
                </para>
              </item>
              <item xml:id="Sx1.I1.i4.i5">
                <tags>
                  <tag>(e)</tag>
                  <tag role="autoref">item e</tag>
                  <tag role="refnum">4e</tag>
                  <tag role="typerefnum">item 4e</tag>
                </tags>
                <para xml:id="Sx1.I1.i4.i5.p1">
                  <p>Did you discuss whether the data you are using/curating contains personally identifiable information or offensive content?
[N/A] We did not use or curate data from other people.</p>
                </para>
              </item>
            </enumerate>
          </para>
        </item>
        <item xml:id="Sx1.I1.i5">
          <tags>
            <tag>5.</tag>
            <tag role="autoref">item 5</tag>
            <tag role="refnum">5</tag>
            <tag role="typerefnum">item 5</tag>
          </tags>
          <para xml:id="Sx1.I1.i5.p1">
            <p>If you used crowdsourcing or conducted research with human subjects…</p>
            <enumerate xml:id="Sx1.I1.I5">
              <item xml:id="Sx1.I1.i5.i1">
                <tags>
                  <tag>(a)</tag>
                  <tag role="autoref">item a</tag>
                  <tag role="refnum">5a</tag>
                  <tag role="typerefnum">item 5a</tag>
                </tags>
                <para xml:id="Sx1.I1.i5.i1.p1">
                  <p>Did you include the full text of instructions given to participants and screenshots, if applicable?
[N/A] We did not use crowdsourcing or conduct research with human subjects.</p>
                </para>
              </item>
              <item xml:id="Sx1.I1.i5.i2">
                <tags>
                  <tag>(b)</tag>
                  <tag role="autoref">item b</tag>
                  <tag role="refnum">5b</tag>
                  <tag role="typerefnum">item 5b</tag>
                </tags>
                <para xml:id="Sx1.I1.i5.i2.p1">
                  <p>Did you describe any potential participant risks, with links to Institutional Review Board (IRB) approvals, if applicable?
[N/A] We did not use crowdsourcing or conduct research with human subjects.</p>
                </para>
              </item>
              <item xml:id="Sx1.I1.i5.i3">
                <tags>
                  <tag>(c)</tag>
                  <tag role="autoref">item c</tag>
                  <tag role="refnum">5c</tag>
                  <tag role="typerefnum">item 5c</tag>
                </tags>
                <para xml:id="Sx1.I1.i5.i3.p1">
                  <p>Did you include the estimated hourly wage paid to participants and the total amount spent on participant compensation?
[N/A] We did not use crowdsourcing or conduct research with human subjects.</p>
                </para>
              </item>
            </enumerate>
<!--  %****␣svrhm_2022.tex␣Line␣250␣**** -->          </para>
        </item>
      </enumerate>
    </para>
<!--  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -->  </section>
  <appendix inlist="toc" xml:id="A1">
    <tags>
      <tag>Appendix A</tag>
      <tag role="autoref">Appendix A</tag>
      <tag role="refnum">A</tag>
      <tag role="typerefnum">Appendix A</tag>
    </tags>
    <title><tag close=" ">Appendix A</tag>Appendix</title>
    <toctitle><tag close=" ">A</tag>Appendix</toctitle>
    <subsection inlist="toc" labels="LABEL:sec:stim_var" xml:id="A1.SS1">
      <tags>
        <tag>A.1</tag>
        <tag role="autoref">subsection A.1</tag>
        <tag role="refnum">A.1</tag>
        <tag role="typerefnum">§A.1</tag>
      </tags>
      <title><tag close=" ">A.1</tag>Stimulus variations</title>
      <para xml:id="A1.SS1.p1">
        <p>The following variables were randomly varied in order to ensure sufficient variability in the data set: lower block size, lower block color, upper block color, lower block rotation, upper block rotation, upper block position (offset) and camera angle. Additionally, the shape of the upper block was varied: half of the trials featured a cube as an upper block, while the other half featured an L-shaped block with randomly sampled side lengths.</p>
      </para>
    </subsection>
    <subsection inlist="toc" labels="LABEL:sec:model_imp" xml:id="A1.SS2">
      <tags>
        <tag>A.2</tag>
        <tag role="autoref">subsection A.2</tag>
        <tag role="refnum">A.2</tag>
        <tag role="typerefnum">§A.2</tag>
      </tags>
      <title><tag close=" ">A.2</tag>Model implementation</title>
      <para xml:id="A1.SS2.p1">
        <p>The recurrent state space model (RSSM) can be seen as a sequential VAE <cite class="ltx_citemacro_citep">(<bibref bibrefs="hafner2019learning,saxena2021clockwork" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>. The deterministic component <Math mode="inline" tex="h_{t}" text="h _ t" xml:id="A1.SS2.p1.m1">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">h</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
              </XMApp>
            </XMath>
          </Math> and stochastic component <Math mode="inline" tex="s_{t}" text="s _ t" xml:id="A1.SS2.p1.m2">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">s</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
              </XMApp>
            </XMath>
          </Math> depend on the deterministic and stochastic components at the previous time step through <Math mode="inline" tex="f(h_{t-1},s_{t-1})" text="f * open-interval@(h _ (t - 1), s _ (t - 1))" xml:id="A1.SS2.p1.m3">
            <XMath>
              <XMApp>
                <XMTok meaning="times" role="MULOP">⁢</XMTok>
                <XMTok font="italic" role="UNKNOWN">f</XMTok>
                <XMDual>
                  <XMApp>
                    <XMTok meaning="open-interval"/>
                    <XMRef idref="A1.SS2.p1.m3.1"/>
                    <XMRef idref="A1.SS2.p1.m3.2"/>
                  </XMApp>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="false">(</XMTok>
                    <XMApp xml:id="A1.SS2.p1.m3.1">
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" role="UNKNOWN">h</XMTok>
                      <XMApp>
                        <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                        <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                      </XMApp>
                    </XMApp>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMApp xml:id="A1.SS2.p1.m3.2">
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" role="UNKNOWN">s</XMTok>
                      <XMApp>
                        <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                        <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                      </XMApp>
                    </XMApp>
                    <XMTok role="CLOSE" stretchy="false">)</XMTok>
                  </XMWrap>
                </XMDual>
              </XMApp>
            </XMath>
          </Math>, which is implemented through a gated recurrent unit cell. As outlined by <cite class="ltx_citemacro_citet"><bibref bibrefs="hafner2019learning" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
              <bibrefphrase>(</bibrefphrase>
              <bibrefphrase>)</bibrefphrase>
            </bibref></cite>, the individual components of the RSSM are:</p>
      </para>
      <para xml:id="A1.SS2.p2">
        <equationgroup class="ltx_eqn_align" xml:id="A1.EGx2">
          <equation xml:id="A1.Ex1">
            <MathFork>
              <Math tex="\displaystyle\text{Deterministic state model:}h_{t}" text="[Deterministic state model:] * h _ t" xml:id="A1.Ex1.m4">
                <XMath>
                  <XMApp>
                    <XMTok meaning="times" role="MULOP">⁢</XMTok>
                    <XMText class="ltx_markedasmath">Deterministic state model:</XMText>
                    <XMApp>
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" role="UNKNOWN">h</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                    </XMApp>
                  </XMApp>
                </XMath>
              </Math>
              <MathBranch>
                <td align="left"><text class="ltx_markedasmath">Deterministic state model:</text></td>
                <td align="right"><Math mode="inline" tex="\displaystyle h_{t}" text="h _ t" xml:id="A1.Ex1.m2">
                    <XMath>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="italic" role="UNKNOWN">h</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                      </XMApp>
                    </XMath>
                  </Math></td>
              </MathBranch>
            </MathFork>
            <MathFork>
              <Math tex="\displaystyle=f(h_{t-1},s_{t-1})" text="absent = f * open-interval@(h _ (t - 1), s _ (t - 1))" xml:id="A1.Ex1.m5">
                <XMath>
                  <XMApp>
                    <XMTok meaning="equals" role="RELOP">=</XMTok>
                    <XMTok meaning="absent"/>
                    <XMApp>
                      <XMTok meaning="times" role="MULOP">⁢</XMTok>
                      <XMTok font="italic" role="UNKNOWN">f</XMTok>
                      <XMDual>
                        <XMApp>
                          <XMTok meaning="open-interval"/>
                          <XMRef idref="A1.Ex1.m5.1"/>
                          <XMRef idref="A1.Ex1.m5.2"/>
                        </XMApp>
                        <XMWrap>
                          <XMTok role="OPEN" stretchy="false">(</XMTok>
                          <XMApp xml:id="A1.Ex1.m5.1">
                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                            <XMTok font="italic" role="UNKNOWN">h</XMTok>
                            <XMApp>
                              <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                              <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                            </XMApp>
                          </XMApp>
                          <XMTok role="PUNCT">,</XMTok>
                          <XMApp xml:id="A1.Ex1.m5.2">
                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                            <XMTok font="italic" role="UNKNOWN">s</XMTok>
                            <XMApp>
                              <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                              <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                            </XMApp>
                          </XMApp>
                          <XMTok role="CLOSE" stretchy="false">)</XMTok>
                        </XMWrap>
                      </XMDual>
                    </XMApp>
                  </XMApp>
                </XMath>
              </Math>
              <MathBranch>
                <td align="left"><Math mode="inline" tex="\displaystyle=f(h_{t-1},s_{t-1})" text="absent = f * open-interval@(h _ (t - 1), s _ (t - 1))" xml:id="A1.Ex1.m3">
                    <XMath>
                      <XMApp>
                        <XMTok meaning="equals" role="RELOP">=</XMTok>
                        <XMTok meaning="absent"/>
                        <XMApp>
                          <XMTok meaning="times" role="MULOP">⁢</XMTok>
                          <XMTok font="italic" role="UNKNOWN">f</XMTok>
                          <XMDual>
                            <XMApp>
                              <XMTok meaning="open-interval"/>
                              <XMRef idref="A1.Ex1.m3.1"/>
                              <XMRef idref="A1.Ex1.m3.2"/>
                            </XMApp>
                            <XMWrap>
                              <XMTok role="OPEN" stretchy="false">(</XMTok>
                              <XMApp xml:id="A1.Ex1.m3.1">
                                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                <XMTok font="italic" role="UNKNOWN">h</XMTok>
                                <XMApp>
                                  <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                  <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                </XMApp>
                              </XMApp>
                              <XMTok role="PUNCT">,</XMTok>
                              <XMApp xml:id="A1.Ex1.m3.2">
                                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                <XMTok font="italic" role="UNKNOWN">s</XMTok>
                                <XMApp>
                                  <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                  <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                </XMApp>
                              </XMApp>
                              <XMTok role="CLOSE" stretchy="false">)</XMTok>
                            </XMWrap>
                          </XMDual>
                        </XMApp>
                      </XMApp>
                    </XMath>
                  </Math></td>
              </MathBranch>
            </MathFork>
          </equation>
          <equation xml:id="A1.Ex2">
            <MathFork>
              <Math tex="\displaystyle\text{Stochastic state model:}s_{t}" text="[Stochastic state model:] * s _ t" xml:id="A1.Ex2.m4">
                <XMath>
                  <XMApp>
                    <XMTok meaning="times" role="MULOP">⁢</XMTok>
                    <XMText class="ltx_markedasmath">Stochastic state model:</XMText>
                    <XMApp>
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" role="UNKNOWN">s</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                    </XMApp>
                  </XMApp>
                </XMath>
              </Math>
              <MathBranch>
                <td align="left"><text class="ltx_markedasmath">Stochastic state model:</text></td>
                <td align="right"><Math mode="inline" tex="\displaystyle s_{t}" text="s _ t" xml:id="A1.Ex2.m2">
                    <XMath>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="italic" role="UNKNOWN">s</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                      </XMApp>
                    </XMath>
                  </Math></td>
              </MathBranch>
            </MathFork>
            <MathFork>
              <Math tex="\displaystyle\sim p(s_{t}\mid h_{t})" text="absent similar-to p * conditional@(s _ t, h _ t)" xml:id="A1.Ex2.m5">
                <XMath>
                  <XMApp>
                    <XMTok meaning="similar-to" name="sim" role="RELOP">∼</XMTok>
                    <XMTok meaning="absent"/>
                    <XMApp>
                      <XMTok meaning="times" role="MULOP">⁢</XMTok>
                      <XMTok font="italic" role="UNKNOWN">p</XMTok>
                      <XMDual>
                        <XMRef idref="A1.Ex2.m5.1"/>
                        <XMWrap>
                          <XMTok role="OPEN" stretchy="false">(</XMTok>
                          <XMApp xml:id="A1.Ex2.m5.1">
                            <XMTok meaning="conditional" name="mid" role="MODIFIEROP">∣</XMTok>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                              <XMTok font="italic" role="UNKNOWN">s</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                              <XMTok font="italic" role="UNKNOWN">h</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                          </XMApp>
                          <XMTok role="CLOSE" stretchy="false">)</XMTok>
                        </XMWrap>
                      </XMDual>
                    </XMApp>
                  </XMApp>
                </XMath>
              </Math>
              <MathBranch>
                <td align="left"><Math mode="inline" tex="\displaystyle\sim p(s_{t}\mid h_{t})" text="absent similar-to p * conditional@(s _ t, h _ t)" xml:id="A1.Ex2.m3">
                    <XMath>
                      <XMApp>
                        <XMTok meaning="similar-to" name="sim" role="RELOP">∼</XMTok>
                        <XMTok meaning="absent"/>
                        <XMApp>
                          <XMTok meaning="times" role="MULOP">⁢</XMTok>
                          <XMTok font="italic" role="UNKNOWN">p</XMTok>
                          <XMDual>
                            <XMRef idref="A1.Ex2.m3.1"/>
                            <XMWrap>
                              <XMTok role="OPEN" stretchy="false">(</XMTok>
                              <XMApp xml:id="A1.Ex2.m3.1">
                                <XMTok meaning="conditional" name="mid" role="MODIFIEROP">∣</XMTok>
                                <XMApp>
                                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                  <XMTok font="italic" role="UNKNOWN">s</XMTok>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                </XMApp>
                                <XMApp>
                                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                  <XMTok font="italic" role="UNKNOWN">h</XMTok>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                </XMApp>
                              </XMApp>
                              <XMTok role="CLOSE" stretchy="false">)</XMTok>
                            </XMWrap>
                          </XMDual>
                        </XMApp>
                      </XMApp>
                    </XMath>
                  </Math></td>
              </MathBranch>
            </MathFork>
          </equation>
          <equation xml:id="A1.Ex3">
            <MathFork>
              <Math tex="\displaystyle\text{Observation model:}o_{t}" text="[Observation model:] * o _ t" xml:id="A1.Ex3.m4">
                <XMath>
                  <XMApp>
                    <XMTok meaning="times" role="MULOP">⁢</XMTok>
                    <XMText class="ltx_markedasmath">Observation model:</XMText>
                    <XMApp>
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" role="UNKNOWN">o</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                    </XMApp>
                  </XMApp>
                </XMath>
              </Math>
              <MathBranch>
                <td align="left"><text class="ltx_markedasmath">Observation model:</text></td>
                <td align="right"><Math mode="inline" tex="\displaystyle o_{t}" text="o _ t" xml:id="A1.Ex3.m2">
                    <XMath>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="italic" role="UNKNOWN">o</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                      </XMApp>
                    </XMath>
                  </Math></td>
              </MathBranch>
            </MathFork>
            <MathFork>
              <Math tex="\displaystyle\sim p(o_{t}\mid h_{t},s_{t})" text="absent similar-to p * conditional@(o _ t, list@(h _ t, s _ t))" xml:id="A1.Ex3.m5">
                <XMath>
                  <XMApp>
                    <XMTok meaning="similar-to" name="sim" role="RELOP">∼</XMTok>
                    <XMTok meaning="absent"/>
                    <XMApp>
                      <XMTok meaning="times" role="MULOP">⁢</XMTok>
                      <XMTok font="italic" role="UNKNOWN">p</XMTok>
                      <XMDual>
                        <XMRef idref="A1.Ex3.m5.1"/>
                        <XMWrap>
                          <XMTok role="OPEN" stretchy="false">(</XMTok>
                          <XMApp xml:id="A1.Ex3.m5.1">
                            <XMTok meaning="conditional" name="mid" role="MODIFIEROP">∣</XMTok>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                              <XMTok font="italic" role="UNKNOWN">o</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMDual>
                              <XMApp>
                                <XMTok meaning="list"/>
                                <XMRef idref="A1.Ex3.m5.1.1"/>
                                <XMRef idref="A1.Ex3.m5.1.2"/>
                              </XMApp>
                              <XMWrap>
                                <XMApp xml:id="A1.Ex3.m5.1.1">
                                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                  <XMTok font="italic" role="UNKNOWN">h</XMTok>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                </XMApp>
                                <XMTok role="PUNCT">,</XMTok>
                                <XMApp xml:id="A1.Ex3.m5.1.2">
                                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                  <XMTok font="italic" role="UNKNOWN">s</XMTok>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                </XMApp>
                              </XMWrap>
                            </XMDual>
                          </XMApp>
                          <XMTok role="CLOSE" stretchy="false">)</XMTok>
                        </XMWrap>
                      </XMDual>
                    </XMApp>
                  </XMApp>
                </XMath>
              </Math>
              <MathBranch>
                <td align="left"><Math mode="inline" tex="\displaystyle\sim p(o_{t}\mid h_{t},s_{t})" text="absent similar-to p * conditional@(o _ t, list@(h _ t, s _ t))" xml:id="A1.Ex3.m3">
                    <XMath>
                      <XMApp>
                        <XMTok meaning="similar-to" name="sim" role="RELOP">∼</XMTok>
                        <XMTok meaning="absent"/>
                        <XMApp>
                          <XMTok meaning="times" role="MULOP">⁢</XMTok>
                          <XMTok font="italic" role="UNKNOWN">p</XMTok>
                          <XMDual>
                            <XMRef idref="A1.Ex3.m3.1"/>
                            <XMWrap>
                              <XMTok role="OPEN" stretchy="false">(</XMTok>
                              <XMApp xml:id="A1.Ex3.m3.1">
                                <XMTok meaning="conditional" name="mid" role="MODIFIEROP">∣</XMTok>
                                <XMApp>
                                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                  <XMTok font="italic" role="UNKNOWN">o</XMTok>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                </XMApp>
                                <XMDual>
                                  <XMApp>
                                    <XMTok meaning="list"/>
                                    <XMRef idref="A1.Ex3.m3.1.1"/>
                                    <XMRef idref="A1.Ex3.m3.1.2"/>
                                  </XMApp>
                                  <XMWrap>
                                    <XMApp xml:id="A1.Ex3.m3.1.1">
                                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                      <XMTok font="italic" role="UNKNOWN">h</XMTok>
                                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                    </XMApp>
                                    <XMTok role="PUNCT">,</XMTok>
                                    <XMApp xml:id="A1.Ex3.m3.1.2">
                                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                      <XMTok font="italic" role="UNKNOWN">s</XMTok>
                                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                    </XMApp>
                                  </XMWrap>
                                </XMDual>
                              </XMApp>
                              <XMTok role="CLOSE" stretchy="false">)</XMTok>
                            </XMWrap>
                          </XMDual>
                        </XMApp>
                      </XMApp>
                    </XMath>
                  </Math></td>
              </MathBranch>
            </MathFork>
          </equation>
        </equationgroup>
      </para>
      <para xml:id="A1.SS2.p3">
        <p>The models were implemented in PyTorch <cite class="ltx_citemacro_citep">(<bibref bibrefs="paszke2019pytorch" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>. For all models, the size of the stochastic hidden dimension <Math mode="inline" tex="s_{t}" text="s _ t" xml:id="A1.SS2.p3.m1">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">s</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
              </XMApp>
            </XMath>
          </Math> was kept at 20, while the size of the deterministic hidden dimension <Math mode="inline" tex="h_{t}" text="h _ t" xml:id="A1.SS2.p3.m2">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">h</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
              </XMApp>
            </XMath>
          </Math> was set to 200, as in previous implementations of the RSSM <cite class="ltx_citemacro_citep">(<bibref bibrefs="hafner2019learning,saxena2021clockwork" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>.</p>
      </para>
      <para xml:id="A1.SS2.p4">
        <p>We used the encoder and decoder from <cite class="ltx_citemacro_citet"><bibref bibrefs="dittadi2020transfer" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
              <bibrefphrase>(</bibrefphrase>
              <bibrefphrase>)</bibrefphrase>
            </bibref></cite>. The encoder consists of five blocks. The first block consists of a convolutional layer with a kernel of size 5 and a stride of 2 and a padding of 2, followed by a leaky ReLU activation function, followed by two residual blocks. The second block consists of a convolutional layer with a kernel of size 1 and a stride of 1 and no padding, followed by average pooling with a kernel of size 2, followed by two residual blocks. The third block consists of average pooling with a kernel of size 2, followed by two residual blocks. The fourth block consists of a convolutional layer with a kernel of size 1 and a stride of 1 and no padding, followed by average pooling with a kernel of size 2, followed by two residual blocks. The fifth block consists of average pooling with a kernel of size 2, followed by two residual blocks.</p>
      </para>
      <para xml:id="A1.SS2.p5">
        <p>The decoder consists of five blocks. The first block consists of two residual blocks, followed by upsampling with a scale factor of 2. The second block consists of two residual blocks, followed by a deconvolutional layer with a kernel size of 1 and a stride of 1, followed by upsampling with a scale factor of 2. The third block again consists of two residual blocks, followed by upsampling with a scale factor of 2. The fourth block consists of two residual blocks, followed by a deconvolutional layer with a kernel size of 1 and a stride of 1, followed by upsampling with a scale factor of 2. The fifth block consists of two residual blocks, followed by upsampling with a scale factor of 2, followed by a leaky ReLU activation function, followed by a deconvolutional layer with a kernel size of 5 and a stride of 1 and a padding of 2.</p>
      </para>
      <para xml:id="A1.SS2.p6">
        <p>The models were trained for 200 epochs using a batch size of 32. The stimulus sets were randomly split into 99,000 training sequences and 1,000 validation sequences. The loss function was optimized using the Adam optimiser with a learning rate of 0.001 <cite class="ltx_citemacro_citep">(<bibref bibrefs="kingma2014adam" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>, which was divided by 10 every 50 epochs. The models were trained on an NVIDIA Quadro RTX 5000 for roughly 7 days. Our implementation of the RSSM borrows from a previous implementation on <ref class="ltx_href" href="https://github.com/cross32768/PlaNet_PyTorch">GitHub</ref>. The complete code for this project, including our model implementation, is available upon request.</p>
      </para>
      <pagination role="newpage"/>
    </subsection>
    <subsection inlist="toc" labels="LABEL:sec:figures" xml:id="A1.SS3">
      <tags>
        <tag>A.3</tag>
        <tag role="autoref">subsection A.3</tag>
        <tag role="refnum">A.3</tag>
        <tag role="typerefnum">§A.3</tag>
      </tags>
      <title><tag close=" ">A.3</tag>Figures</title>
      <figure inlist="lof" placement="!h" xml:id="A1.F4">
        <tags>
          <tag>Figure 4</tag>
          <tag role="autoref">Figure 4</tag>
          <tag role="refnum">4</tag>
          <tag role="typerefnum">Figure 4</tag>
        </tags>
        <graphics candidates="figures/RSSMAlt_Cubes_seed_17_beta_1.0_state_20_hidden_200_nll_False_kloverframes_cond_0_rm_3_open.pdf" class="ltx_centering" graphic="figures/RSSMAlt_Cubes_seed_17_beta_1.0_state_20_hidden_200_nll_False_kloverframes_cond_0_rm_3_open.pdf" options="width=433.62pt" xml:id="A1.F4.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">4</tag>The first row shows the surprise for the expected and violated test sequences of the contact or no contact rule. The second row shows the expected test sequence. The third row shows the violated test sequence. The last row shows the open-loop predictions from the model, given the first two frames of the violated test sequence. It is important to note that the first three frames were removed for this plot, as the uncertainty is very high when the model is first given the sequence.</toccaption>
        <caption class="ltx_centering"><tag close=": ">Figure 4</tag>The first row shows the surprise for the expected and violated test sequences of the contact or no contact rule. The second row shows the expected test sequence. The third row shows the violated test sequence. The last row shows the open-loop predictions from the model, given the first two frames of the violated test sequence. It is important to note that the first three frames were removed for this plot, as the uncertainty is very high when the model is first given the sequence.</caption>
      </figure>
      <figure inlist="lof" placement="!t" xml:id="A1.F5">
        <tags>
          <tag>Figure 5</tag>
          <tag role="autoref">Figure 5</tag>
          <tag role="refnum">5</tag>
          <tag role="typerefnum">Figure 5</tag>
        </tags>
<!--  %****␣svrhm_2022.tex␣Line␣300␣**** -->        <graphics candidates="figures/RSSMAlt_Cubes_seed_17_beta_1.0_state_20_hidden_200_nll_False_kloverframes_cond_1_rm_3_open.pdf" class="ltx_centering" graphic="figures/RSSMAlt_Cubes_seed_17_beta_1.0_state_20_hidden_200_nll_False_kloverframes_cond_1_rm_3_open.pdf" options="width=433.62pt" xml:id="A1.F5.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">5</tag>The first row shows the surprise for the expected and violated test sequences of the type of contact rule. The second row shows the expected test sequence. The third row shows the violated test sequence. The last row shows the open-loop predictions from the model, given the first two frames of the violated test sequence. It is important to note that the first three frames were removed for this plot, as the uncertainty is very high when the model is first given the sequence.</toccaption>
        <caption class="ltx_centering"><tag close=": ">Figure 5</tag>The first row shows the surprise for the expected and violated test sequences of the type of contact rule. The second row shows the expected test sequence. The third row shows the violated test sequence. The last row shows the open-loop predictions from the model, given the first two frames of the violated test sequence. It is important to note that the first three frames were removed for this plot, as the uncertainty is very high when the model is first given the sequence.</caption>
      </figure>
      <figure inlist="lof" placement="!t" xml:id="A1.F6">
        <tags>
          <tag>Figure 6</tag>
          <tag role="autoref">Figure 6</tag>
          <tag role="refnum">6</tag>
          <tag role="typerefnum">Figure 6</tag>
        </tags>
        <graphics candidates="figures/RSSMAlt_Cubes_seed_17_beta_1.0_state_20_hidden_200_nll_False_kloverframes_cond_3_rm_3_open.pdf" class="ltx_centering" graphic="figures/RSSMAlt_Cubes_seed_17_beta_1.0_state_20_hidden_200_nll_False_kloverframes_cond_3_rm_3_open.pdf" options="width=433.62pt" xml:id="A1.F6.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">6</tag>The first row shows the surprise for the expected and violated test sequences of the shape rule. The second row shows the expected test sequence. The third row shows the violated test sequence. The last row shows the open-loop predictions from the model, given the first two frames of the violated test sequence. It is important to note that the first three frames were removed for this plot, as the uncertainty is very high when the model is first given the sequence.</toccaption>
        <caption class="ltx_centering"><tag close=": ">Figure 6</tag>The first row shows the surprise for the expected and violated test sequences of the shape rule. The second row shows the expected test sequence. The third row shows the violated test sequence. The last row shows the open-loop predictions from the model, given the first two frames of the violated test sequence. It is important to note that the first three frames were removed for this plot, as the uncertainty is very high when the model is first given the sequence.</caption>
      </figure>
    </subsection>
  </appendix>
</document>
