<?xml version="1.0" encoding="UTF-8"?>
<?latexml searchpaths="/home/japhy/scienceReplication.artiswrong.com/paper_files/arxiv/2007.12348/latex_extracted"?>
<?latexml class="article"?>
<?latexml package="iclr2021_conference,times"?>
<?latexml package="inputenc" options="utf8"?>
<?latexml package="fontenc" options="T1"?>
<!--  %basic --><?latexml package="color,xcolor"?>
<?latexml package="epsfig"?>
<?latexml package="graphicx"?>
<!--  %figure␣and␣table --><?latexml package="adjustbox"?>
<?latexml package="array"?>
<?latexml package="booktabs"?>
<?latexml package="colortbl"?>
<?latexml package="float,wrapfig"?>
<?latexml package="hhline"?>
<?latexml package="multirow"?>
<?latexml package="subcaption"?>
<?latexml package="caption" options="font=small"?>
<!--  %font␣and␣character --><?latexml package="amsmath,amsfonts,amsthm,amssymb"?>
<?latexml package="bm"?>
<?latexml package="nicefrac"?>
<?latexml package="microtype"?>
<?latexml package="mathtools"?>
<!--  %layout --><!--  %****␣packages.tex␣Line␣25␣**** --><?latexml package="changepage"?>
<?latexml package="extramarks"?>
<?latexml package="fancyhdr"?>
<?latexml package="lastpage"?>
<?latexml package="setspace"?>
<?latexml package="soul"?>
<?latexml package="xspace"?>
<!--  %ref --><?latexml package="hyperref" options="pagebackref=true,breaklinks=true,colorlinks,citecolor=gray"?>
<?latexml package="url"?>
<?latexml package="algorithm, algorithmic"?>
<?latexml package="todonotes"?>
<?latexml package="enumitem"?>
<?latexml package="titlesec"?>
<?latexml package="bbm"?>
<!--  %require␣xspace,␣array --><!--  %%␣layout --><!--  %%␣notations --><!--  %datasets --><!--  %****␣macros.tex␣Line␣25␣**** --><!--  %****␣macros.tex␣Line␣50␣**** --><!--  %%␣comments --><!--  %****␣macros.tex␣Line␣75␣**** --><!--  %\newcommand{\mybf}[1]{\vspace{4pt}\noindent\textbf{#1}} --><!--  %\setlength{\belowcaptionskip}{-3pt} --><!--  %\addtolength{\parskip}{-0.8mm} --><!--  %%%%%␣NEW␣MATH␣DEFINITIONS␣%%%%% --><?latexml package="amsmath,amsfonts,bm"?>
<!--  %Mark␣sections␣of␣captions␣for␣referring␣to␣divisions␣of␣figures --><!--  %Highlight␣a␣newly␣defined␣term --><!--  %Figure␣reference,␣lower-case. --><!--  %Figure␣reference,␣capital.␣For␣start␣of␣sentence --><!--  %****␣math_commands.tex␣Line␣25␣**** --><!--  %Section␣reference,␣lower-case. --><!--  %Section␣reference,␣capital. --><!--  %Reference␣to␣two␣sections. --><!--  %Reference␣to␣three␣sections. --><!--  %Reference␣to␣an␣equation,␣lower-case. --><!--  %Reference␣to␣an␣equation,␣upper␣case --><!--  %A␣raw␣reference␣to␣an␣equation__avoid␣using␣if␣possible --><!--  %Reference␣to␣a␣chapter,␣lower-case. --><!--  %Reference␣to␣an␣equation,␣upper␣case. --><!--  %Reference␣to␣a␣range␣of␣chapters --><!--  %Reference␣to␣an␣algorithm,␣lower-case. --><!--  %Reference␣to␣an␣algorithm,␣upper␣case. --><!--  %****␣math_commands.tex␣Line␣50␣**** --><!--  %Reference␣to␣a␣part,␣lower␣case --><!--  %Reference␣to␣a␣part,␣upper␣case --><!--  %Random␣variables --><!--  %****␣math_commands.tex␣Line␣75␣**** --><!--  %rm␣is␣already␣a␣command,␣just␣don’t␣name␣any␣random␣variables␣m --><!--  %Random␣vectors --><!--  %****␣math_commands.tex␣Line␣100␣**** --><!--  %****␣math_commands.tex␣Line␣125␣**** --><!--  %Elements␣of␣random␣vectors --><!--  %****␣math_commands.tex␣Line␣150␣**** --><!--  %Random␣matrices --><!--  %****␣math_commands.tex␣Line␣175␣**** --><!--  %Elements␣of␣random␣matrices --><!--  %****␣math_commands.tex␣Line␣200␣**** --><!--  %Vectors --><!--  %****␣math_commands.tex␣Line␣225␣**** --><!--  %Elements␣of␣vectors --><!--  %****␣math_commands.tex␣Line␣250␣**** --><!--  %****␣math_commands.tex␣Line␣275␣**** --><!--  %Matrix --><!--  %****␣math_commands.tex␣Line␣300␣**** --><!--  %Tensor --><!--  %****␣math_commands.tex␣Line␣325␣**** --><!--  %Graph --><!--  %****␣math_commands.tex␣Line␣350␣**** --><!--  %Sets --><!--  %****␣math_commands.tex␣Line␣375␣**** --><!--  %Don’t␣use␣a␣set␣called␣E,␣because␣this␣would␣be␣the␣same␣as␣our␣symbol --><!--  %for␣expectation. 
--><!--  %****␣math_commands.tex␣Line␣400␣**** --><!--  %Entries␣of␣a␣matrix --><!--  %****␣math_commands.tex␣Line␣425␣**** --><!--  %entries␣of␣a␣tensor --><!--  %Same␣font␣as␣tensor,␣without␣\bm␣wrapper --><!--  %****␣math_commands.tex␣Line␣450␣**** --><!--  %The␣true␣underlying␣data␣generating␣distribution --><!--  %The␣empirical␣distribution␣defined␣by␣the␣training␣set --><!--  %The␣model␣distribution --><!--  %Stochastic␣autoencoder␣distributions --><!--  %****␣math_commands.tex␣Line␣475␣**** --><!--  %Laplace␣distribution --><!--  %Wolfram␣Mathworld␣says␣$L^2$␣is␣for␣function␣spaces␣and␣$\ell^2$␣is␣for␣vectors --><!--  %But␣then␣they␣seem␣to␣use␣$L^2$␣for␣vectors␣throughout␣the␣site,␣and␣so␣does --><!--  %wikipedia. --><!--  %****␣math_commands.tex␣Line␣500␣**** --><!--  %See␣usage␣in␣notation.tex.␣Chosen␣to␣match␣Daphne’s␣book. --><!--  %\DeclareMathOperator*{\argmax}{arg\,max} --><!--  %\DeclareMathOperator*{\argmin}{arg\,min} --><!--  %\usepackage{hyperref} --><!--  %\usepackage{url} --><?latexml RelaxNGSchema="LaTeXML"?>
<document xmlns="http://dlmf.nist.gov/LaTeXML" class="ltx_authors_1line">
  <resource src="LaTeXML.css" type="text/css"/>
  <resource src="ltx-article.css" type="text/css"/>
  <title>Unsupervised Discovery of<break/>3D Physical Objects from Video</title>
  <creator role="author">
    <personname>Yilun Du<break/>MIT<break/>&amp;Kevin Smith <break/>MIT<break/>&amp;Tomer Ulman <break/>Harvard University <break/>&amp;Joshua Tenenbaum <break/>MIT <break/>&amp;Jiajun Wu <break/>Stanford University <break/></personname>
  </creator>
  <abstract name="Abstract">
    <p>We study the problem of unsupervised physical object discovery. While existing frameworks aim to decompose scenes into 2D segments based on each object’s appearance, we explore how physics, especially object interactions, facilitates disentangling of 3D geometry and position of objects from video, in an unsupervised manner. Drawing inspiration from developmental psychology, our Physical Object Discovery Network (POD-Net) uses both multi-scale pixel cues and physical motion cues to accurately segment observable and partially occluded objects of varying sizes, and infer properties of those objects. Our model reliably segments objects on both synthetic and real scenes. The discovered object properties can also be used to reason about physical events.<note role="footnotetext" xml:id="footnotex1">Project page: https://yilundu.github.io/podnet</note>
<!--  %It␣can␣also␣use␣the␣inferred␣3D␣geometry␣to␣reason␣about␣physically␣implausible␣scenes.␣%␣␣\jw{last␣sentence␣too␣strong.} --></p>
  </abstract>
  <ERROR class="undefined">\iclrfinalcopy</ERROR>
<!--  %Authors␣must␣not␣appear␣in␣the␣submitted␣version.␣They␣should␣be␣hidden 
     %as␣long␣as␣the␣\iclrfinalcopy␣macro␣remains␣commented␣out␣below.
     %Non-anonymous␣submissions␣will␣be␣rejected␣without␣review.
     %****␣iclr2021_conference.tex␣Line␣25␣****
     %The␣\author␣macro␣works␣with␣any␣number␣of␣authors.␣There␣are␣two␣commands
     %used␣to␣separate␣the␣names␣and␣addresses␣of␣multiple␣authors:␣\And␣and␣\AND.
     %Using␣\And␣between␣authors␣leaves␣it␣to␣\LaTeX{}␣to␣determine␣where␣to␣break
     %the␣lines.␣Using␣\AND␣forces␣a␣linebreak␣at␣that␣point.␣So,␣if␣\LaTeX{}
     %puts␣3␣of␣4␣authors␣names␣on␣the␣first␣line,␣and␣the␣last␣on␣the␣second
     %line,␣try␣using␣\AND␣instead␣of␣\And␣before␣the␣third␣author␣name.
     %\iclrfinalcopy␣%␣Uncomment␣for␣camera-ready␣version,␣but␣NOT␣for␣submission.
     %List␣of␣different␣experiments␣to␣run
     %****␣iclr2021_conference.tex␣Line␣50␣****
     %\begin{enumerate}
     %\item␣Comparison␣with␣the␣unsupervised␣video␣object␣detection␣approaches␣__␣showcase␣that␣its␣difficult␣for␣them␣to␣consistently␣track␣objects␣through␣time
     %\item␣Full␣and␣increased␣focus␣on␣the␣evaluation␣on␣the␣adept␣baseline␣__␣in␣particular␣showing␣what␣things␣our␣models␣are␣good␣at␣and␣what␣using␣a␣unsupervised␣video␣object␣detection␣model␣is␣bad␣at
     %\item␣Analysis␣of␣how␣this␣compares␣to␣the␣performance␣of␣that␣of␣humans
     %\end{enumerate}
     %%%%%%%%%␣BODY␣TEXT-->  <section inlist="toc" xml:id="S1">
    <tags>
      <tag>1</tag>
      <tag role="autoref">section 1</tag>
      <tag role="refnum">1</tag>
      <tag role="typerefnum">§1</tag>
    </tags>
    <title><tag close=" ">1</tag>Introduction</title>
    <para xml:id="S1.p1">
      <p>From early in development, infants impose structure on their world. When they look at a scene, infants do not perceive simply an array of colors. Instead, they scan the scene and organize the world into objects that obey certain physical expectations, like traveling along smooth paths or not winking in and out of existence <cite class="ltx_citemacro_citep">(<bibref bibrefs="Spelke2007Core,Spelke1992Origins" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. Here we take two ideas from human, and particularly infant, perception for helping artificial agents learn about object properties: that coherent object motion constrains expectations about future object states, and that foveation patterns allow people to scan both small or far-away and large or close-up objects in the same scene.</p>
    </para>
    <para xml:id="S1.p2">
      <p>Motion is particularly crucial in the early ability to segment a scene into individual objects. For instance, infants perceive two patches moving together as a single object, even though they look perceptually distinct to adults <cite class="ltx_citemacro_citep">(<bibref bibrefs="Kellman1983Perception" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. This segmentation from motion even leads young children to expect that if a toy resting on a block is picked up, both the block and the toy will move up as if they are a single object. <!--  %(unless␣the␣child␣saw␣the␣toy␣and␣the␣block␣moving␣separately␣earlier,␣and␣see␣\cite{spelke1993gestalt}). 
     %This␣segmentation-from-motion␣is␣not␣a␣uniquely␣human␣trait:␣week-old␣chicks␣will␣treat␣an␣image␣patch␣as␣an␣object␣that␣it␣can␣imprint␣on␣if␣it␣moves␣in␣spatially␣contiguous␣ways,␣but␣will␣not␣if␣that␣same␣patch␣teleports␣\citep{prasad2019using}.-->This suggests that artificial systems that learn to segment the world could be usefully constrained by the principle that there are objects that move in regular ways.</p>
    </para>
    <para xml:id="S1.p3">
      <p>In addition, human vision exhibits foveation patterns, where often only a local patch of a scene is visible at once. This allows people to focus on objects that are otherwise small on the retina, but also to stitch together different glimpses of larger objects into a coherent whole.</p>
    </para>
    <para xml:id="S1.p4">
      <p>We propose the Physical Object Discovery Network (POD-Net), a self-supervised model that learns to extract object-based scene representations from videos using motion cues. POD-Net links a visual generative model with a dynamics model in which objects persist and move smoothly. The visual generative model factors object-based scene decompositions across local patches, then aggregates those local patches into a global segmentation. The link between the visual model and the dynamics model constrains the discovered representations to be usable to predict future world states. POD-Net thus produces more stable image segmentations than other self-supervised segmentation models, especially in challenging conditions such as when objects occlude each other (Figure <ref labelref="LABEL:fig:teaser"/>).</p>
    </para>
    <para xml:id="S1.p5">
      <p>We test how well POD-Net performs image segmentation and object discovery on two datasets: one made from ShapeNet objects <cite class="ltx_citemacro_citep">(<bibref bibrefs="Chang2015Shapenet" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, and one from real-world images. We find that POD-Net outperforms recent self-supervised image segmentation models that use regular foreground-background relationships <cite class="ltx_citemacro_citep">(<bibref bibrefs="greff2019multiobject" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> or assume that images are composable into object-like parts <cite class="ltx_citemacro_citep">(<bibref bibrefs="burgess2019monet" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. Finally, we show that the representations learned by POD-Net can be used to support reasoning in a task that requires identifying scenes with physically implausible events <cite class="ltx_citemacro_citep">(<bibref bibrefs="smith2019modeling" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. Together, this demonstrates that using motion as a grouping cue to constrain the learning of object segmentations and representations achieves both goals: it produces better image segmentations and learns scene representations that are useful for physical reasoning.</p>
    </para>
    <figure inlist="lof" labels="LABEL:fig:teaser" placement="t" xml:id="S1.F1">
      <tags>
        <tag><text fontsize="90%">Figure 1</text></tag>
        <tag role="autoref">Figure 1</tag>
        <tag role="refnum">1</tag>
        <tag role="typerefnum">Figure 1</tag>
      </tags>
      <graphics candidates="fig/teaser_motion.pdf" class="ltx_centering" graphic="fig/teaser_motion.pdf" options="width=368.577pt" xml:id="S1.F1.g1"/>
      <toccaption class="ltx_centering"><tag close=" ">1</tag><text fontsize="90%">Motion is an important cue for object segmentation from early in development. We combine motion with an approximate understanding of physics to discover 3D objects that are physically consistent across time. In the video above, motion cues (shown with colored arrows) enable our model to modify our predictions from a single large incorrect segmentation mask to two smaller correct masks.</text></toccaption>
      <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 1</text></tag><text fontsize="90%">Motion is an important cue for object segmentation from early in development. We combine motion with an approximate understanding of physics to discover 3D objects that are physically consistent across time. In the video above, motion cues (shown with colored arrows) enable our model to modify our predictions from a single large incorrect segmentation mask to two smaller correct masks.</text></caption>
<!--  %\end{wrapfigure} -->    </figure>
<!--  %Classically,␣image␣segmentation␣algorithms␣have␣relied␣on␣regularities␣in␣the␣pixels␣of␣the␣images␣themselves,␣to␣mixed␣success␣[CITATIONS].␣Other␣algorithms␣have␣attempted␣to␣segment␣pixels␣using␣motion␣masks,␣but␣do␣not␣capture␣the␣long-term␣regularities␣that␣objects␣persist␣in␣the␣world␣[CITATIONS]. 
     %More␣recent␣work␣has␣attempted␣to␣learn␣to␣perform␣object␣segmentation␣via␣self-supervision,␣by␣using␣inductive␣biases␣that␣there␣are␣regular␣foreground-background␣relationships␣\citep{greff2019multiobject},␣or␣that␣images␣are␣composable␣into␣object-like␣parts␣\citep{burgess2019monet},␣but␣these␣systems␣work␣on␣single␣images␣and␣therefore␣cannot␣use␣motion␣as␣a␣grouping␣cue.
     %Here␣we␣consider␣a␣system␣inspired␣by␣infant␣cognition␣that␣combines␣the␣best␣of␣both␣of␣these␣approaches.␣We␣propose␣the␣Physical␣Object␣Discovery␣Network␣(POD-Net)␣that␣learns␣to␣discover␣objects␣from␣videos.␣Crucially,␣POD-Net␣links␣a␣visual␣generative␣model␣with␣a␣dynamics␣model␣where␣(i)␣objects␣persist,␣and␣(ii)␣those␣objects␣move␣smoothly.␣This␣allows␣POD-Net␣to␣train␣in␣a␣self-supervised␣fashion␣by␣extracting␣object␣properties␣that␣must␣be␣consistent␣with␣the␣expected␣world␣state␣from␣the␣dynamics␣model.␣POD-Net␣learns␣without␣training␣data,␣and␣learns␣to␣segment␣sets␣of␣objects␣that␣occlude␣each␣other,␣or␣are␣very␣close␣by.␣We␣present␣results␣on␣both␣a␣synthetic␣and␣a␣realistic␣data␣set,␣and␣find␣that␣POD-Net␣outperforms␣models␣that␣do␣not␣use␣physical␣motion␣as␣a␣training␣signal.␣Furthermore,␣this␣self-supervised␣system␣can␣be␣used␣to␣extract␣reliable␣enough␣object␣information␣to␣be␣used␣in␣a␣task␣that␣requires␣reasoning␣about␣physical␣plausibility.
     %****␣intro.tex␣Line␣25␣****
     %\begin{itemize}
     %\item␣Amazing␣things␣that␣infants␣can␣do␣from␣the␣earliest␣ages;␣core␣knowledge␣of␣physics␣\cite{Spelke2007Core};␣motion␣is␣super␣important␣\cite{Kellman1983Perception};␣contiguous␣motion␣even␣important␣for␣animals␣(chicks)␣\cite{prasad2019using}
     %\item␣Recent␣approaches␣that␣account␣for␣parts␣of␣this␣core␣knowledge␣do␣okay:␣MONet␣\cite{burgess2019monet},␣IODINE␣\cite{greff2019multiobject}
     %\item␣We’re␣inspired␣by␣the␣full␣stack␣of␣infant␣core␣knowledge;␣use␣motion␣cues␣but␣fed␣into␣consistent␣world␣model␣populated␣by␣objects;␣benefits␣of␣this␣approach
     %\item␣Preview␣experiments␣and␣results
     %\end{itemize}-->  </section>
  <section inlist="toc" xml:id="S2">
    <tags>
      <tag>2</tag>
      <tag role="autoref">section 2</tag>
      <tag role="refnum">2</tag>
      <tag role="typerefnum">§2</tag>
    </tags>
    <title><tag close=" ">2</tag>Related Work</title>
    <para xml:id="S2.p1">
      <p>Developing a factorized scene representation has been a core research topic in computer vision for decades. Most learning-based prior works are supervised, requiring annotated specifications such as segmentations <cite class="ltx_citemacro_citep">(<bibref bibrefs="janner2018reasoning" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, patches <cite class="ltx_citemacro_citep">(<bibref bibrefs="fragkiadaki2015learning" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, or simulation engines <cite class="ltx_citemacro_citep">(<bibref bibrefs="wu2017learning,kansky2017schema" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. These supervised approaches face two challenges. First, in practical scenarios, annotations are often prohibitively challenging to obtain: we cannot annotate the 3D geometry, pose, and semantics of every object we encounter, especially for deformable objects such as trees. Second, supervised methods may not generalize well to out-of-distribution test data such as novel objects or scenes.</p>
    </para>
    <para xml:id="S2.p2">
      <p>Recent research on unsupervised object discovery and segmentation in machine learning has attempted to address these issues: researchers have developed deep nets and inference algorithms that learn to ground visual entities with factorized generative models of static <cite class="ltx_citemacro_citep">(<bibref bibrefs="greff2017neural,burgess2019monet,greff2019multiobject,Eslami2016Attend" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> and dynamic <cite class="ltx_citemacro_citep">(<bibref bibrefs="van2018relational,veerapaneni2019entity,kosiorek2018sequential,eslami2018neural" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> scenes. Some approaches also learn to model the relations and interactions between objects <cite class="ltx_citemacro_citep">(<bibref bibrefs="veerapaneni2019entity,stanic2019r,van2018relational" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. The progress in the field is impressive, though these approaches are still mostly restricted to low-resolution images and perform less well on small or heavily occluded objects. Because of this, they often fail to observe key concepts such as object permanence and solidity. Furthermore, these models all segment objects in 2D, while our POD-Net aims to capture the 3D geometry of objects in the scene.</p>
    </para>
    <para xml:id="S2.p3">
      <p>Some recent papers have integrated deep learning with differentiable rendering to reconstruct 3D shapes from visual data without supervision, although they mostly focused on images of a single object <cite class="ltx_citemacro_citep">(<bibref bibrefs="Rezende2016Unsupervised,sitzmann2019scene" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, or require multiview data as input <cite class="ltx_citemacro_citep">(<bibref bibrefs="Yan2016Perspective" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. In contrast, we use object motion and physics to discover objects in 3D with physical occupancy. This allows our model to do better in both object discovery and future prediction, capture notions such as object permanence, and better align with people’s perception, belief, and surprise signals of dynamic scenes. A separate body of work utilizes motion cues to segment objects <cite class="ltx_citemacro_citep">(<bibref bibrefs="brox_malik_2010,bideau_roychowdhury_menon_learned-miller_2018,xie_xiang_harchaoui_fox_2019,dave_tokmakov_ramanan_2019" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. Such works typically assume a single foreground object moving, and aggregate motion information across frames to segment out objects or separate moving parts of objects. Our work instead seeks to distill information captured from motion to discover objects in 3D from images.</p>
    </para>
    <para xml:id="S2.p4">
      <p>Other works have explored 3D object discovery using RGB-D or 3D volumetric inputs <cite class="ltx_citemacro_citep">(<bibref bibrefs="herbst_henry_ren_fox_2011,karpathy_miller_fei-fei_2013,ma_sibley_2014" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. The presence of 3D information, such as depth, is a significant difference from our work. Such information allows approaches to reliably detect surface orientations and discontinuities <cite class="ltx_citemacro_citep">(<bibref bibrefs="karpathy_miller_fei-fei_2013,herbst_henry_ren_fox_2011" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> which significantly reduces the difficulty of discovering objects, especially in the tabletop settings considered.</p>
    </para>
    <para xml:id="S2.p5">
      <p>Our work is also related to research in computer vision on unsupervised object discovery from video <cite class="ltx_citemacro_citep">(<bibref bibrefs="lu2019see,wang2019zero,yang2019anchor" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. Such works focus on detecting objects in realistic videos, but only focus on detecting a single foreground object, instead of all component objects in a scene. Furthermore, such approaches rely
on supervised information such as ImageNet weights, or pretrained segmentation networks for object detection, limiting their applicability to new objects in novel classes. Our approach also assumes supervised information on the underlying 2D to 3D mapping, but it does not assume any supervision for object detection. We show that this enables our approach to generalize to novel ShapeNet objects.</p>
    </para>
<!--  %\vspace{-10pt} 
     %\jw{I␣wanted␣to␣talk␣about␣some␣computer␣vision␣work␣but␣realized␣it␣might␣not␣be␣interesting␣to␣ICML␣people.␣Should␣we␣still␣do␣that?}␣%Our␣work␣is␣also␣tangentially␣relevant␣to␣research␣in␣computer␣vision␣on␣unsupervised␣object␣discovery␣from␣image␣collections,␣or␣unsupervised␣video␣segmentation.␣Those␣papers
     %\yl{That␣seems␣fine␣to␣me}-->  </section>
  <section inlist="toc" xml:id="S3">
    <tags>
      <tag>3</tag>
      <tag role="autoref">section 3</tag>
      <tag role="refnum">3</tag>
      <tag role="typerefnum">§3</tag>
    </tags>
    <title><tag close=" ">3</tag>Method</title>
<!--  %\vspace{-5pt} 
     %Our␣model:
     %Iterative␣Object␣Decomposition␣Inference␣NEtwork␣(IODINE)
     %Multi-Object␣network␣(MONet)
     %Multi-Scale␣physics␣object␣decomposition␣network
     %Spatially␣Invariant␣Attend,␣Infer,␣Repeat
     %Physical␣object␣Multi-scale
     %Multi-Scale␣Physical␣Object␣decompon
     %PMSNet
     %Generative␣Physical␣Multi-Scale␣Object␣Network
     %GPMONet
     %Physical␣Object␣Network
     %The␣Physical␣Object␣Discovery␣Network␣(\model)␣decomposes␣a␣dynamic␣scene␣(represented␣as␣a␣video␣of␣images)␣into␣a␣set␣of␣component␣3D␣physical␣primitives.␣\model␣consists␣of␣a␣generative␣model␣over␣3D␣primitive␣movement,␣represented␣as␣a␣set␣of␣physical␣rules,␣as␣well␣as␣a␣VAE␣\citep{Kingma2014Auto,␣Rezende2014Stochastic}␣that␣represents␣the␣back-projection␣of␣primitives␣onto␣2D␣images␣(\sect{sect:generative}).␣\model␣further␣consists␣of␣a␣inference␣model,␣which␣recursively␣infers␣a␣set␣component␣primitive␣descriptions,␣masks␣and␣latents.␣Both␣generation␣and␣inference␣networks␣are␣trained␣through␣variational␣inference␣(\sect{sect:inference}).-->    <figure inlist="lof" labels="LABEL:fig:overview" placement="t" xml:id="S3.F2">
      <tags>
        <tag><text fontsize="90%">Figure 2</text></tag>
        <tag role="autoref">Figure 2</tag>
        <tag role="refnum">2</tag>
        <tag role="typerefnum">Figure 2</tag>
      </tags>
      <graphics candidates="fig/pipeline_motion.pdf" class="ltx_centering" graphic="fig/pipeline_motion.pdf" options="width=433.62pt" xml:id="S3.F2.g1"/>
      <toccaption class="ltx_centering"><tag close=" ">2</tag><text fontsize="90%">POD-Net contains four modules for discovering physical objects from video. (I) An inference model auto-regressively infers a set of candidate object masks and latents to describe each patch of an image; (II) A backprojection model maps each mask to a 3D primitive; (III) A dynamics model captures the motion of 3D physical objects; and (IV) An image generative model decodes latents and masks to reconstruct the image.
</text></toccaption>
      <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 2</text></tag><text fontsize="90%">POD-Net contains four modules for discovering physical objects from video. (I) An inference model auto-regressively infers a set of candidate object masks and latents to describe each patch of an image; (II) A backprojection model maps each mask to a 3D primitive; (III) A dynamics model captures the motion of 3D physical objects; and (IV) An image generative model decodes latents and masks to reconstruct the image.
</text></caption>
    </figure>
    <para xml:id="S3.p1">
      <p>The Physical Object Discovery Network (POD-Net) (Figure <ref labelref="LABEL:fig:overview"/>) decomposes a dynamic scene into a set of component 3D physical primitives. POD-Net contains an inference model, which recursively infers a set of component primitive descriptions, masks, and latent vectors (Section <ref labelref="LABEL:sect:inference"/>). It also contains a three-module generative model (Section <ref labelref="LABEL:sect:generative"/>). The generative model consists of a back-projection module to infer 3D properties of each component, a dynamics model to predict primitive motions, and an image generative model in the form of a VAE <cite class="ltx_citemacro_citep">(<bibref bibrefs="Kingma2014Auto,Rezende2014Stochastic" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> to render primitives onto 2D images. These components ensure that learned primitive representations can reconstruct the original image in a physically consistent manner. Together, these constraints produce a strong signal for self-supervised learning of object-centric scene representations.</p>
    </para>
<!--  %\vspace{-5pt} -->    <subsection inlist="toc" labels="LABEL:sect:inference" xml:id="S3.SS1">
      <tags>
        <tag>3.1</tag>
        <tag role="autoref">subsection 3.1</tag>
        <tag role="refnum">3.1</tag>
        <tag role="typerefnum">§3.1</tag>
      </tags>
      <title><tag close=" ">3.1</tag>Inference Model</title>
<!--  %****␣method.tex␣Line␣25␣**** -->      <para xml:id="S3.SS1.p1">
        <p>We sequentially infer the underlying masks and latents that represent a scene (Figure <ref labelref="LABEL:fig:overview"/>-I). Inspired by MONet <cite class="ltx_citemacro_citep">(<bibref bibrefs="burgess2019monet" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>, we employ an attention network <Math mode="inline" tex="\text{Attention}(\cdot)" text="[Attention] * cdot" xml:id="S3.SS1.p1.m1">
            <XMath>
              <XMApp>
                <XMTok meaning="times" role="MULOP">⁢</XMTok>
                <XMText>Attention</XMText>
                <XMDual>
                  <XMRef idref="S3.SS1.p1.m1.1"/>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="false">(</XMTok>
                    <XMTok name="cdot" role="MULOP" xml:id="S3.SS1.p1.m1.1">⋅</XMTok>
                    <XMTok role="CLOSE" stretchy="false">)</XMTok>
                  </XMWrap>
                </XMDual>
              </XMApp>
            </XMath>
          </Math> to decompose a scene into a set of separate masks <Math mode="inline" tex="\bm{M}=\{{\bm{m}}_{1},{\bm{m}}_{2},...,{\bm{m}}_{n}\}" text="M = set@(m _ 1, m _ 2, ldots, m _ n)" xml:id="S3.SS1.p1.m2">
            <XMath>
              <XMApp>
                <XMTok meaning="equals" role="RELOP">=</XMTok>
                <XMTok font="bold italic" role="UNKNOWN">M</XMTok>
                <XMDual>
                  <XMApp>
                    <XMTok meaning="set"/>
                    <XMRef idref="S3.SS1.p1.m2.2"/>
                    <XMRef idref="S3.SS1.p1.m2.3"/>
                    <XMRef idref="S3.SS1.p1.m2.1"/>
                    <XMRef idref="S3.SS1.p1.m2.4"/>
                  </XMApp>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="false">{</XMTok>
                    <XMApp xml:id="S3.SS1.p1.m2.2">
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                      <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                    </XMApp>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMApp xml:id="S3.SS1.p1.m2.3">
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                      <XMTok fontsize="70%" meaning="2" role="NUMBER">2</XMTok>
                    </XMApp>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMTok name="ldots" role="ID" xml:id="S3.SS1.p1.m2.1">…</XMTok>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMApp xml:id="S3.SS1.p1.m2.4">
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">n</XMTok>
                    </XMApp>
                    <XMTok role="CLOSE" stretchy="false">}</XMTok>
                  </XMWrap>
                </XMDual>
              </XMApp>
            </XMath>
          </Math>. We extract a corresponding latent <Math mode="inline" tex="{\bm{z}}_{i}" text="z _ i" xml:id="S3.SS1.p1.m3">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
              </XMApp>
            </XMath>
          </Math> per mask <Math mode="inline" tex="{\bm{m}}_{i}" text="m _ i" xml:id="S3.SS1.p1.m4">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
              </XMApp>
            </XMath>
          </Math>. In particular, we initialize context <Math mode="inline" tex="{\bm{c}}_{0}=\textbf{1}" text="c _ 0 = [1]" xml:id="S3.SS1.p1.m5">
            <XMath>
              <XMApp>
                <XMTok meaning="equals" role="RELOP">=</XMTok>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="bold italic" role="UNKNOWN">c</XMTok>
                  <XMTok fontsize="70%" meaning="0" role="NUMBER">0</XMTok>
                </XMApp>
                <XMText><text font="bold">1</text></XMText>
              </XMApp>
            </XMath>
          </Math>, which we define to represent the context in the image <Math mode="inline" tex="{\bm{x}}" text="x" xml:id="S3.SS1.p1.m6">
            <XMath>
              <XMTok font="bold italic" role="UNKNOWN">x</XMTok>
            </XMath>
          </Math> yet to be explained. At each step, we decode the attention mask <Math mode="inline" tex="{\bm{m}}_{i}={\bm{c}}_{i-1}\text{Attention}({\bm{x}};{\bm{c}}_{i-1})" text="m _ i = c _ (i - 1) * [Attention] * list@(x, c _ (i - 1))" xml:id="S3.SS1.p1.m7">
            <XMath>
              <XMApp>
                <XMTok meaning="equals" role="RELOP">=</XMTok>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                </XMApp>
                <XMApp>
                  <XMTok meaning="times" role="MULOP">⁢</XMTok>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="bold italic" role="UNKNOWN">c</XMTok>
                    <XMApp>
                      <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                      <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                    </XMApp>
                  </XMApp>
                  <XMText>Attention</XMText>
                  <XMDual>
                    <XMApp>
                      <XMTok meaning="list"/>
                      <XMRef idref="S3.SS1.p1.m7.1"/>
                      <XMRef idref="S3.SS1.p1.m7.2"/>
                    </XMApp>
                    <XMWrap>
                      <XMTok role="OPEN" stretchy="false">(</XMTok>
                      <XMTok font="bold italic" role="UNKNOWN" xml:id="S3.SS1.p1.m7.1">x</XMTok>
                      <XMTok role="PUNCT">;</XMTok>
                      <XMApp xml:id="S3.SS1.p1.m7.2">
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="bold italic" role="UNKNOWN">c</XMTok>
                        <XMApp>
                          <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                          <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                        </XMApp>
                      </XMApp>
                      <XMTok role="CLOSE" stretchy="false">)</XMTok>
                    </XMWrap>
                  </XMDual>
                </XMApp>
              </XMApp>
            </XMath>
          </Math>. We iteratively update the corresponding context in the image by <Math mode="inline" tex="{\bm{c}}_{i}={\bm{c}}_{i-1}\left(1-\text{Attention}({\bm{x}};{\bm{c}}_{i-1})\right)" text="c _ i = c _ (i - 1) * (1 - [Attention] * list@(x, c _ (i - 1)))" xml:id="S3.SS1.p1.m8">
            <XMath>
              <XMApp>
                <XMTok meaning="equals" role="RELOP">=</XMTok>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="bold italic" role="UNKNOWN">c</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                </XMApp>
                <XMApp>
                  <XMTok meaning="times" role="MULOP">⁢</XMTok>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="bold italic" role="UNKNOWN">c</XMTok>
                    <XMApp>
                      <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                      <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                    </XMApp>
                  </XMApp>
                  <XMDual>
                    <XMRef idref="S3.SS1.p1.m8.2"/>
                    <XMWrap>
                      <XMTok role="OPEN" stretchy="true">(</XMTok>
                      <XMApp xml:id="S3.SS1.p1.m8.2">
                        <XMTok meaning="minus" role="ADDOP">-</XMTok>
                        <XMTok meaning="1" role="NUMBER">1</XMTok>
                        <XMApp>
                          <XMTok meaning="times" role="MULOP">⁢</XMTok>
                          <XMText>Attention</XMText>
                          <XMDual>
                            <XMApp>
                              <XMTok meaning="list"/>
                              <XMRef idref="S3.SS1.p1.m8.1"/>
                              <XMRef idref="S3.SS1.p1.m8.2.1"/>
                            </XMApp>
                            <XMWrap>
                              <XMTok role="OPEN" stretchy="false">(</XMTok>
                              <XMTok font="bold italic" role="UNKNOWN" xml:id="S3.SS1.p1.m8.1">x</XMTok>
                              <XMTok role="PUNCT">;</XMTok>
                              <XMApp xml:id="S3.SS1.p1.m8.2.1">
                                <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                                <XMTok font="bold italic" role="UNKNOWN">c</XMTok>
                                <XMApp>
                                  <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                  <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                </XMApp>
                              </XMApp>
                              <XMTok role="CLOSE" stretchy="false">)</XMTok>
                            </XMWrap>
                          </XMDual>
                        </XMApp>
                      </XMApp>
                      <XMTok role="CLOSE" stretchy="true">)</XMTok>
                    </XMWrap>
                  </XMDual>
                </XMApp>
              </XMApp>
            </XMath>
          </Math> to ensure that the sum of all masks explains the entire image.</p>
      </para>
      <para xml:id="S3.SS1.p2">
        <p>We further train a VAE encoder <Math mode="inline" tex="\text{Encode}(z|m,{\bm{x}})" text="[Encode] * conditional@(z, list@(m, x))" xml:id="S3.SS1.p2.m1">
            <XMath>
              <XMApp>
                <XMTok meaning="times" role="MULOP">⁢</XMTok>
                <XMText>Encode</XMText>
                <XMDual>
                  <XMRef idref="S3.SS1.p2.m1.3"/>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="false">(</XMTok>
                    <XMApp xml:id="S3.SS1.p2.m1.3">
                      <XMTok meaning="conditional" role="MODIFIEROP" stretchy="false">|</XMTok>
                      <XMTok font="italic" role="UNKNOWN">z</XMTok>
                      <XMDual>
                        <XMApp>
                          <XMTok meaning="list"/>
                          <XMRef idref="S3.SS1.p2.m1.1"/>
                          <XMRef idref="S3.SS1.p2.m1.2"/>
                        </XMApp>
                        <XMWrap>
                          <XMTok font="italic" role="UNKNOWN" xml:id="S3.SS1.p2.m1.1">m</XMTok>
                          <XMTok role="PUNCT">,</XMTok>
                          <XMTok font="bold italic" role="UNKNOWN" xml:id="S3.SS1.p2.m1.2">x</XMTok>
                        </XMWrap>
                      </XMDual>
                    </XMApp>
                    <XMTok role="CLOSE" stretchy="false">)</XMTok>
                  </XMWrap>
                </XMDual>
              </XMApp>
            </XMath>
          </Math>, which infers latents <Math mode="inline" tex="{\bm{z}}_{i}" text="z _ i" xml:id="S3.SS1.p2.m2">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
              </XMApp>
            </XMath>
          </Math> from each component mask <Math mode="inline" tex="{\bm{m}}_{i}" text="m _ i" xml:id="S3.SS1.p2.m3">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
              </XMApp>
            </XMath>
          </Math>. We set <Math mode="inline" tex="{\bm{m}}_{0},{\bm{z}}_{0}" text="list@(m _ 0, z _ 0)" xml:id="S3.SS1.p2.m4">
            <XMath>
              <XMDual>
                <XMApp>
                  <XMTok meaning="list"/>
                  <XMRef idref="S3.SS1.p2.m4.1"/>
                  <XMRef idref="S3.SS1.p2.m4.2"/>
                </XMApp>
                <XMWrap>
                  <XMApp xml:id="S3.SS1.p2.m4.1">
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                    <XMTok fontsize="70%" meaning="0" role="NUMBER">0</XMTok>
                  </XMApp>
                  <XMTok role="PUNCT">,</XMTok>
                  <XMApp xml:id="S3.SS1.p2.m4.2">
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
                    <XMTok fontsize="70%" meaning="0" role="NUMBER">0</XMTok>
                  </XMApp>
                </XMWrap>
              </XMDual>
            </XMath>
          </Math> – the first decoded mask and latent – to be the background mask <Math mode="inline" tex="{\bm{m}}_{b}" text="m _ b" xml:id="S3.SS1.p2.m5">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
              </XMApp>
            </XMath>
          </Math> and latent <Math mode="inline" tex="{\bm{z}}_{b}" text="z _ b" xml:id="S3.SS1.p2.m6">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
              </XMApp>
            </XMath>
          </Math>, and define all subsequent masks and latents to be object masks and latents.</p>
      </para>
<!--  %\vspace{-5pt} -->      <paragraph inlist="toc" xml:id="S3.SS1.SSS0.Px1">
        <title>Sub-patch decomposition.</title>
        <para xml:id="S3.SS1.SSS0.Px1.p1">
          <p>Direct inference of component objects and background from a single image can be difficult, especially when images are complex and when objects are of vastly different sizes. An inference network must learn to pay attention to coarse features in order to segment large objects, and to fine details in the same image in order to segment the smaller objects. Inspired by how people solve this problem by stitching together multiple foveations into a coherent whole, we train our models and apply inference on overlapping sub-patches of an image (Figure <ref labelref="LABEL:fig:mask_merge"/>).</p>
        </para>
        <figure float="right" inlist="lof" labels="LABEL:fig:mask_merge" xml:id="S3.F3">
          <tags>
            <tag><text fontsize="90%">Figure 3</text></tag>
            <tag role="autoref">Figure 3</tag>
            <tag role="refnum">3</tag>
            <tag role="typerefnum">Figure 3</tag>
          </tags>
          <graphics candidates="fig/mask_merge.pdf" graphic="fig/mask_merge.pdf" options="width=433.62pt" xml:id="S3.F3.g1"/>
          <toccaption><tag close=" ">3</tag>
<text fontsize="90%">Illustration of sub-patch decomposition for image inference. An image is divided into an 8<Math mode="inline" tex="\times" text="*" xml:id="S3.F3.m1">
                <XMath>
                  <XMTok meaning="times" role="MULOP">×</XMTok>
                </XMath>
              </Math>8 grid, with inference applied to each 2<Math mode="inline" tex="\times" text="*" xml:id="S3.F3.m2">
                <XMath>
                  <XMTok meaning="times" role="MULOP">×</XMTok>
                </XMath>
              </Math>2 sub-grid. To generate a global segmentation mask, object masks are sequentially inferred for each sub-patch. Each object mask is either matched to an existing object or used to create a new object.
</text></toccaption>
          <caption><tag close=": "><text fontsize="90%">Figure 3</text></tag><text fontsize="90%">
Illustration of sub-patch decomposition for image inference. An image is divided into an 8<Math mode="inline" tex="\times" text="*" xml:id="S3.F3.m3">
                <XMath>
                  <XMTok meaning="times" role="MULOP">×</XMTok>
                </XMath>
              </Math>8 grid, with inference applied to each 2<Math mode="inline" tex="\times" text="*" xml:id="S3.F3.m4">
                <XMath>
                  <XMTok meaning="times" role="MULOP">×</XMTok>
                </XMath>
              </Math>2 sub-grid. To generate a global segmentation mask, object masks are sequentially inferred for each sub-patch. Each object mask is either matched to an existing object or used to create a new object.
</text></caption>
        </figure>
        <para xml:id="S3.SS1.SSS0.Px1.p2">
          <p>In particular, given an image of size <Math mode="inline" tex="H\times W" text="H * W" xml:id="S3.SS1.SSS0.Px1.p2.m1">
              <XMath>
                <XMApp>
                  <XMTok meaning="times" role="MULOP">×</XMTok>
                  <XMTok font="italic" role="UNKNOWN">H</XMTok>
                  <XMTok font="italic" role="UNKNOWN">W</XMTok>
                </XMApp>
              </XMath>
            </Math>, we divide the image into a <Math mode="inline" tex="8\times 8" text="8 * 8" xml:id="S3.SS1.SSS0.Px1.p2.m2">
              <XMath>
                <XMApp>
                  <XMTok meaning="times" role="MULOP">×</XMTok>
                  <XMTok meaning="8" role="NUMBER">8</XMTok>
                  <XMTok meaning="8" role="NUMBER">8</XMTok>
                </XMApp>
              </XMath>
            </Math> grid (pictured on the left of Figure <ref labelref="LABEL:fig:mask_merge"/>), with each grid element having size <Math mode="inline" tex="H/8\times W/8" text="((H / 8) * W) / 8" xml:id="S3.SS1.SSS0.Px1.p2.m3">
              <XMath>
                <XMApp>
                  <XMTok meaning="divide" role="MULOP">/</XMTok>
                  <XMApp>
                    <XMTok meaning="times" role="MULOP">×</XMTok>
                    <XMApp>
                      <XMTok meaning="divide" role="MULOP">/</XMTok>
                      <XMTok font="italic" role="UNKNOWN">H</XMTok>
                      <XMTok meaning="8" role="NUMBER">8</XMTok>
                    </XMApp>
                    <XMTok font="italic" role="UNKNOWN">W</XMTok>
                  </XMApp>
                  <XMTok meaning="8" role="NUMBER">8</XMTok>
                </XMApp>
              </XMath>
            </Math>. We construct a sub-patch for every <Math mode="inline" tex="2\times 2" text="2 * 2" xml:id="S3.SS1.SSS0.Px1.p2.m4">
              <XMath>
                <XMApp>
                  <XMTok meaning="times" role="MULOP">×</XMTok>
                  <XMTok meaning="2" role="NUMBER">2</XMTok>
                  <XMTok meaning="2" role="NUMBER">2</XMTok>
                </XMApp>
              </XMath>
            </Math> component sub-grid, leading to a total of 64 different overlapping sub-patches. We apply inference on each sub-patch. Under this decomposition, smaller objects still appear large in each sub-patch, while larger objects are shared across sub-patches.</p>
        </para>
        <para xml:id="S3.SS1.SSS0.Px1.p3">
          <p>To obtain a global segmentation map, we merge each sub-patch sequentially using a sliding window (Figure <ref labelref="LABEL:fig:mask_merge"/>). At each step, we iterate through each segment given by the inference model from a sub-patch, and merge it with segments obtained from previous sub-patches, if there is an overlap in masks above 20 pixels. Every segment that does not get merged is initialized as a new object. <!--  %We␣train␣inference␣and␣generative␣models␣on␣each␣sub-patch␣of␣an␣image␣independently.␣\jw{maybe␣refer␣to␣Fig␣3␣after␣revision.} --></p>
        </para>
      </paragraph>
    </subsection>
    <subsection inlist="toc" labels="LABEL:sect:generative" xml:id="S3.SS2">
      <tags>
        <tag>3.2</tag>
        <tag role="autoref">subsection 3.2</tag>
        <tag role="refnum">3.2</tag>
        <tag role="typerefnum">§3.2</tag>
      </tags>
      <title><tag close=" ">3.2</tag>Generative Model</title>
      <para xml:id="S3.SS2.p1">
        <p>Our generative model represents a dynamic scene as a set of <Math mode="inline" tex="K" text="K" xml:id="S3.SS2.p1.m1">
            <XMath>
              <XMTok font="italic" role="UNKNOWN">K</XMTok>
            </XMath>
          </Math> different physical objects and the surrounding background at each time step <Math mode="inline" tex="t" text="t" xml:id="S3.SS2.p1.m2">
            <XMath>
              <XMTok font="italic" role="UNKNOWN">t</XMTok>
            </XMath>
          </Math>. Each physical object <Math mode="inline" tex="k" text="k" xml:id="S3.SS2.p1.m3">
            <XMath>
              <XMTok font="italic" role="UNKNOWN">k</XMTok>
            </XMath>
          </Math> is represented by its back-projection on 2D, a segmentation mask <Math mode="inline" tex="{\bm{m}}_{k}^{t}\in{\mathbb{R}}^{H\times W}" text="(m _ k) ^ t element-of R ^ (H * W)" xml:id="S3.SS2.p1.m4">
            <XMath>
              <XMApp>
                <XMTok meaning="element-of" name="in" role="RELOP">∈</XMTok>
                <XMApp>
                  <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                  </XMApp>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                </XMApp>
                <XMApp>
                  <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="blackboard" role="UNKNOWN">R</XMTok>
                  <XMApp>
                    <XMTok fontsize="70%" meaning="times" role="MULOP">×</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">H</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">W</XMTok>
                  </XMApp>
                </XMApp>
              </XMApp>
            </XMath>
          </Math> of height <Math mode="inline" tex="H" text="H" xml:id="S3.SS2.p1.m5">
            <XMath>
              <XMTok font="italic" role="UNKNOWN">H</XMTok>
            </XMath>
          </Math> and width <Math mode="inline" tex="W" text="W" xml:id="S3.SS2.p1.m6">
            <XMath>
              <XMTok font="italic" role="UNKNOWN">W</XMTok>
            </XMath>
          </Math>, and a latent code <Math mode="inline" tex="{\bm{z}}_{k}^{t}\in{\mathbb{R}}^{D}" text="(z _ k) ^ t element-of R ^ D" xml:id="S3.SS2.p1.m7">
            <XMath>
              <XMApp>
                <XMTok meaning="element-of" name="in" role="RELOP">∈</XMTok>
                <XMApp>
                  <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                  </XMApp>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                </XMApp>
                <XMApp>
                  <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="blackboard" role="UNKNOWN">R</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">D</XMTok>
                </XMApp>
              </XMApp>
            </XMath>
          </Math> of dimension <Math mode="inline" tex="D" text="D" xml:id="S3.SS2.p1.m8">
            <XMath>
              <XMTok font="italic" role="UNKNOWN">D</XMTok>
            </XMath>
          </Math> for its appearance. In addition, the background is captured as a surrounding segmentation mask <Math mode="inline" tex="{\bm{m}}_{b}^{t}\in{\mathbb{R}}^{H\times W}" text="(m _ b) ^ t element-of R ^ (H * W)" xml:id="S3.SS2.p1.m9">
            <XMath>
              <XMApp>
                <XMTok meaning="element-of" name="in" role="RELOP">∈</XMTok>
                <XMApp>
                  <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
                  </XMApp>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                </XMApp>
                <XMApp>
                  <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="blackboard" role="UNKNOWN">R</XMTok>
                  <XMApp>
                    <XMTok fontsize="70%" meaning="times" role="MULOP">×</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">H</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">W</XMTok>
                  </XMApp>
                </XMApp>
              </XMApp>
            </XMath>
          </Math> and code <Math mode="inline" tex="{\bm{z}}_{b}^{t}\in{\mathbb{R}}^{D}" text="(z _ b) ^ t element-of R ^ D" xml:id="S3.SS2.p1.m10">
            <XMath>
              <XMApp>
                <XMTok meaning="element-of" name="in" role="RELOP">∈</XMTok>
                <XMApp>
                  <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
                  </XMApp>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                </XMApp>
                <XMApp>
                  <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="blackboard" role="UNKNOWN">R</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">D</XMTok>
                </XMApp>
              </XMApp>
            </XMath>
          </Math>. Segmentation masks are defined such that the sum of all masks corresponds to the entire image: <Math mode="inline" tex="\sum_{k}{\bm{m}}_{k}^{t}+{\bm{m}}_{b}^{t}=1" text="(sum _ k)@((m _ k) ^ t) + (m _ b) ^ t = 1" xml:id="S3.SS2.p1.m11">
            <XMath>
              <XMApp>
                <XMTok meaning="equals" role="RELOP">=</XMTok>
                <XMApp>
                  <XMTok meaning="plus" role="ADDOP">+</XMTok>
                  <XMApp>
                    <XMApp>
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok mathstyle="text" meaning="sum" role="SUMOP" scriptpos="post">∑</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                    </XMApp>
                    <XMApp>
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                      </XMApp>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                    </XMApp>
                  </XMApp>
                  <XMApp>
                    <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                    <XMApp>
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
                    </XMApp>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                  </XMApp>
                </XMApp>
                <XMTok meaning="1" role="NUMBER">1</XMTok>
              </XMApp>
            </XMath>
          </Math>.</p>
      </para>
      <para xml:id="S3.SS2.p2">
        <p>We use a backprojection model to map segmentation masks <Math mode="inline" tex="{\bm{m}}_{k}^{t}" text="(m _ k) ^ t" xml:id="S3.SS2.p2.m1">
            <XMath>
              <XMApp>
                <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                </XMApp>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
              </XMApp>
            </XMath>
          </Math> to 3D primitive cuboids (Figure <ref labelref="LABEL:fig:overview"/>-II). Cuboids are a coarse geometric representation that enables physical simulation. We next construct a dynamics model over the physical movement of predicted primitives (Figure <ref labelref="LABEL:fig:overview"/>-III). We further construct a generative model over images <Math mode="inline" tex="{\bm{x}}^{t}" text="x ^ t" xml:id="S3.SS2.p2.m2">
            <XMath>
              <XMApp>
                <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                <XMTok font="bold italic" role="UNKNOWN">x</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
              </XMApp>
            </XMath>
          </Math> by decoding latents <Math mode="inline" tex="{\bm{z}}^{t}" text="z ^ t" xml:id="S3.SS2.p2.m3">
            <XMath>
              <XMApp>
                <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
              </XMApp>
            </XMath>
          </Math> component-wise (Figure <ref labelref="LABEL:fig:overview"/>-IV).</p>
      </para>
<!--  %\jw{Let’s␣also␣connect␣the␣two␣paragraphs␣above␣to␣the␣model␣figure␣(Fig␣2)} -->      <paragraph inlist="toc" xml:id="S3.SS2.SSS0.Px1">
        <title>Backprojection model.</title>
<!--  %****␣method.tex␣Line␣50␣**** -->        <para xml:id="S3.SS2.SSS0.Px1.p1">
          <p>Our backprojection model maps a mask <Math mode="inline" tex="{\bm{m}}_{k}" text="m _ k" xml:id="S3.SS2.SSS0.Px1.p1.m1">
              <XMath>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                </XMApp>
              </XMath>
            </Math> to an underlying 3D primitive cuboid, represented as a translation <Math mode="inline" tex="{\bm{t}}_{k}\in{\mathbb{R}}^{3}" text="t _ k element-of R ^ 3" xml:id="S3.SS2.SSS0.Px1.p1.m2">
              <XMath>
                <XMApp>
                  <XMTok meaning="element-of" name="in" role="RELOP">∈</XMTok>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                  </XMApp>
                  <XMApp>
                    <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="blackboard" role="UNKNOWN">R</XMTok>
                    <XMTok fontsize="70%" meaning="3" role="NUMBER">3</XMTok>
                  </XMApp>
                </XMApp>
              </XMath>
            </Math>, size <Math mode="inline" tex="{\bm{s}}_{k}\in{\mathbb{R}}^{3}" text="s _ k element-of R ^ 3" xml:id="S3.SS2.SSS0.Px1.p1.m3">
              <XMath>
                <XMApp>
                  <XMTok meaning="element-of" name="in" role="RELOP">∈</XMTok>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="bold italic" role="UNKNOWN">s</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                  </XMApp>
                  <XMApp>
                    <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="blackboard" role="UNKNOWN">R</XMTok>
                    <XMTok fontsize="70%" meaning="3" role="NUMBER">3</XMTok>
                  </XMApp>
                </XMApp>
              </XMath>
            </Math>, and rotation <Math mode="inline" tex="{\bm{q}}_{k}\in{\mathbb{R}}^{3}" text="q _ k element-of R ^ 3" xml:id="S3.SS2.SSS0.Px1.p1.m4">
              <XMath>
                <XMApp>
                  <XMTok meaning="element-of" name="in" role="RELOP">∈</XMTok>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="bold italic" role="UNKNOWN">q</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                  </XMApp>
                  <XMApp>
                    <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="blackboard" role="UNKNOWN">R</XMTok>
                    <XMTok fontsize="70%" meaning="3" role="NUMBER">3</XMTok>
                  </XMApp>
                </XMApp>
              </XMath>
            </Math> (as Euler angles) transform on a unit cuboid in a fully differentiable manner. In our case, we primarily pre-train a neural network to approximate the 2D-to-3D projection and use it as our differentiable backprojection model. However, we show that such a task can also be approximated by assuming the camera parameters and the height of the plane are given (ignoring size and rotation regression), with little reduction in performance (see Appendix <ref labelref="LABEL:app:details"/> for further details). <!--  %knowledge␣of␣the␣underlying␣structure␣of␣a␣scene␣(such␣as␣if␣the␣scene␣is␣represented␣as␣a␣plane). 
     %Details␣are␣in␣the␣appendix.␣%can␣be␣found␣in␣the␣supplementary␣material.␣%␣To␣implement␣this␣operation␣differentiability,␣␣we␣use␣a␣pre-trained␣model,␣though␣we␣note␣that␣a␣differentiable␣back-projection␣module␣(from␣2D␣to␣3D)␣\citep{zhang2018learning}␣could␣also␣be␣used.--></p>
        </para>
      </paragraph>
      <paragraph inlist="toc" xml:id="S3.SS2.SSS0.Px2">
        <title>Dynamics model.</title>
        <para xml:id="S3.SS2.SSS0.Px2.p1">
          <p>We construct a dynamics model over the next state of different physical objects <Math mode="inline" tex="({\bm{t}}_{k}^{t},{\bm{s}}_{k}^{t},{\bm{q}}_{k}^{t},{\bm{m}}_{k}^{t})" text="vector@((t _ k) ^ t, (s _ k) ^ t, (q _ k) ^ t, (m _ k) ^ t)" xml:id="S3.SS2.SSS0.Px2.p1.m1">
              <XMath>
                <XMDual>
                  <XMApp>
                    <XMTok meaning="vector"/>
                    <XMRef idref="S3.SS2.SSS0.Px2.p1.m1.1"/>
                    <XMRef idref="S3.SS2.SSS0.Px2.p1.m1.2"/>
                    <XMRef idref="S3.SS2.SSS0.Px2.p1.m1.3"/>
                    <XMRef idref="S3.SS2.SSS0.Px2.p1.m1.4"/>
                  </XMApp>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="false">(</XMTok>
                    <XMApp xml:id="S3.SS2.SSS0.Px2.p1.m1.1">
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                      </XMApp>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                    </XMApp>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMApp xml:id="S3.SS2.SSS0.Px2.p1.m1.2">
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="bold italic" role="UNKNOWN">s</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                      </XMApp>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                    </XMApp>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMApp xml:id="S3.SS2.SSS0.Px2.p1.m1.3">
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="bold italic" role="UNKNOWN">q</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                      </XMApp>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                    </XMApp>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMApp xml:id="S3.SS2.SSS0.Px2.p1.m1.4">
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                      </XMApp>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                    </XMApp>
                    <XMTok role="CLOSE" stretchy="false">)</XMTok>
                  </XMWrap>
                </XMDual>
              </XMath>
            </Math> by using a first-order approximation of the velocity/angular velocity of each object's state. Specifically, our model predicts</p>
          <equationgroup class="ltx_eqn_align" xml:id="A1.EGx1">
            <equation xml:id="S3.E1">
              <tags>
                <tag>(1)</tag>
                <tag role="autoref">Equation 1</tag>
                <tag role="refnum">1</tag>
              </tags>
              <MathFork>
                <Math tex="\displaystyle\hat{{\bm{t}}}_{k}^{t}={\bm{t}}_{k}^{t-1}+\frac{1}{t-1}\sum_{i=1}%&#10;^{t-1}({\bm{t}}_{k}^{i}-{\bm{t}}_{k}^{i-1}),\quad\hat{{\bm{s}}}_{k}^{t}=\frac{%&#10;1}{t}\sum_{i=0}^{t-1}{\bm{s}}_{k}^{i}," text="formulae@(((hat@(t)) _ k) ^ t = (t _ k) ^ (t - 1) + (1 / (t - 1)) * ((sum _ (i = 1)) ^ (t - 1))@((t _ k) ^ i - (t _ k) ^ (i - 1)), ((hat@(s)) _ k) ^ t = (1 / t) * ((sum _ (i = 0)) ^ (t - 1))@((s _ k) ^ i))" xml:id="S3.E1.m3">
                  <XMath>
                    <XMDual>
                      <XMRef idref="S3.E1.m3.1"/>
                      <XMWrap>
                        <XMDual xml:id="S3.E1.m3.1">
                          <XMApp>
                            <XMTok meaning="formulae"/>
                            <XMRef idref="S3.E1.m3.1.1"/>
                            <XMRef idref="S3.E1.m3.1.2"/>
                          </XMApp>
                          <XMWrap>
                            <XMApp xml:id="S3.E1.m3.1.1">
                              <XMTok meaning="equals" role="RELOP">=</XMTok>
                              <XMApp>
                                <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                <XMApp>
                                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                  <XMApp>
                                    <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                                    <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                                  </XMApp>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                </XMApp>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                              </XMApp>
                              <XMApp>
                                <XMTok meaning="plus" role="ADDOP">+</XMTok>
                                <XMApp>
                                  <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                  <XMApp>
                                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                    <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                  </XMApp>
                                  <XMApp>
                                    <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                    <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                  </XMApp>
                                </XMApp>
                                <XMApp>
                                  <XMTok meaning="times" role="MULOP">⁢</XMTok>
                                  <XMApp>
                                    <XMTok mathstyle="display" meaning="divide" role="FRACOP"/>
                                    <XMTok meaning="1" role="NUMBER">1</XMTok>
                                    <XMApp>
                                      <XMTok meaning="minus" role="ADDOP">-</XMTok>
                                      <XMTok font="italic" role="UNKNOWN">t</XMTok>
                                      <XMTok meaning="1" role="NUMBER">1</XMTok>
                                    </XMApp>
                                  </XMApp>
                                  <XMApp>
                                    <XMApp scriptpos="mid">
                                      <XMTok role="SUPERSCRIPTOP" scriptpos="mid1"/>
                                      <XMApp scriptpos="mid">
                                        <XMTok role="SUBSCRIPTOP" scriptpos="mid1"/>
                                        <XMTok mathstyle="display" meaning="sum" role="SUMOP" scriptpos="mid">∑</XMTok>
                                        <XMApp>
                                          <XMTok fontsize="70%" meaning="equals" role="RELOP">=</XMTok>
                                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                          <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                        </XMApp>
                                      </XMApp>
                                      <XMApp>
                                        <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                        <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                      </XMApp>
                                    </XMApp>
                                    <XMDual>
                                      <XMRef idref="S3.E1.m3.1.1.1"/>
                                      <XMWrap>
                                        <XMTok role="OPEN" stretchy="false">(</XMTok>
                                        <XMApp xml:id="S3.E1.m3.1.1.1">
                                          <XMTok meaning="minus" role="ADDOP">-</XMTok>
                                          <XMApp>
                                            <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                            <XMApp>
                                              <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                              <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                            </XMApp>
                                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                          </XMApp>
                                          <XMApp>
                                            <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                            <XMApp>
                                              <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                              <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                            </XMApp>
                                            <XMApp>
                                              <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                              <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                            </XMApp>
                                          </XMApp>
                                        </XMApp>
                                        <XMTok role="CLOSE" stretchy="false">)</XMTok>
                                      </XMWrap>
                                    </XMDual>
                                  </XMApp>
                                </XMApp>
                              </XMApp>
                            </XMApp>
                            <XMTok role="PUNCT" rpadding="10.0pt">,</XMTok>
                            <XMApp xml:id="S3.E1.m3.1.2">
                              <XMTok meaning="equals" role="RELOP">=</XMTok>
                              <XMApp>
                                <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                <XMApp>
                                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                  <XMApp>
                                    <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                                    <XMTok font="bold italic" role="UNKNOWN">s</XMTok>
                                  </XMApp>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                </XMApp>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                              </XMApp>
                              <XMApp>
                                <XMTok meaning="times" role="MULOP">⁢</XMTok>
                                <XMApp>
                                  <XMTok mathstyle="display" meaning="divide" role="FRACOP"/>
                                  <XMTok meaning="1" role="NUMBER">1</XMTok>
                                  <XMTok font="italic" role="UNKNOWN">t</XMTok>
                                </XMApp>
                                <XMApp>
                                  <XMApp scriptpos="mid">
                                    <XMTok role="SUPERSCRIPTOP" scriptpos="mid1"/>
                                    <XMApp scriptpos="mid">
                                      <XMTok role="SUBSCRIPTOP" scriptpos="mid1"/>
                                      <XMTok mathstyle="display" meaning="sum" role="SUMOP" scriptpos="mid">∑</XMTok>
                                      <XMApp>
                                        <XMTok fontsize="70%" meaning="equals" role="RELOP">=</XMTok>
                                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                        <XMTok fontsize="70%" meaning="0" role="NUMBER">0</XMTok>
                                      </XMApp>
                                    </XMApp>
                                    <XMApp>
                                      <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                      <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                    </XMApp>
                                  </XMApp>
                                  <XMApp>
                                    <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                    <XMApp>
                                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                      <XMTok font="bold italic" role="UNKNOWN">s</XMTok>
                                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                    </XMApp>
                                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                  </XMApp>
                                </XMApp>
                              </XMApp>
                            </XMApp>
                          </XMWrap>
                        </XMDual>
                        <XMTok role="PUNCT">,</XMTok>
                      </XMWrap>
                    </XMDual>
                  </XMath>
                </Math>
                <MathBranch>
                  <td align="right"><Math mode="inline" tex="\displaystyle\hat{{\bm{t}}}_{k}^{t}" text="((hat@(t)) _ k) ^ t" xml:id="S3.E1.m1">
                      <XMath>
                        <XMApp>
                          <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                            <XMApp>
                              <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                              <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                          </XMApp>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                        </XMApp>
                      </XMath>
                    </Math></td>
                  <td align="left"><Math mode="inline" tex="\displaystyle={\bm{t}}_{k}^{t-1}+\frac{1}{t-1}\sum_{i=1}^{t-1}({\bm{t}}_{k}^{i%&#10;}-{\bm{t}}_{k}^{i-1}),\quad\hat{{\bm{s}}}_{k}^{t}=\frac{1}{t}\sum_{i=0}^{t-1}{%&#10;\bm{s}}_{k}^{i}," text="formulae@(absent = (t _ k) ^ (t - 1) + (1 / (t - 1)) * ((sum _ (i = 1)) ^ (t - 1))@((t _ k) ^ i - (t _ k) ^ (i - 1)), ((hat@(s)) _ k) ^ t = (1 / t) * ((sum _ (i = 0)) ^ (t - 1))@((s _ k) ^ i))" xml:id="S3.E1.m2">
                      <XMath>
                        <XMDual>
                          <XMRef idref="S3.E1.m2.1"/>
                          <XMWrap>
                            <XMDual xml:id="S3.E1.m2.1">
                              <XMApp>
                                <XMTok meaning="formulae"/>
                                <XMRef idref="S3.E1.m2.1.1"/>
                                <XMRef idref="S3.E1.m2.1.2"/>
                              </XMApp>
                              <XMWrap>
                                <XMApp xml:id="S3.E1.m2.1.1">
                                  <XMTok meaning="equals" role="RELOP">=</XMTok>
                                  <XMTok meaning="absent"/>
                                  <XMApp>
                                    <XMTok meaning="plus" role="ADDOP">+</XMTok>
                                    <XMApp>
                                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                      <XMApp>
                                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                        <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                      </XMApp>
                                      <XMApp>
                                        <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                        <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                      </XMApp>
                                    </XMApp>
                                    <XMApp>
                                      <XMTok meaning="times" role="MULOP">⁢</XMTok>
                                      <XMApp>
                                        <XMTok mathstyle="display" meaning="divide" role="FRACOP"/>
                                        <XMTok meaning="1" role="NUMBER">1</XMTok>
                                        <XMApp>
                                          <XMTok meaning="minus" role="ADDOP">-</XMTok>
                                          <XMTok font="italic" role="UNKNOWN">t</XMTok>
                                          <XMTok meaning="1" role="NUMBER">1</XMTok>
                                        </XMApp>
                                      </XMApp>
                                      <XMApp>
                                        <XMApp scriptpos="mid">
                                          <XMTok role="SUPERSCRIPTOP" scriptpos="mid1"/>
                                          <XMApp scriptpos="mid">
                                            <XMTok role="SUBSCRIPTOP" scriptpos="mid1"/>
                                            <XMTok mathstyle="display" meaning="sum" role="SUMOP" scriptpos="mid">∑</XMTok>
                                            <XMApp>
                                              <XMTok fontsize="70%" meaning="equals" role="RELOP">=</XMTok>
                                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                              <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                            </XMApp>
                                          </XMApp>
                                          <XMApp>
                                            <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                            <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                          </XMApp>
                                        </XMApp>
                                        <XMDual>
                                          <XMRef idref="S3.E1.m2.1.1.1"/>
                                          <XMWrap>
                                            <XMTok role="OPEN" stretchy="false">(</XMTok>
                                            <XMApp xml:id="S3.E1.m2.1.1.1">
                                              <XMTok meaning="minus" role="ADDOP">-</XMTok>
                                              <XMApp>
                                                <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                                <XMApp>
                                                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                                  <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                                </XMApp>
                                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                              </XMApp>
                                              <XMApp>
                                                <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                                <XMApp>
                                                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                                  <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                                </XMApp>
                                                <XMApp>
                                                  <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                                  <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                                </XMApp>
                                              </XMApp>
                                            </XMApp>
                                            <XMTok role="CLOSE" stretchy="false">)</XMTok>
                                          </XMWrap>
                                        </XMDual>
                                      </XMApp>
                                    </XMApp>
                                  </XMApp>
                                </XMApp>
                                <XMTok role="PUNCT" rpadding="10.0pt">,</XMTok>
                                <XMApp xml:id="S3.E1.m2.1.2">
                                  <XMTok meaning="equals" role="RELOP">=</XMTok>
                                  <XMApp>
                                    <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                    <XMApp>
                                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                      <XMApp>
                                        <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                                        <XMTok font="bold italic" role="UNKNOWN">s</XMTok>
                                      </XMApp>
                                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                    </XMApp>
                                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                  </XMApp>
                                  <XMApp>
                                    <XMTok meaning="times" role="MULOP">⁢</XMTok>
                                    <XMApp>
                                      <XMTok mathstyle="display" meaning="divide" role="FRACOP"/>
                                      <XMTok meaning="1" role="NUMBER">1</XMTok>
                                      <XMTok font="italic" role="UNKNOWN">t</XMTok>
                                    </XMApp>
                                    <XMApp>
                                      <XMApp scriptpos="mid">
                                        <XMTok role="SUPERSCRIPTOP" scriptpos="mid1"/>
                                        <XMApp scriptpos="mid">
                                          <XMTok role="SUBSCRIPTOP" scriptpos="mid1"/>
                                          <XMTok mathstyle="display" meaning="sum" role="SUMOP" scriptpos="mid">∑</XMTok>
                                          <XMApp>
                                            <XMTok fontsize="70%" meaning="equals" role="RELOP">=</XMTok>
                                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                            <XMTok fontsize="70%" meaning="0" role="NUMBER">0</XMTok>
                                          </XMApp>
                                        </XMApp>
                                        <XMApp>
                                          <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                          <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                        </XMApp>
                                      </XMApp>
                                      <XMApp>
                                        <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                        <XMApp>
                                          <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                          <XMTok font="bold italic" role="UNKNOWN">s</XMTok>
                                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                        </XMApp>
                                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                      </XMApp>
                                    </XMApp>
                                  </XMApp>
                                </XMApp>
                              </XMWrap>
                            </XMDual>
                            <XMTok role="PUNCT">,</XMTok>
                          </XMWrap>
                        </XMDual>
                      </XMath>
                    </Math></td>
                </MathBranch>
              </MathFork>
            </equation>
            <equation xml:id="S3.E2">
              <tags>
                <tag>(2)</tag>
                <tag role="autoref">Equation 2</tag>
                <tag role="refnum">2</tag>
              </tags>
              <MathFork>
                <Math tex="\displaystyle\hat{{\bm{q}}}_{k}^{t}={\bm{q}}_{k}^{t-1}+\frac{1}{t-1}\sum_{i=1}%&#10;^{t-1}({\bm{q}}_{k}^{i}-{\bm{q}}_{k}^{i-1}),\quad\hat{{\bm{m}}}_{k}^{t}=\text{%&#10;Render}(\hat{{\bm{t}}}_{k}^{t},\hat{{\bm{s}}}_{k}^{t},\hat{{\bm{q}}}_{k}^{t})." text="formulae@(((hat@(q)) _ k) ^ t = (q _ k) ^ (t - 1) + (1 / (t - 1)) * ((sum _ (i = 1)) ^ (t - 1))@((q _ k) ^ i - (q _ k) ^ (i - 1)), ((hat@(m)) _ k) ^ t = [Render] * vector@(((hat@(t)) _ k) ^ t, ((hat@(s)) _ k) ^ t, ((hat@(q)) _ k) ^ t))" xml:id="S3.E2.m3">
                  <XMath>
                    <XMDual>
                      <XMRef idref="S3.E2.m3.1"/>
                      <XMWrap>
                        <XMDual xml:id="S3.E2.m3.1">
                          <XMApp>
                            <XMTok meaning="formulae"/>
                            <XMRef idref="S3.E2.m3.1.1"/>
                            <XMRef idref="S3.E2.m3.1.2"/>
                          </XMApp>
                          <XMWrap>
                            <XMApp xml:id="S3.E2.m3.1.1">
                              <XMTok meaning="equals" role="RELOP">=</XMTok>
                              <XMApp>
                                <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                <XMApp>
                                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                  <XMApp>
                                    <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                                    <XMTok font="bold italic" role="UNKNOWN">q</XMTok>
                                  </XMApp>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                </XMApp>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                              </XMApp>
                              <XMApp>
                                <XMTok meaning="plus" role="ADDOP">+</XMTok>
                                <XMApp>
                                  <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                  <XMApp>
                                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                    <XMTok font="bold italic" role="UNKNOWN">q</XMTok>
                                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                  </XMApp>
                                  <XMApp>
                                    <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                    <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                  </XMApp>
                                </XMApp>
                                <XMApp>
                                  <XMTok meaning="times" role="MULOP">⁢</XMTok>
                                  <XMApp>
                                    <XMTok mathstyle="display" meaning="divide" role="FRACOP"/>
                                    <XMTok meaning="1" role="NUMBER">1</XMTok>
                                    <XMApp>
                                      <XMTok meaning="minus" role="ADDOP">-</XMTok>
                                      <XMTok font="italic" role="UNKNOWN">t</XMTok>
                                      <XMTok meaning="1" role="NUMBER">1</XMTok>
                                    </XMApp>
                                  </XMApp>
                                  <XMApp>
                                    <XMApp scriptpos="mid">
                                      <XMTok role="SUPERSCRIPTOP" scriptpos="mid1"/>
                                      <XMApp scriptpos="mid">
                                        <XMTok role="SUBSCRIPTOP" scriptpos="mid1"/>
                                        <XMTok mathstyle="display" meaning="sum" role="SUMOP" scriptpos="mid">∑</XMTok>
                                        <XMApp>
                                          <XMTok fontsize="70%" meaning="equals" role="RELOP">=</XMTok>
                                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                          <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                        </XMApp>
                                      </XMApp>
                                      <XMApp>
                                        <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                        <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                      </XMApp>
                                    </XMApp>
                                    <XMDual>
                                      <XMRef idref="S3.E2.m3.1.1.1"/>
                                      <XMWrap>
                                        <XMTok role="OPEN" stretchy="false">(</XMTok>
                                        <XMApp xml:id="S3.E2.m3.1.1.1">
                                          <XMTok meaning="minus" role="ADDOP">-</XMTok>
                                          <XMApp>
                                            <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                            <XMApp>
                                              <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                              <XMTok font="bold italic" role="UNKNOWN">q</XMTok>
                                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                            </XMApp>
                                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                          </XMApp>
                                          <XMApp>
                                            <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                            <XMApp>
                                              <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                              <XMTok font="bold italic" role="UNKNOWN">q</XMTok>
                                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                            </XMApp>
                                            <XMApp>
                                              <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                              <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                            </XMApp>
                                          </XMApp>
                                        </XMApp>
                                        <XMTok role="CLOSE" stretchy="false">)</XMTok>
                                      </XMWrap>
                                    </XMDual>
                                  </XMApp>
                                </XMApp>
                              </XMApp>
                            </XMApp>
                            <XMTok role="PUNCT" rpadding="10.0pt">,</XMTok>
                            <XMApp xml:id="S3.E2.m3.1.2">
                              <XMTok meaning="equals" role="RELOP">=</XMTok>
                              <XMApp>
                                <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                <XMApp>
                                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                  <XMApp>
                                    <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                                    <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                                  </XMApp>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                </XMApp>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                              </XMApp>
                              <XMApp>
                                <XMTok meaning="times" role="MULOP">⁢</XMTok>
                                <XMText>Render</XMText>
                                <XMDual>
                                  <XMApp>
                                    <XMTok meaning="vector"/>
                                    <XMRef idref="S3.E2.m3.1.2.1"/>
                                    <XMRef idref="S3.E2.m3.1.2.2"/>
                                    <XMRef idref="S3.E2.m3.1.2.3"/>
                                  </XMApp>
                                  <XMWrap>
                                    <XMTok role="OPEN" stretchy="false">(</XMTok>
                                    <XMApp xml:id="S3.E2.m3.1.2.1">
                                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                      <XMApp>
                                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                        <XMApp>
                                          <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                                          <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                                        </XMApp>
                                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                      </XMApp>
                                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                    </XMApp>
                                    <XMTok role="PUNCT">,</XMTok>
                                    <XMApp xml:id="S3.E2.m3.1.2.2">
                                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                      <XMApp>
                                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                        <XMApp>
                                          <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                                          <XMTok font="bold italic" role="UNKNOWN">s</XMTok>
                                        </XMApp>
                                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                      </XMApp>
                                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                    </XMApp>
                                    <XMTok role="PUNCT">,</XMTok>
                                    <XMApp xml:id="S3.E2.m3.1.2.3">
                                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                      <XMApp>
                                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                        <XMApp>
                                          <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                                          <XMTok font="bold italic" role="UNKNOWN">q</XMTok>
                                        </XMApp>
                                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                      </XMApp>
                                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                    </XMApp>
                                    <XMTok role="CLOSE" stretchy="false">)</XMTok>
                                  </XMWrap>
                                </XMDual>
                              </XMApp>
                            </XMApp>
                          </XMWrap>
                        </XMDual>
                        <XMTok role="PERIOD">.</XMTok>
                      </XMWrap>
                    </XMDual>
                  </XMath>
                </Math>
                <MathBranch>
                  <td align="right"><Math mode="inline" tex="\displaystyle\hat{{\bm{q}}}_{k}^{t}" text="((hat@(q)) _ k) ^ t" xml:id="S3.E2.m1">
                      <XMath>
                        <XMApp>
                          <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                            <XMApp>
                              <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                              <XMTok font="bold italic" role="UNKNOWN">q</XMTok>
                            </XMApp>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                          </XMApp>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                        </XMApp>
                      </XMath>
                    </Math></td>
                  <td align="left"><Math mode="inline" tex="\displaystyle={\bm{q}}_{k}^{t-1}+\frac{1}{t-1}\sum_{i=1}^{t-1}({\bm{q}}_{k}^{i%&#10;}-{\bm{q}}_{k}^{i-1}),\quad\hat{{\bm{m}}}_{k}^{t}=\text{Render}(\hat{{\bm{t}}}%&#10;_{k}^{t},\hat{{\bm{s}}}_{k}^{t},\hat{{\bm{q}}}_{k}^{t})." text="formulae@(absent = (q _ k) ^ (t - 1) + (1 / (t - 1)) * ((sum _ (i = 1)) ^ (t - 1))@((q _ k) ^ i - (q _ k) ^ (i - 1)), ((hat@(m)) _ k) ^ t = [Render] * vector@(((hat@(t)) _ k) ^ t, ((hat@(s)) _ k) ^ t, ((hat@(q)) _ k) ^ t))" xml:id="S3.E2.m2">
                      <XMath>
                        <XMDual>
                          <XMRef idref="S3.E2.m2.1"/>
                          <XMWrap>
                            <XMDual xml:id="S3.E2.m2.1">
                              <XMApp>
                                <XMTok meaning="formulae"/>
                                <XMRef idref="S3.E2.m2.1.1"/>
                                <XMRef idref="S3.E2.m2.1.2"/>
                              </XMApp>
                              <XMWrap>
                                <XMApp xml:id="S3.E2.m2.1.1">
                                  <XMTok meaning="equals" role="RELOP">=</XMTok>
                                  <XMTok meaning="absent"/>
                                  <XMApp>
                                    <XMTok meaning="plus" role="ADDOP">+</XMTok>
                                    <XMApp>
                                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                      <XMApp>
                                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                        <XMTok font="bold italic" role="UNKNOWN">q</XMTok>
                                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                      </XMApp>
                                      <XMApp>
                                        <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                        <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                      </XMApp>
                                    </XMApp>
                                    <XMApp>
                                      <XMTok meaning="times" role="MULOP">⁢</XMTok>
                                      <XMApp>
                                        <XMTok mathstyle="display" meaning="divide" role="FRACOP"/>
                                        <XMTok meaning="1" role="NUMBER">1</XMTok>
                                        <XMApp>
                                          <XMTok meaning="minus" role="ADDOP">-</XMTok>
                                          <XMTok font="italic" role="UNKNOWN">t</XMTok>
                                          <XMTok meaning="1" role="NUMBER">1</XMTok>
                                        </XMApp>
                                      </XMApp>
                                      <XMApp>
                                        <XMApp scriptpos="mid">
                                          <XMTok role="SUPERSCRIPTOP" scriptpos="mid1"/>
                                          <XMApp scriptpos="mid">
                                            <XMTok role="SUBSCRIPTOP" scriptpos="mid1"/>
                                            <XMTok mathstyle="display" meaning="sum" role="SUMOP" scriptpos="mid">∑</XMTok>
                                            <XMApp>
                                              <XMTok fontsize="70%" meaning="equals" role="RELOP">=</XMTok>
                                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                              <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                            </XMApp>
                                          </XMApp>
                                          <XMApp>
                                            <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                            <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                          </XMApp>
                                        </XMApp>
                                        <XMDual>
                                          <XMRef idref="S3.E2.m2.1.1.1"/>
                                          <XMWrap>
                                            <XMTok role="OPEN" stretchy="false">(</XMTok>
                                            <XMApp xml:id="S3.E2.m2.1.1.1">
                                              <XMTok meaning="minus" role="ADDOP">-</XMTok>
                                              <XMApp>
                                                <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                                <XMApp>
                                                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                                  <XMTok font="bold italic" role="UNKNOWN">q</XMTok>
                                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                                </XMApp>
                                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                              </XMApp>
                                              <XMApp>
                                                <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                                <XMApp>
                                                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                                  <XMTok font="bold italic" role="UNKNOWN">q</XMTok>
                                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                                </XMApp>
                                                <XMApp>
                                                  <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                                  <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                                </XMApp>
                                              </XMApp>
                                            </XMApp>
                                            <XMTok role="CLOSE" stretchy="false">)</XMTok>
                                          </XMWrap>
                                        </XMDual>
                                      </XMApp>
                                    </XMApp>
                                  </XMApp>
                                </XMApp>
                                <XMTok role="PUNCT" rpadding="10.0pt">,</XMTok>
                                <XMApp xml:id="S3.E2.m2.1.2">
                                  <XMTok meaning="equals" role="RELOP">=</XMTok>
                                  <XMApp>
                                    <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                    <XMApp>
                                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                      <XMApp>
                                        <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                                        <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                                      </XMApp>
                                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                    </XMApp>
                                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                  </XMApp>
                                  <XMApp>
                                    <XMTok meaning="times" role="MULOP">⁢</XMTok>
                                    <XMText>Render</XMText>
                                    <XMDual>
                                      <XMApp>
                                        <XMTok meaning="vector"/>
                                        <XMRef idref="S3.E2.m2.1.2.1"/>
                                        <XMRef idref="S3.E2.m2.1.2.2"/>
                                        <XMRef idref="S3.E2.m2.1.2.3"/>
                                      </XMApp>
                                      <XMWrap>
                                        <XMTok role="OPEN" stretchy="false">(</XMTok>
                                        <XMApp xml:id="S3.E2.m2.1.2.1">
                                          <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                          <XMApp>
                                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                            <XMApp>
                                              <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                                              <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                                            </XMApp>
                                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                          </XMApp>
                                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                        </XMApp>
                                        <XMTok role="PUNCT">,</XMTok>
                                        <XMApp xml:id="S3.E2.m2.1.2.2">
                                          <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                          <XMApp>
                                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                            <XMApp>
                                              <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                                              <XMTok font="bold italic" role="UNKNOWN">s</XMTok>
                                            </XMApp>
                                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                          </XMApp>
                                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                        </XMApp>
                                        <XMTok role="PUNCT">,</XMTok>
                                        <XMApp xml:id="S3.E2.m2.1.2.3">
                                          <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                          <XMApp>
                                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                            <XMApp>
                                              <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                                              <XMTok font="bold italic" role="UNKNOWN">q</XMTok>
                                            </XMApp>
                                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                          </XMApp>
                                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                        </XMApp>
                                        <XMTok role="CLOSE" stretchy="false">)</XMTok>
                                      </XMWrap>
                                    </XMDual>
                                  </XMApp>
                                </XMApp>
                              </XMWrap>
                            </XMDual>
                            <XMTok role="PERIOD">.</XMTok>
                          </XMWrap>
                        </XMDual>
                      </XMath>
                    </Math></td>
                </MathBranch>
              </MathFork>
            </equation>
          </equationgroup>
        </para>
        <para xml:id="S3.SS2.SSS0.Px2.p2">
          <p>Our <Math mode="inline" tex="\text{Render}(\cdot)" text="[Render] * cdot" xml:id="S3.SS2.SSS0.Px2.p2.m1">
              <XMath>
                <XMApp>
                  <XMTok meaning="times" role="MULOP">⁢</XMTok>
                  <XMText>Render</XMText>
                  <XMDual>
                    <XMRef idref="S3.SS2.SSS0.Px2.p2.m1.1"/>
                    <XMWrap>
                      <XMTok role="OPEN" stretchy="false">(</XMTok>
                      <XMTok name="cdot" role="MULOP" xml:id="S3.SS2.SSS0.Px2.p2.m1.1">⋅</XMTok>
                      <XMTok role="CLOSE" stretchy="false">)</XMTok>
                    </XMWrap>
                  </XMDual>
                </XMApp>
              </XMath>
            </Math> function computes the segmentation mask of a predicted physical object, assuming all other physical objects are rendered. To compute this, we initialize a palette <Math mode="inline" tex="{\bm{p}}_{0}=\textbf{1}" text="p _ 0 = [1]" xml:id="S3.SS2.SSS0.Px2.p2.m2">
              <XMath>
                <XMApp>
                  <XMTok meaning="equals" role="RELOP">=</XMTok>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="bold italic" role="UNKNOWN">p</XMTok>
                    <XMTok fontsize="70%" meaning="0" role="NUMBER">0</XMTok>
                  </XMApp>
                  <XMText><text font="bold">1</text></XMText>
                </XMApp>
              </XMath>
            </Math>, which we define to represent the context in an image that has not been rendered yet. We further utilize a separate pre-trained model Project<Math mode="inline" tex="(\cdot)" text="cdot" xml:id="S3.SS2.SSS0.Px2.p2.m3">
              <XMath>
                <XMDual>
                  <XMRef idref="S3.SS2.SSS0.Px2.p2.m3.1"/>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="false">(</XMTok>
                    <XMTok name="cdot" role="MULOP" xml:id="S3.SS2.SSS0.Px2.p2.m3.1">⋅</XMTok>
                    <XMTok role="CLOSE" stretchy="false">)</XMTok>
                  </XMWrap>
                </XMDual>
              </XMath>
            </Math> that projects each primitive in 3D to a 2D segmentation mask (the inverse of the backprojection model described above). We then reorder the predicted physical objects by increasing distance from the camera, re-indexing them as <Math mode="inline" tex="(\hat{{\bm{t}}}_{k^{\prime}}^{t},\hat{{\bm{s}}}_{k^{\prime}}^{t},\hat{{\bm{q}}%&#10;}_{k^{\prime}}^{t})" text="vector@(((hat@(t)) _ (k ^ prime)) ^ t, ((hat@(s)) _ (k ^ prime)) ^ t, ((hat@(q)) _ (k ^ prime)) ^ t)" xml:id="S3.SS2.SSS0.Px2.p2.m4">
              <XMath>
                <XMDual>
                  <XMApp>
                    <XMTok meaning="vector"/>
                    <XMRef idref="S3.SS2.SSS0.Px2.p2.m4.1"/>
                    <XMRef idref="S3.SS2.SSS0.Px2.p2.m4.2"/>
                    <XMRef idref="S3.SS2.SSS0.Px2.p2.m4.3"/>
                  </XMApp>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="false">(</XMTok>
                    <XMApp xml:id="S3.SS2.SSS0.Px2.p2.m4.1">
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMApp>
                          <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                          <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                        </XMApp>
                        <XMApp>
                          <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                          <XMTok fontsize="50%" name="prime" role="SUPOP">′</XMTok>
                        </XMApp>
                      </XMApp>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                    </XMApp>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMApp xml:id="S3.SS2.SSS0.Px2.p2.m4.2">
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMApp>
                          <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                          <XMTok font="bold italic" role="UNKNOWN">s</XMTok>
                        </XMApp>
                        <XMApp>
                          <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                          <XMTok fontsize="50%" name="prime" role="SUPOP">′</XMTok>
                        </XMApp>
                      </XMApp>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                    </XMApp>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMApp xml:id="S3.SS2.SSS0.Px2.p2.m4.3">
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMApp>
                          <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                          <XMTok font="bold italic" role="UNKNOWN">q</XMTok>
                        </XMApp>
                        <XMApp>
                          <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                          <XMTok fontsize="50%" name="prime" role="SUPOP">′</XMTok>
                        </XMApp>
                      </XMApp>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                    </XMApp>
                    <XMTok role="CLOSE" stretchy="false">)</XMTok>
                  </XMWrap>
                </XMDual>
              </XMath>
            </Math>. We then sequentially render each predicted physical object using <Math mode="inline" tex="\text{Render}(\hat{{\bm{t}}}_{k^{\prime}}^{t},\hat{{\bm{s}}}_{k^{\prime}}^{t},%&#10;\hat{{\bm{q}}}_{k^{\prime}}^{t})={\bm{p}}_{k^{\prime}-1}\text{Project}(\hat{{%&#10;\bm{t}}}_{k^{\prime}}^{t},\hat{{\bm{s}}}_{k^{\prime}}^{t},\hat{{\bm{q}}}_{k^{%&#10;\prime}}^{t})" text="[Render] * vector@(((hat@(t)) _ (k ^ prime)) ^ t, ((hat@(s)) _ (k ^ prime)) ^ t, ((hat@(q)) _ (k ^ prime)) ^ t) = p _ (k ^ prime - 1) * [Project] * vector@(((hat@(t)) _ (k ^ prime)) ^ t, ((hat@(s)) _ (k ^ prime)) ^ t, ((hat@(q)) _ (k ^ prime)) ^ t)" xml:id="S3.SS2.SSS0.Px2.p2.m5">
              <XMath>
                <XMApp>
                  <XMTok meaning="equals" role="RELOP">=</XMTok>
                  <XMApp>
                    <XMTok meaning="times" role="MULOP">⁢</XMTok>
                    <XMText>Render</XMText>
                    <XMDual>
                      <XMApp>
                        <XMTok meaning="vector"/>
                        <XMRef idref="S3.SS2.SSS0.Px2.p2.m5.1"/>
                        <XMRef idref="S3.SS2.SSS0.Px2.p2.m5.2"/>
                        <XMRef idref="S3.SS2.SSS0.Px2.p2.m5.3"/>
                      </XMApp>
                      <XMWrap>
                        <XMTok role="OPEN" stretchy="false">(</XMTok>
                        <XMApp xml:id="S3.SS2.SSS0.Px2.p2.m5.1">
                          <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                            <XMApp>
                              <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                              <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMApp>
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                              <XMTok fontsize="50%" name="prime" role="SUPOP">′</XMTok>
                            </XMApp>
                          </XMApp>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                        </XMApp>
                        <XMTok role="PUNCT">,</XMTok>
                        <XMApp xml:id="S3.SS2.SSS0.Px2.p2.m5.2">
                          <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                            <XMApp>
                              <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                              <XMTok font="bold italic" role="UNKNOWN">s</XMTok>
                            </XMApp>
                            <XMApp>
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                              <XMTok fontsize="50%" name="prime" role="SUPOP">′</XMTok>
                            </XMApp>
                          </XMApp>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                        </XMApp>
                        <XMTok role="PUNCT">,</XMTok>
                        <XMApp xml:id="S3.SS2.SSS0.Px2.p2.m5.3">
                          <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                            <XMApp>
                              <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                              <XMTok font="bold italic" role="UNKNOWN">q</XMTok>
                            </XMApp>
                            <XMApp>
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                              <XMTok fontsize="50%" name="prime" role="SUPOP">′</XMTok>
                            </XMApp>
                          </XMApp>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                        </XMApp>
                        <XMTok role="CLOSE" stretchy="false">)</XMTok>
                      </XMWrap>
                    </XMDual>
                  </XMApp>
                  <XMApp>
                    <XMTok meaning="times" role="MULOP">⁢</XMTok>
                    <XMApp>
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="bold italic" role="UNKNOWN">p</XMTok>
                      <XMApp>
                        <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                        <XMApp>
                          <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                          <XMTok fontsize="50%" name="prime" role="SUPOP">′</XMTok>
                        </XMApp>
                        <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                      </XMApp>
                    </XMApp>
                    <XMText>Project</XMText>
                    <XMDual>
                      <XMApp>
                        <XMTok meaning="vector"/>
                        <XMRef idref="S3.SS2.SSS0.Px2.p2.m5.4"/>
                        <XMRef idref="S3.SS2.SSS0.Px2.p2.m5.5"/>
                        <XMRef idref="S3.SS2.SSS0.Px2.p2.m5.6"/>
                      </XMApp>
                      <XMWrap>
                        <XMTok role="OPEN" stretchy="false">(</XMTok>
                        <XMApp xml:id="S3.SS2.SSS0.Px2.p2.m5.4">
                          <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                            <XMApp>
                              <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                              <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMApp>
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                              <XMTok fontsize="50%" name="prime" role="SUPOP">′</XMTok>
                            </XMApp>
                          </XMApp>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                        </XMApp>
                        <XMTok role="PUNCT">,</XMTok>
                        <XMApp xml:id="S3.SS2.SSS0.Px2.p2.m5.5">
                          <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                            <XMApp>
                              <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                              <XMTok font="bold italic" role="UNKNOWN">s</XMTok>
                            </XMApp>
                            <XMApp>
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                              <XMTok fontsize="50%" name="prime" role="SUPOP">′</XMTok>
                            </XMApp>
                          </XMApp>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                        </XMApp>
                        <XMTok role="PUNCT">,</XMTok>
                        <XMApp xml:id="S3.SS2.SSS0.Px2.p2.m5.6">
                          <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                            <XMApp>
                              <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                              <XMTok font="bold italic" role="UNKNOWN">q</XMTok>
                            </XMApp>
                            <XMApp>
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                              <XMTok fontsize="50%" name="prime" role="SUPOP">′</XMTok>
                            </XMApp>
                          </XMApp>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                        </XMApp>
                        <XMTok role="CLOSE" stretchy="false">)</XMTok>
                      </XMWrap>
                    </XMDual>
                  </XMApp>
                </XMApp>
              </XMath>
            </Math>, and update the corresponding palette to be rendered as <Math mode="inline" tex="{\bm{p}}_{k^{\prime}}={\bm{p}}_{k^{\prime}-1}(1-\text{Project}(\hat{{\bm{t}}}_%&#10;{k^{\prime}}^{t},\hat{{\bm{s}}}_{k^{\prime}}^{t},\hat{{\bm{q}}}_{k^{\prime}}^{%&#10;t}))" text="p _ (k ^ prime) = p _ (k ^ prime - 1) * (1 - [Project] * vector@(((hat@(t)) _ (k ^ prime)) ^ t, ((hat@(s)) _ (k ^ prime)) ^ t, ((hat@(q)) _ (k ^ prime)) ^ t))" xml:id="S3.SS2.SSS0.Px2.p2.m6">
              <XMath>
                <XMApp>
                  <XMTok meaning="equals" role="RELOP">=</XMTok>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="bold italic" role="UNKNOWN">p</XMTok>
                    <XMApp>
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                      <XMTok fontsize="50%" name="prime" role="SUPOP">′</XMTok>
                    </XMApp>
                  </XMApp>
                  <XMApp>
                    <XMTok meaning="times" role="MULOP">⁢</XMTok>
                    <XMApp>
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="bold italic" role="UNKNOWN">p</XMTok>
                      <XMApp>
                        <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                        <XMApp>
                          <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                          <XMTok fontsize="50%" name="prime" role="SUPOP">′</XMTok>
                        </XMApp>
                        <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                      </XMApp>
                    </XMApp>
                    <XMDual>
                      <XMRef idref="S3.SS2.SSS0.Px2.p2.m6.1"/>
                      <XMWrap>
                        <XMTok role="OPEN" stretchy="false">(</XMTok>
                        <XMApp xml:id="S3.SS2.SSS0.Px2.p2.m6.1">
                          <XMTok meaning="minus" role="ADDOP">-</XMTok>
                          <XMTok meaning="1" role="NUMBER">1</XMTok>
                          <XMApp>
                            <XMTok meaning="times" role="MULOP">⁢</XMTok>
                            <XMText>Project</XMText>
                            <XMDual>
                              <XMApp>
                                <XMTok meaning="vector"/>
                                <XMRef idref="S3.SS2.SSS0.Px2.p2.m6.1.1"/>
                                <XMRef idref="S3.SS2.SSS0.Px2.p2.m6.1.2"/>
                                <XMRef idref="S3.SS2.SSS0.Px2.p2.m6.1.3"/>
                              </XMApp>
                              <XMWrap>
                                <XMTok role="OPEN" stretchy="false">(</XMTok>
                                <XMApp xml:id="S3.SS2.SSS0.Px2.p2.m6.1.1">
                                  <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                  <XMApp>
                                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                    <XMApp>
                                      <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                                      <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                                    </XMApp>
                                    <XMApp>
                                      <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                      <XMTok fontsize="50%" name="prime" role="SUPOP">′</XMTok>
                                    </XMApp>
                                  </XMApp>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                </XMApp>
                                <XMTok role="PUNCT">,</XMTok>
                                <XMApp xml:id="S3.SS2.SSS0.Px2.p2.m6.1.2">
                                  <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                  <XMApp>
                                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                    <XMApp>
                                      <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                                      <XMTok font="bold italic" role="UNKNOWN">s</XMTok>
                                    </XMApp>
                                    <XMApp>
                                      <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                      <XMTok fontsize="50%" name="prime" role="SUPOP">′</XMTok>
                                    </XMApp>
                                  </XMApp>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                </XMApp>
                                <XMTok role="PUNCT">,</XMTok>
                                <XMApp xml:id="S3.SS2.SSS0.Px2.p2.m6.1.3">
                                  <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                                  <XMApp>
                                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                    <XMApp>
                                      <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                                      <XMTok font="bold italic" role="UNKNOWN">q</XMTok>
                                    </XMApp>
                                    <XMApp>
                                      <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                      <XMTok fontsize="50%" name="prime" role="SUPOP">′</XMTok>
                                    </XMApp>
                                  </XMApp>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                </XMApp>
                                <XMTok role="CLOSE" stretchy="false">)</XMTok>
                              </XMWrap>
                            </XMDual>
                          </XMApp>
                        </XMApp>
                        <XMTok role="CLOSE" stretchy="false">)</XMTok>
                      </XMWrap>
                    </XMDual>
                  </XMApp>
                </XMApp>
              </XMath>
            </Math>.</p>
        </para>
        <para xml:id="S3.SS2.SSS0.Px2.p3">
          <p>Given modeled future states, the overall likelihood of a physical object <Math mode="inline" tex="({\bm{t}}_{k}^{t},{\bm{s}}_{k}^{t},{\bm{q}}_{k}^{t},{\bm{m}}_{k}^{t})" text="vector@((t _ k) ^ t, (s _ k) ^ t, (q _ k) ^ t, (m _ k) ^ t)" xml:id="S3.SS2.SSS0.Px2.p3.m1">
              <XMath>
                <XMDual>
                  <XMApp>
                    <XMTok meaning="vector"/>
                    <XMRef idref="S3.SS2.SSS0.Px2.p3.m1.1"/>
                    <XMRef idref="S3.SS2.SSS0.Px2.p3.m1.2"/>
                    <XMRef idref="S3.SS2.SSS0.Px2.p3.m1.3"/>
                    <XMRef idref="S3.SS2.SSS0.Px2.p3.m1.4"/>
                  </XMApp>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="false">(</XMTok>
                    <XMApp xml:id="S3.SS2.SSS0.Px2.p3.m1.1">
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                      </XMApp>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                    </XMApp>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMApp xml:id="S3.SS2.SSS0.Px2.p3.m1.2">
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="bold italic" role="UNKNOWN">s</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                      </XMApp>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                    </XMApp>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMApp xml:id="S3.SS2.SSS0.Px2.p3.m1.3">
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="bold italic" role="UNKNOWN">q</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                      </XMApp>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                    </XMApp>
                    <XMTok role="PUNCT">,</XMTok>
                    <XMApp xml:id="S3.SS2.SSS0.Px2.p3.m1.4">
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                      </XMApp>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                    </XMApp>
                    <XMTok role="CLOSE" stretchy="false">)</XMTok>
                  </XMWrap>
                </XMDual>
              </XMath>
            </Math> is given by</p>
          <equation labels="LABEL:eqn:physics" xml:id="S3.E3">
            <tags>
              <tag>(3)</tag>
              <tag role="autoref">Equation 3</tag>
              <tag role="refnum">3</tag>
            </tags>
            <Math mode="display" tex="p({\bm{t}}_{k}^{t},{\bm{s}}_{k}^{t},{\bm{q}}_{k}^{t},{\bm{m}}_{k}^{t})={%&#10;\mathcal{N}}({\bm{t}}_{k}^{t};\hat{{\bm{t}}}_{k}^{t},\sigma_{t}^{2}){\mathcal{%&#10;N}}({\bm{s}}_{k}^{t};\hat{{\bm{s}}}_{k}^{t},\sigma_{s}^{2}){\mathcal{N}}({\bm{%&#10;q}}_{k}^{t};\hat{{\bm{q}}}_{k}^{t},\sigma_{q}^{2})p({\bm{m}}_{k}^{t},\hat{{\bm%&#10;{m}}}_{k}^{t})," text="p * vector@((t _ k) ^ t, (s _ k) ^ t, (q _ k) ^ t, (m _ k) ^ t) = N * list@((t _ k) ^ t, ((hat@(t)) _ k) ^ t, (sigma _ t) ^ 2) * N * list@((s _ k) ^ t, ((hat@(s)) _ k) ^ t, (sigma _ s) ^ 2) * N * list@((q _ k) ^ t, ((hat@(q)) _ k) ^ t, (sigma _ q) ^ 2) * p * open-interval@((m _ k) ^ t, ((hat@(m)) _ k) ^ t)" xml:id="S3.E3.m1">
              <XMath>
                <XMDual>
                  <XMRef idref="S3.E3.m1.1"/>
                  <XMWrap>
                    <XMApp xml:id="S3.E3.m1.1">
                      <XMTok meaning="equals" role="RELOP">=</XMTok>
                      <XMApp>
                        <XMTok meaning="times" role="MULOP">⁢</XMTok>
                        <XMTok font="italic" role="UNKNOWN">p</XMTok>
                        <XMDual>
                          <XMApp>
                            <XMTok meaning="vector"/>
                            <XMRef idref="S3.E3.m1.1.1"/>
                            <XMRef idref="S3.E3.m1.1.2"/>
                            <XMRef idref="S3.E3.m1.1.3"/>
                            <XMRef idref="S3.E3.m1.1.4"/>
                          </XMApp>
                          <XMWrap>
                            <XMTok role="OPEN" stretchy="false">(</XMTok>
                            <XMApp xml:id="S3.E3.m1.1.1">
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                              <XMApp>
                                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                              </XMApp>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMTok role="PUNCT">,</XMTok>
                            <XMApp xml:id="S3.E3.m1.1.2">
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                              <XMApp>
                                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                <XMTok font="bold italic" role="UNKNOWN">s</XMTok>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                              </XMApp>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMTok role="PUNCT">,</XMTok>
                            <XMApp xml:id="S3.E3.m1.1.3">
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                              <XMApp>
                                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                <XMTok font="bold italic" role="UNKNOWN">q</XMTok>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                              </XMApp>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMTok role="PUNCT">,</XMTok>
                            <XMApp xml:id="S3.E3.m1.1.4">
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                              <XMApp>
                                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                              </XMApp>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMTok role="CLOSE" stretchy="false">)</XMTok>
                          </XMWrap>
                        </XMDual>
                      </XMApp>
                      <XMApp>
                        <XMTok meaning="times" role="MULOP">⁢</XMTok>
                        <XMTok font="caligraphic" role="UNKNOWN">N</XMTok>
                        <XMDual>
                          <XMApp>
                            <XMTok meaning="list"/>
                            <XMRef idref="S3.E3.m1.1.5"/>
                            <XMRef idref="S3.E3.m1.1.6"/>
                            <XMRef idref="S3.E3.m1.1.7"/>
                          </XMApp>
                          <XMWrap>
                            <XMTok role="OPEN" stretchy="false">(</XMTok>
                            <XMApp xml:id="S3.E3.m1.1.5">
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                              <XMApp>
                                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                              </XMApp>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMTok role="PUNCT">;</XMTok>
                            <XMApp xml:id="S3.E3.m1.1.6">
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                              <XMApp>
                                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                <XMApp>
                                  <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                                  <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                                </XMApp>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                              </XMApp>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMTok role="PUNCT">,</XMTok>
                            <XMApp xml:id="S3.E3.m1.1.7">
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                              <XMApp>
                                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                <XMTok font="italic" name="sigma" role="UNKNOWN">σ</XMTok>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                              </XMApp>
                              <XMTok fontsize="70%" meaning="2" role="NUMBER">2</XMTok>
                            </XMApp>
                            <XMTok role="CLOSE" stretchy="false">)</XMTok>
                          </XMWrap>
                        </XMDual>
                        <XMTok font="caligraphic" role="UNKNOWN">N</XMTok>
                        <XMDual>
                          <XMApp>
                            <XMTok meaning="list"/>
                            <XMRef idref="S3.E3.m1.1.8"/>
                            <XMRef idref="S3.E3.m1.1.9"/>
                            <XMRef idref="S3.E3.m1.1.10"/>
                          </XMApp>
                          <XMWrap>
                            <XMTok role="OPEN" stretchy="false">(</XMTok>
                            <XMApp xml:id="S3.E3.m1.1.8">
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                              <XMApp>
                                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                <XMTok font="bold italic" role="UNKNOWN">s</XMTok>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                              </XMApp>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMTok role="PUNCT">;</XMTok>
                            <XMApp xml:id="S3.E3.m1.1.9">
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                              <XMApp>
                                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                <XMApp>
                                  <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                                  <XMTok font="bold italic" role="UNKNOWN">s</XMTok>
                                </XMApp>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                              </XMApp>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMTok role="PUNCT">,</XMTok>
                            <XMApp xml:id="S3.E3.m1.1.10">
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                              <XMApp>
                                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                <XMTok font="italic" name="sigma" role="UNKNOWN">σ</XMTok>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">s</XMTok>
                              </XMApp>
                              <XMTok fontsize="70%" meaning="2" role="NUMBER">2</XMTok>
                            </XMApp>
                            <XMTok role="CLOSE" stretchy="false">)</XMTok>
                          </XMWrap>
                        </XMDual>
                        <XMTok font="caligraphic" role="UNKNOWN">N</XMTok>
                        <XMDual>
                          <XMApp>
                            <XMTok meaning="list"/>
                            <XMRef idref="S3.E3.m1.1.11"/>
                            <XMRef idref="S3.E3.m1.1.12"/>
                            <XMRef idref="S3.E3.m1.1.13"/>
                          </XMApp>
                          <XMWrap>
                            <XMTok role="OPEN" stretchy="false">(</XMTok>
                            <XMApp xml:id="S3.E3.m1.1.11">
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                              <XMApp>
                                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                <XMTok font="bold italic" role="UNKNOWN">q</XMTok>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                              </XMApp>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMTok role="PUNCT">;</XMTok>
                            <XMApp xml:id="S3.E3.m1.1.12">
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                              <XMApp>
                                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                <XMApp>
                                  <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                                  <XMTok font="bold italic" role="UNKNOWN">q</XMTok>
                                </XMApp>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                              </XMApp>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMTok role="PUNCT">,</XMTok>
                            <XMApp xml:id="S3.E3.m1.1.13">
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                              <XMApp>
                                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                <XMTok font="italic" name="sigma" role="UNKNOWN">σ</XMTok>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">q</XMTok>
                              </XMApp>
                              <XMTok fontsize="70%" meaning="2" role="NUMBER">2</XMTok>
                            </XMApp>
                            <XMTok role="CLOSE" stretchy="false">)</XMTok>
                          </XMWrap>
                        </XMDual>
                        <XMTok font="italic" role="UNKNOWN">p</XMTok>
                        <XMDual>
                          <XMApp>
                            <XMTok meaning="open-interval"/>
                            <XMRef idref="S3.E3.m1.1.14"/>
                            <XMRef idref="S3.E3.m1.1.15"/>
                          </XMApp>
                          <XMWrap>
                            <XMTok role="OPEN" stretchy="false">(</XMTok>
                            <XMApp xml:id="S3.E3.m1.1.14">
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                              <XMApp>
                                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                              </XMApp>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMTok role="PUNCT">,</XMTok>
                            <XMApp xml:id="S3.E3.m1.1.15">
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                              <XMApp>
                                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                <XMApp>
                                  <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                                  <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                                </XMApp>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                              </XMApp>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMTok role="CLOSE" stretchy="false">)</XMTok>
                          </XMWrap>
                        </XMDual>
                      </XMApp>
                    </XMApp>
                    <XMTok role="PUNCT">,</XMTok>
                  </XMWrap>
                </XMDual>
              </XMath>
            </Math>
          </equation>
          <p>where we assume Gaussian distributions over translations, sizes, and rotations with <Math mode="inline" tex="\sigma_{s}=\sigma_{t}=\sigma_{q}=1" text="sigma _ s = sigma _ t = sigma _ q = 1" xml:id="S3.SS2.SSS0.Px2.p3.m2">
              <XMath>
                <XMApp>
                  <XMTok meaning="multirelation"/>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="italic" name="sigma" role="UNKNOWN">σ</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">s</XMTok>
                  </XMApp>
                  <XMTok meaning="equals" role="RELOP">=</XMTok>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="italic" name="sigma" role="UNKNOWN">σ</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                  </XMApp>
                  <XMTok meaning="equals" role="RELOP">=</XMTok>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="italic" name="sigma" role="UNKNOWN">σ</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">q</XMTok>
                  </XMApp>
                  <XMTok meaning="equals" role="RELOP">=</XMTok>
                  <XMTok meaning="1" role="NUMBER">1</XMTok>
                </XMApp>
              </XMath>
            </Math>. <Math mode="inline" tex="p(\cdot)" text="p * cdot" xml:id="S3.SS2.SSS0.Px2.p3.m3">
              <XMath>
                <XMApp>
                  <XMTok meaning="times" role="MULOP">⁢</XMTok>
                  <XMTok font="italic" role="UNKNOWN">p</XMTok>
                  <XMDual>
                    <XMRef idref="S3.SS2.SSS0.Px2.p3.m3.1"/>
                    <XMWrap>
                      <XMTok role="OPEN" stretchy="false">(</XMTok>
                      <XMTok name="cdot" role="MULOP" xml:id="S3.SS2.SSS0.Px2.p3.m3.1">⋅</XMTok>
                      <XMTok role="CLOSE" stretchy="false">)</XMTok>
                    </XMWrap>
                  </XMDual>
                </XMApp>
              </XMath>
            </Math> is the probability of a predicted mask, defined as <Math mode="inline" tex="p(\hat{{\bm{m}}}_{k}^{t},{\bm{m}}_{k}^{t})=\mathbbm{1}_{{\bm{m}}_{k}^{t}&gt;0.5}%&#10;\hat{{\bm{m}}}_{k}^{t}+(1-\mathbbm{1}_{{\bm{m}}_{k}^{t}&gt;0.5})(1-\hat{{\bm{m}}}%&#10;_{k}^{t})" text="p * open-interval@(((hat@(m)) _ k) ^ t, (m _ k) ^ t) = 1 _ ((m _ k) ^ t &gt; 0.5) * ((hat@(m)) _ k) ^ t + (1 - 1 _ ((m _ k) ^ t &gt; 0.5)) * (1 - ((hat@(m)) _ k) ^ t)" xml:id="S3.SS2.SSS0.Px2.p3.m4">
              <XMath>
                <XMApp>
                  <XMTok meaning="equals" role="RELOP">=</XMTok>
                  <XMApp>
                    <XMTok meaning="times" role="MULOP">⁢</XMTok>
                    <XMTok font="italic" role="UNKNOWN">p</XMTok>
                    <XMDual>
                      <XMApp>
                        <XMTok meaning="open-interval"/>
                        <XMRef idref="S3.SS2.SSS0.Px2.p3.m4.1"/>
                        <XMRef idref="S3.SS2.SSS0.Px2.p3.m4.2"/>
                      </XMApp>
                      <XMWrap>
                        <XMTok role="OPEN" stretchy="false">(</XMTok>
                        <XMApp xml:id="S3.SS2.SSS0.Px2.p3.m4.1">
                          <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                            <XMApp>
                              <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                              <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                            </XMApp>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                          </XMApp>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                        </XMApp>
                        <XMTok role="PUNCT">,</XMTok>
                        <XMApp xml:id="S3.SS2.SSS0.Px2.p3.m4.2">
                          <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                            <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                          </XMApp>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                        </XMApp>
                        <XMTok role="CLOSE" stretchy="false">)</XMTok>
                      </XMWrap>
                    </XMDual>
                  </XMApp>
                  <XMApp>
                    <XMTok meaning="plus" role="ADDOP">+</XMTok>
                    <XMApp>
                      <XMTok meaning="times" role="MULOP">⁢</XMTok>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="blackboard" meaning="1" role="NUMBER">1</XMTok>
                        <XMApp>
                          <XMTok fontsize="70%" meaning="greater-than" role="RELOP">&gt;</XMTok>
                          <XMApp>
                            <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                              <XMTok font="bold italic" fontsize="70%" role="UNKNOWN">m</XMTok>
                              <XMTok font="italic" fontsize="50%" role="UNKNOWN">k</XMTok>
                            </XMApp>
                            <XMTok font="italic" fontsize="50%" role="UNKNOWN">t</XMTok>
                          </XMApp>
                          <XMTok fontsize="70%" meaning="0.5" role="NUMBER">0.5</XMTok>
                        </XMApp>
                      </XMApp>
                      <XMApp>
                        <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                        <XMApp>
                          <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                          <XMApp>
                            <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                            <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                          </XMApp>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                        </XMApp>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                      </XMApp>
                    </XMApp>
                    <XMApp>
                      <XMTok meaning="times" role="MULOP">⁢</XMTok>
                      <XMDual>
                        <XMRef idref="S3.SS2.SSS0.Px2.p3.m4.3"/>
                        <XMWrap>
                          <XMTok role="OPEN" stretchy="false">(</XMTok>
                          <XMApp xml:id="S3.SS2.SSS0.Px2.p3.m4.3">
                            <XMTok meaning="minus" role="ADDOP">-</XMTok>
                            <XMTok meaning="1" role="NUMBER">1</XMTok>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                              <XMTok font="blackboard" meaning="1" role="NUMBER">1</XMTok>
                              <XMApp>
                                <XMTok fontsize="70%" meaning="greater-than" role="RELOP">&gt;</XMTok>
                                <XMApp>
                                  <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                                  <XMApp>
                                    <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                                    <XMTok font="bold italic" fontsize="70%" role="UNKNOWN">m</XMTok>
                                    <XMTok font="italic" fontsize="50%" role="UNKNOWN">k</XMTok>
                                  </XMApp>
                                  <XMTok font="italic" fontsize="50%" role="UNKNOWN">t</XMTok>
                                </XMApp>
                                <XMTok fontsize="70%" meaning="0.5" role="NUMBER">0.5</XMTok>
                              </XMApp>
                            </XMApp>
                          </XMApp>
                          <XMTok role="CLOSE" stretchy="false">)</XMTok>
                        </XMWrap>
                      </XMDual>
                      <XMDual>
                        <XMRef idref="S3.SS2.SSS0.Px2.p3.m4.4"/>
                        <XMWrap>
                          <XMTok role="OPEN" stretchy="false">(</XMTok>
                          <XMApp xml:id="S3.SS2.SSS0.Px2.p3.m4.4">
                            <XMTok meaning="minus" role="ADDOP">-</XMTok>
                            <XMTok meaning="1" role="NUMBER">1</XMTok>
                            <XMApp>
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                              <XMApp>
                                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                                <XMApp>
                                  <XMTok name="hat" role="OVERACCENT" stretchy="false">^</XMTok>
                                  <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                                </XMApp>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                              </XMApp>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                          </XMApp>
                          <XMTok role="CLOSE" stretchy="false">)</XMTok>
                        </XMWrap>
                      </XMDual>
                    </XMApp>
                  </XMApp>
                </XMApp>
              </XMath>
            </Math>, where <Math mode="inline" tex="\mathbbm{1}_{(\cdot)}" text="1 _ cdot" xml:id="S3.SS2.SSS0.Px2.p3.m5">
              <XMath>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="blackboard" meaning="1" role="NUMBER">1</XMTok>
                  <XMDual>
                    <XMRef idref="S3.SS2.SSS0.Px2.p3.m5.1"/>
                    <XMWrap>
                      <XMTok fontsize="70%" role="OPEN" stretchy="false">(</XMTok>
                      <XMTok fontsize="70%" name="cdot" role="MULOP" xml:id="S3.SS2.SSS0.Px2.p3.m5.1">⋅</XMTok>
                      <XMTok fontsize="70%" role="CLOSE" stretchy="false">)</XMTok>
                    </XMWrap>
                  </XMDual>
                </XMApp>
              </XMath>
            </Math> is the indicator function on each individual pixel. Overall, our dynamics model seeks to enforce that objects have zero-order motion and maintain their shape.</p>
        </para>
<!--  %More␣complex␣physical␣effects␣between␣objects␣include␣gravity␣and␣inter-object␣collisions.␣␣These␣rules␣can␣also␣be␣written␣using␣decoded␣primitives,␣though␣we␣found␣limited␣gains␣to␣adding␣additional␣constraints␣on␣the␣datasets␣we␣test. -->      </paragraph>
      <paragraph inlist="toc" xml:id="S3.SS2.SSS0.Px3">
        <title>Image generative model.</title>
<!--  %****␣method.tex␣Line␣75␣**** -->        <para xml:id="S3.SS2.SSS0.Px3.p1">
          <p>We represent images <Math mode="inline" tex="{\bm{x}}^{t}\in\mathbb{R}^{D}" text="x ^ t element-of R ^ D" xml:id="S3.SS2.SSS0.Px3.p1.m1">
              <XMath>
                <XMApp>
                  <XMTok meaning="element-of" name="in" role="RELOP">∈</XMTok>
                  <XMApp>
                    <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="bold italic" role="UNKNOWN">x</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                  </XMApp>
                  <XMApp>
                    <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="blackboard" role="UNKNOWN">R</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">D</XMTok>
                  </XMApp>
                </XMApp>
              </XMath>
            </Math> at each time step as spatial Gaussian mixture models, with each mixture component being defined by a segmentation mask <Math mode="inline" tex="m" text="m" xml:id="S3.SS2.SSS0.Px3.p1.m2">
              <XMath>
                <XMTok font="italic" role="UNKNOWN">m</XMTok>
              </XMath>
            </Math> in <Math mode="inline" tex="\bm{M}" text="M" xml:id="S3.SS2.SSS0.Px3.p1.m3">
              <XMath>
                <XMTok font="bold italic" role="UNKNOWN">M</XMTok>
              </XMath>
            </Math> (Section <ref labelref="LABEL:sect:inference"/>). Each corresponding latent <Math mode="inline" tex="{\bm{z}}_{k}" text="z _ k" xml:id="S3.SS2.SSS0.Px3.p1.m4">
              <XMath>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                </XMApp>
              </XMath>
            </Math> is decoded to a pixel-wise mean <Math mode="inline" tex="\mu_{ik}" text="mu _ (i * k)" xml:id="S3.SS2.SSS0.Px3.p1.m5">
              <XMath>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" name="mu" role="UNKNOWN">μ</XMTok>
                  <XMApp>
                    <XMTok meaning="times" role="MULOP">⁢</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                  </XMApp>
                </XMApp>
              </XMath>
            </Math> and a pixel-wise mask prediction <Math mode="inline" tex="d_{ik}" text="d _ (i * k)" xml:id="S3.SS2.SSS0.Px3.p1.m6">
              <XMath>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">d</XMTok>
                  <XMApp>
                    <XMTok meaning="times" role="MULOP">⁢</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                  </XMApp>
                </XMApp>
              </XMath>
            </Math> using a VAE decoder <Math mode="inline" tex="\text{Decode}(\mu_{k},{\bm{d}}_{k}|{\bm{z}}_{k})" text="[Decode] * open-interval@(mu _ k, conditional@(d _ k, z _ k))" xml:id="S3.SS2.SSS0.Px3.p1.m7">
              <XMath>
                <XMApp>
                  <XMTok meaning="times" role="MULOP">⁢</XMTok>
                  <XMText>Decode</XMText>
                  <XMDual>
                    <XMApp>
                      <XMTok meaning="open-interval"/>
                      <XMRef idref="S3.SS2.SSS0.Px3.p1.m7.1"/>
                      <XMRef idref="S3.SS2.SSS0.Px3.p1.m7.2"/>
                    </XMApp>
                    <XMWrap>
                      <XMTok role="OPEN" stretchy="false">(</XMTok>
                      <XMApp xml:id="S3.SS2.SSS0.Px3.p1.m7.1">
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="italic" name="mu" role="UNKNOWN">μ</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                      </XMApp>
                      <XMTok role="PUNCT">,</XMTok>
                      <XMApp xml:id="S3.SS2.SSS0.Px3.p1.m7.2">
                        <XMTok meaning="conditional" role="MODIFIEROP" stretchy="false">|</XMTok>
                        <XMApp>
                          <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                          <XMTok font="bold italic" role="UNKNOWN">d</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                        </XMApp>
                        <XMApp>
                          <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                          <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                        </XMApp>
                      </XMApp>
                      <XMTok role="CLOSE" stretchy="false">)</XMTok>
                    </XMWrap>
                  </XMDual>
                </XMApp>
              </XMath>
            </Math>. We assume each pixel <Math mode="inline" tex="i" text="i" xml:id="S3.SS2.SSS0.Px3.p1.m8">
              <XMath>
                <XMTok font="italic" role="UNKNOWN">i</XMTok>
              </XMath>
            </Math> is conditionally independent given <Math mode="inline" tex="{\bm{z}}" text="z" xml:id="S3.SS2.SSS0.Px3.p1.m9">
              <XMath>
                <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
              </XMath>
            </Math>, so that the likelihood becomes</p>
          <equation labels="LABEL:eqn:img" xml:id="S3.E4">
            <tags>
              <tag>(4)</tag>
              <tag role="autoref">Equation 4</tag>
              <tag role="refnum">4</tag>
            </tags>
            <Math mode="display" tex="p({\bm{x}}|{\bm{z}})=\prod_{i=1}^{D}\left(\sum_{k=1}^{K}\left(m_{ik}{\mathcal{%&#10;N}}(x_{i};\mu_{ik},\sigma^{2})\times p_{\theta}(d_{ik}|{\bm{z}}_{k})\right)+m_%&#10;{ib}{\mathcal{N}}(x_{i};\mu_{ib},\sigma_{b}^{2})\times p_{\theta}(d_{ib}|{\bm{%&#10;z}}_{b})\right)" text="p * conditional@(x, z) = ((product _ (i = 1)) ^ D)@(((sum _ (k = 1)) ^ K)@(m _ (i * k) * N * list@(x _ i, mu _ (i * k), sigma ^ 2) * p _ theta * conditional@(d _ (i * k), z _ k)) + m _ (i * b) * N * list@(x _ i, mu _ (i * b), (sigma _ b) ^ 2) * p _ theta * conditional@(d _ (i * b), z _ b))" xml:id="S3.E4.m1">
              <XMath>
                <XMApp>
                  <XMTok meaning="equals" role="RELOP">=</XMTok>
                  <XMApp>
                    <XMTok meaning="times" role="MULOP">⁢</XMTok>
                    <XMTok font="italic" role="UNKNOWN">p</XMTok>
                    <XMDual>
                      <XMRef idref="S3.E4.m1.1"/>
                      <XMWrap>
                        <XMTok role="OPEN" stretchy="false">(</XMTok>
                        <XMApp xml:id="S3.E4.m1.1">
                          <XMTok meaning="conditional" role="MODIFIEROP" stretchy="false">|</XMTok>
                          <XMTok font="bold italic" role="UNKNOWN">x</XMTok>
                          <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
                        </XMApp>
                        <XMTok role="CLOSE" stretchy="false">)</XMTok>
                      </XMWrap>
                    </XMDual>
                  </XMApp>
                  <XMApp>
                    <XMApp scriptpos="mid">
                      <XMTok role="SUPERSCRIPTOP" scriptpos="mid1"/>
                      <XMApp scriptpos="mid">
                        <XMTok role="SUBSCRIPTOP" scriptpos="mid1"/>
                        <XMTok mathstyle="display" meaning="product" name="prod" role="SUMOP" scriptpos="mid">∏</XMTok>
                        <XMApp>
                          <XMTok fontsize="70%" meaning="equals" role="RELOP">=</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                          <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                        </XMApp>
                      </XMApp>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">D</XMTok>
                    </XMApp>
                    <XMDual>
                      <XMRef idref="S3.E4.m1.2"/>
                      <XMWrap>
                        <XMTok role="OPEN" stretchy="true">(</XMTok>
                        <XMApp xml:id="S3.E4.m1.2">
                          <XMTok meaning="plus" role="ADDOP">+</XMTok>
                          <XMApp>
                            <XMApp scriptpos="mid">
                              <XMTok role="SUPERSCRIPTOP" scriptpos="mid2"/>
                              <XMApp scriptpos="mid">
                                <XMTok role="SUBSCRIPTOP" scriptpos="mid2"/>
                                <XMTok mathstyle="display" meaning="sum" role="SUMOP" scriptpos="mid">∑</XMTok>
                                <XMApp>
                                  <XMTok fontsize="70%" meaning="equals" role="RELOP">=</XMTok>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                  <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                                </XMApp>
                              </XMApp>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">K</XMTok>
                            </XMApp>
                            <XMDual>
                              <XMRef idref="S3.E4.m1.2.1"/>
                              <XMWrap>
                                <XMTok role="OPEN" stretchy="true">(</XMTok>
                                <XMApp xml:id="S3.E4.m1.2.1">
                                  <XMTok meaning="times" role="MULOP">⁢</XMTok>
                                  <XMApp>
                                    <XMTok meaning="times" role="MULOP">×</XMTok>
                                    <XMApp>
                                      <XMTok meaning="times" role="MULOP">⁢</XMTok>
                                      <XMApp>
                                        <XMTok role="SUBSCRIPTOP" scriptpos="post3"/>
                                        <XMTok font="italic" role="UNKNOWN">m</XMTok>
                                        <XMApp>
                                          <XMTok meaning="times" role="MULOP">⁢</XMTok>
                                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                        </XMApp>
                                      </XMApp>
                                      <XMTok font="caligraphic" role="UNKNOWN">N</XMTok>
                                      <XMDual>
                                        <XMApp>
                                          <XMTok meaning="list"/>
                                          <XMRef idref="S3.E4.m1.2.1.1"/>
                                          <XMRef idref="S3.E4.m1.2.1.2"/>
                                          <XMRef idref="S3.E4.m1.2.1.3"/>
                                        </XMApp>
                                        <XMWrap>
                                          <XMTok role="OPEN" stretchy="false">(</XMTok>
                                          <XMApp xml:id="S3.E4.m1.2.1.1">
                                            <XMTok role="SUBSCRIPTOP" scriptpos="post3"/>
                                            <XMTok font="italic" role="UNKNOWN">x</XMTok>
                                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                          </XMApp>
                                          <XMTok role="PUNCT">;</XMTok>
                                          <XMApp xml:id="S3.E4.m1.2.1.2">
                                            <XMTok role="SUBSCRIPTOP" scriptpos="post3"/>
                                            <XMTok font="italic" name="mu" role="UNKNOWN">μ</XMTok>
                                            <XMApp>
                                              <XMTok meaning="times" role="MULOP">⁢</XMTok>
                                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                            </XMApp>
                                          </XMApp>
                                          <XMTok role="PUNCT">,</XMTok>
                                          <XMApp xml:id="S3.E4.m1.2.1.3">
                                            <XMTok role="SUPERSCRIPTOP" scriptpos="post3"/>
                                            <XMTok font="italic" name="sigma" role="UNKNOWN">σ</XMTok>
                                            <XMTok fontsize="70%" meaning="2" role="NUMBER">2</XMTok>
                                          </XMApp>
                                          <XMTok role="CLOSE" stretchy="false">)</XMTok>
                                        </XMWrap>
                                      </XMDual>
                                    </XMApp>
                                    <XMApp>
                                      <XMTok role="SUBSCRIPTOP" scriptpos="post3"/>
                                      <XMTok font="italic" role="UNKNOWN">p</XMTok>
                                      <XMTok font="italic" fontsize="70%" name="theta" role="UNKNOWN">θ</XMTok>
                                    </XMApp>
                                  </XMApp>
                                  <XMDual>
                                    <XMRef idref="S3.E4.m1.2.1.4"/>
                                    <XMWrap>
                                      <XMTok role="OPEN" stretchy="false">(</XMTok>
                                      <XMApp xml:id="S3.E4.m1.2.1.4">
                                        <XMTok meaning="conditional" role="MODIFIEROP" stretchy="false">|</XMTok>
                                        <XMApp>
                                          <XMTok role="SUBSCRIPTOP" scriptpos="post3"/>
                                          <XMTok font="italic" role="UNKNOWN">d</XMTok>
                                          <XMApp>
                                            <XMTok meaning="times" role="MULOP">⁢</XMTok>
                                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                          </XMApp>
                                        </XMApp>
                                        <XMApp>
                                          <XMTok role="SUBSCRIPTOP" scriptpos="post3"/>
                                          <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
                                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                                        </XMApp>
                                      </XMApp>
                                      <XMTok role="CLOSE" stretchy="false">)</XMTok>
                                    </XMWrap>
                                  </XMDual>
                                </XMApp>
                                <XMTok role="CLOSE" stretchy="true">)</XMTok>
                              </XMWrap>
                            </XMDual>
                          </XMApp>
                          <XMApp>
                            <XMTok meaning="times" role="MULOP">⁢</XMTok>
                            <XMApp>
                              <XMTok meaning="times" role="MULOP">×</XMTok>
                              <XMApp>
                                <XMTok meaning="times" role="MULOP">⁢</XMTok>
                                <XMApp>
                                  <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                                  <XMTok font="italic" role="UNKNOWN">m</XMTok>
                                  <XMApp>
                                    <XMTok meaning="times" role="MULOP">⁢</XMTok>
                                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
                                  </XMApp>
                                </XMApp>
                                <XMTok font="caligraphic" role="UNKNOWN">N</XMTok>
                                <XMDual>
                                  <XMApp>
                                    <XMTok meaning="list"/>
                                    <XMRef idref="S3.E4.m1.2.2"/>
                                    <XMRef idref="S3.E4.m1.2.3"/>
                                    <XMRef idref="S3.E4.m1.2.4"/>
                                  </XMApp>
                                  <XMWrap>
                                    <XMTok role="OPEN" stretchy="false">(</XMTok>
                                    <XMApp xml:id="S3.E4.m1.2.2">
                                      <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                                      <XMTok font="italic" role="UNKNOWN">x</XMTok>
                                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                    </XMApp>
                                    <XMTok role="PUNCT">;</XMTok>
                                    <XMApp xml:id="S3.E4.m1.2.3">
                                      <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                                      <XMTok font="italic" name="mu" role="UNKNOWN">μ</XMTok>
                                      <XMApp>
                                        <XMTok meaning="times" role="MULOP">⁢</XMTok>
                                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
                                      </XMApp>
                                    </XMApp>
                                    <XMTok role="PUNCT">,</XMTok>
                                    <XMApp xml:id="S3.E4.m1.2.4">
                                      <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                                      <XMApp>
                                        <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                                        <XMTok font="italic" name="sigma" role="UNKNOWN">σ</XMTok>
                                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
                                      </XMApp>
                                      <XMTok fontsize="70%" meaning="2" role="NUMBER">2</XMTok>
                                    </XMApp>
                                    <XMTok role="CLOSE" stretchy="false">)</XMTok>
                                  </XMWrap>
                                </XMDual>
                              </XMApp>
                              <XMApp>
                                <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                                <XMTok font="italic" role="UNKNOWN">p</XMTok>
                                <XMTok font="italic" fontsize="70%" name="theta" role="UNKNOWN">θ</XMTok>
                              </XMApp>
                            </XMApp>
                            <XMDual>
                              <XMRef idref="S3.E4.m1.2.5"/>
                              <XMWrap>
                                <XMTok role="OPEN" stretchy="false">(</XMTok>
                                <XMApp xml:id="S3.E4.m1.2.5">
                                  <XMTok meaning="conditional" role="MODIFIEROP" stretchy="false">|</XMTok>
                                  <XMApp>
                                    <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                                    <XMTok font="italic" role="UNKNOWN">d</XMTok>
                                    <XMApp>
                                      <XMTok meaning="times" role="MULOP">⁢</XMTok>
                                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
                                    </XMApp>
                                  </XMApp>
                                  <XMApp>
                                    <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                                    <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
                                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
                                  </XMApp>
                                </XMApp>
                                <XMTok role="CLOSE" stretchy="false">)</XMTok>
                              </XMWrap>
                            </XMDual>
                          </XMApp>
                        </XMApp>
                        <XMTok role="CLOSE" stretchy="true">)</XMTok>
                      </XMWrap>
                    </XMDual>
                  </XMApp>
                </XMApp>
              </XMath>
            </Math>
          </equation>
          <p>for background component <Math mode="inline" tex="m_{b}" text="m _ b" xml:id="S3.SS2.SSS0.Px3.p1.m10">
              <XMath>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">m</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
                </XMApp>
              </XMath>
            </Math>, <Math mode="inline" tex="\mu_{b}" text="mu _ b" xml:id="S3.SS2.SSS0.Px3.p1.m11">
              <XMath>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" name="mu" role="UNKNOWN">μ</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
                </XMApp>
              </XMath>
            </Math>, <Math mode="inline" tex="d_{b}" text="d _ b" xml:id="S3.SS2.SSS0.Px3.p1.m12">
              <XMath>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">d</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
                </XMApp>
              </XMath>
            </Math> and object components <Math mode="inline" tex="m_{k}" text="m _ k" xml:id="S3.SS2.SSS0.Px3.p1.m13">
              <XMath>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">m</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                </XMApp>
              </XMath>
            </Math>, <Math mode="inline" tex="\mu_{k}" text="mu _ k" xml:id="S3.SS2.SSS0.Px3.p1.m14">
              <XMath>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" name="mu" role="UNKNOWN">μ</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                </XMApp>
              </XMath>
            </Math>, <Math mode="inline" tex="d_{k}" text="d _ k" xml:id="S3.SS2.SSS0.Px3.p1.m15">
              <XMath>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">d</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                </XMApp>
              </XMath>
            </Math>, where <Math mode="inline" tex="p_{\theta}(d_{ik}|{\bm{z}}_{k})=p_{\theta}(d_{ik}=m_{ik}|{\bm{z}}_{k})" xml:id="S3.SS2.SSS0.Px3.p1.m16">
              <XMath>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">p</XMTok>
                  <XMTok font="italic" fontsize="70%" name="theta" role="UNKNOWN">θ</XMTok>
                </XMApp>
                <XMWrap>
                  <XMTok role="OPEN" stretchy="false">(</XMTok>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="italic" role="UNKNOWN">d</XMTok>
                    <XMApp>
                      <XMTok meaning="times" role="MULOP">⁢</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                    </XMApp>
                  </XMApp>
                  <XMTok role="VERTBAR" stretchy="false">|</XMTok>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                  </XMApp>
                  <XMTok role="CLOSE" stretchy="false">)</XMTok>
                </XMWrap>
                <XMTok meaning="equals" role="RELOP">=</XMTok>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">p</XMTok>
                  <XMTok font="italic" fontsize="70%" name="theta" role="UNKNOWN">θ</XMTok>
                </XMApp>
                <XMWrap>
                  <XMTok role="OPEN" stretchy="false">(</XMTok>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="italic" role="UNKNOWN">d</XMTok>
                    <XMApp>
                      <XMTok meaning="times" role="MULOP">⁢</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                    </XMApp>
                  </XMApp>
                  <XMTok meaning="equals" role="RELOP">=</XMTok>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="italic" role="UNKNOWN">m</XMTok>
                    <XMApp>
                      <XMTok meaning="times" role="MULOP">⁢</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                    </XMApp>
                  </XMApp>
                  <XMTok role="VERTBAR" stretchy="false">|</XMTok>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                  </XMApp>
                  <XMTok role="CLOSE" stretchy="false">)</XMTok>
                </XMWrap>
              </XMath>
            </Math> is the probability that the decoded mask from the latent matches the ground-truth mask for the mixture. We use <Math mode="inline" tex="\sigma=0.11" text="sigma = 0.11" xml:id="S3.SS2.SSS0.Px3.p1.m17">
              <XMath>
                <XMApp>
                  <XMTok meaning="equals" role="RELOP">=</XMTok>
                  <XMTok font="italic" name="sigma" role="UNKNOWN">σ</XMTok>
                  <XMTok meaning="0.11" role="NUMBER">0.11</XMTok>
                </XMApp>
              </XMath>
            </Math> and <Math mode="inline" tex="\sigma_{b}=0.07" text="sigma _ b = 0.07" xml:id="S3.SS2.SSS0.Px3.p1.m18">
              <XMath>
                <XMApp>
                  <XMTok meaning="equals" role="RELOP">=</XMTok>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="italic" name="sigma" role="UNKNOWN">σ</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
                  </XMApp>
                  <XMTok meaning="0.07" role="NUMBER">0.07</XMTok>
                </XMApp>
              </XMath>
            </Math> to break symmetry between object and background components, encouraging the background to model the more uniform image components <cite class="ltx_citemacro_citep">(<bibref bibrefs="burgess2019monet" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
                <bibrefphrase>, </bibrefphrase>
              </bibref>)</cite>. Our overall loss encourages the decomposition of an image into a set of reusable sub-components, as well as a large background.</p>
        </para>
      </paragraph>
    </subsection>
    <subsection inlist="toc" xml:id="S3.SS3">
      <tags>
        <tag>3.3</tag>
        <tag role="autoref">subsection 3.3</tag>
        <tag role="refnum">3.3</tag>
        <tag role="typerefnum">§3.3</tag>
      </tags>
      <title><tag close=" ">3.3</tag>Training Loss</title>
      <para xml:id="S3.SS3.p1">
        <p>Our overall system is trained to maximize the likelihood of both the physical object and image generative models. Our loss consists of <Math mode="inline" tex="{\mathcal{L}}({\bm{x}}^{t})={\mathcal{L}}_{\text{Physics}}+{\mathcal{L}}_{%&#10;\text{Image}}+{\mathcal{L}}_{\text{KL}}" text="L * x ^ t = L _ [Physics] + L _ [Image] + L _ [KL]" xml:id="S3.SS3.p1.m1">
            <XMath>
              <XMApp>
                <XMTok meaning="equals" role="RELOP">=</XMTok>
                <XMApp>
                  <XMTok meaning="times" role="MULOP">⁢</XMTok>
                  <XMTok font="caligraphic" role="UNKNOWN">L</XMTok>
                  <XMDual>
                    <XMRef idref="S3.SS3.p1.m1.1"/>
                    <XMWrap>
                      <XMTok role="OPEN" stretchy="false">(</XMTok>
                      <XMApp xml:id="S3.SS3.p1.m1.1">
                        <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="bold italic" role="UNKNOWN">x</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                      </XMApp>
                      <XMTok role="CLOSE" stretchy="false">)</XMTok>
                    </XMWrap>
                  </XMDual>
                </XMApp>
                <XMApp>
                  <XMTok meaning="plus" role="ADDOP">+</XMTok>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="caligraphic" role="UNKNOWN">L</XMTok>
                    <XMText><text fontsize="70%">Physics</text></XMText>
                  </XMApp>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="caligraphic" role="UNKNOWN">L</XMTok>
                    <XMText><text fontsize="70%">Image</text></XMText>
                  </XMApp>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="caligraphic" role="UNKNOWN">L</XMTok>
                    <XMText><text fontsize="70%">KL</text></XMText>
                  </XMApp>
                </XMApp>
              </XMApp>
            </XMath>
          </Math>, maximizing the likelihood of physical dynamics, images, and the variational bound. Our image loss is defined to be <Math mode="inline" tex="{\mathcal{L}}_{\text{Image}}=-\log\left(p({\bm{x}}^{t}|{\bm{z}})\right)" text="L _ [Image] = - logarithm@(p * conditional@(x ^ t, z))" xml:id="S3.SS3.p1.m2">
            <XMath>
              <XMApp>
                <XMTok meaning="equals" role="RELOP">=</XMTok>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="caligraphic" role="UNKNOWN">L</XMTok>
                  <XMText><text fontsize="70%">Image</text></XMText>
                </XMApp>
                <XMApp>
                  <XMTok meaning="minus" role="ADDOP">-</XMTok>
                  <XMDual>
                    <XMApp>
                      <XMRef idref="S3.SS3.p1.m2.1"/>
                      <XMRef idref="S3.SS3.p1.m2.2"/>
                    </XMApp>
                    <XMApp>
                      <XMTok meaning="logarithm" role="OPFUNCTION" xml:id="S3.SS3.p1.m2.1">log</XMTok>
                      <XMWrap>
                        <XMTok role="OPEN" stretchy="true">(</XMTok>
                        <XMApp xml:id="S3.SS3.p1.m2.2">
                          <XMTok meaning="times" role="MULOP">⁢</XMTok>
                          <XMTok font="italic" role="UNKNOWN">p</XMTok>
                          <XMDual>
                            <XMRef idref="S3.SS3.p1.m2.2.1"/>
                            <XMWrap>
                              <XMTok role="OPEN" stretchy="false">(</XMTok>
                              <XMApp xml:id="S3.SS3.p1.m2.2.1">
                                <XMTok meaning="conditional" role="MODIFIEROP" stretchy="false">|</XMTok>
                                <XMApp>
                                  <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                                  <XMTok font="bold italic" role="UNKNOWN">x</XMTok>
                                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                                </XMApp>
                                <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
                              </XMApp>
                              <XMTok role="CLOSE" stretchy="false">)</XMTok>
                            </XMWrap>
                          </XMDual>
                        </XMApp>
                        <XMTok role="CLOSE" stretchy="true">)</XMTok>
                      </XMWrap>
                    </XMApp>
                  </XMDual>
                </XMApp>
              </XMApp>
            </XMath>
          </Math>, based on Eqn. <ref labelref="LABEL:eqn:img"/>. Our physics loss is defined to be <Math mode="inline" tex="{\mathcal{L}}_{\text{Physics}}=-\sum_{k=1}^{K}\log p\left({\bm{t}}_{k}^{t},{%&#10;\bm{s}}_{k}^{t},{\bm{q}}_{k}^{t},{\bm{m}}_{k}^{t}\right)" text="L _ [Physics] = - ((sum _ (k = 1)) ^ K)@(logarithm@(p) * vector@((t _ k) ^ t, (s _ k) ^ t, (q _ k) ^ t, (m _ k) ^ t))" xml:id="S3.SS3.p1.m3">
            <XMath>
              <XMApp>
                <XMTok meaning="equals" role="RELOP">=</XMTok>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="caligraphic" role="UNKNOWN">L</XMTok>
                  <XMText><text fontsize="70%">Physics</text></XMText>
                </XMApp>
                <XMApp>
                  <XMTok meaning="minus" role="ADDOP">-</XMTok>
                  <XMApp>
                    <XMApp>
                      <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok mathstyle="text" meaning="sum" role="SUMOP" scriptpos="post">∑</XMTok>
                        <XMApp>
                          <XMTok fontsize="70%" meaning="equals" role="RELOP">=</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                          <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                        </XMApp>
                      </XMApp>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">K</XMTok>
                    </XMApp>
                    <XMApp>
                      <XMTok meaning="times" role="MULOP">⁢</XMTok>
                      <XMApp>
                        <XMTok meaning="logarithm" role="OPFUNCTION">log</XMTok>
                        <XMTok font="italic" role="UNKNOWN">p</XMTok>
                      </XMApp>
                      <XMDual>
                        <XMApp>
                          <XMTok meaning="vector"/>
                          <XMRef idref="S3.SS3.p1.m3.1"/>
                          <XMRef idref="S3.SS3.p1.m3.2"/>
                          <XMRef idref="S3.SS3.p1.m3.3"/>
                          <XMRef idref="S3.SS3.p1.m3.4"/>
                        </XMApp>
                        <XMWrap>
                          <XMTok role="OPEN" stretchy="true">(</XMTok>
                          <XMApp xml:id="S3.SS3.p1.m3.1">
                            <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                              <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                            </XMApp>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                          </XMApp>
                          <XMTok role="PUNCT">,</XMTok>
                          <XMApp xml:id="S3.SS3.p1.m3.2">
                            <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                              <XMTok font="bold italic" role="UNKNOWN">s</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                            </XMApp>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                          </XMApp>
                          <XMTok role="PUNCT">,</XMTok>
                          <XMApp xml:id="S3.SS3.p1.m3.3">
                            <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                              <XMTok font="bold italic" role="UNKNOWN">q</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                            </XMApp>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                          </XMApp>
                          <XMTok role="PUNCT">,</XMTok>
                          <XMApp xml:id="S3.SS3.p1.m3.4">
                            <XMTok role="SUPERSCRIPTOP" scriptpos="post2"/>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post2"/>
                              <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                            </XMApp>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                          </XMApp>
                          <XMTok role="CLOSE" stretchy="true">)</XMTok>
                        </XMWrap>
                      </XMDual>
                    </XMApp>
                  </XMApp>
                </XMApp>
              </XMApp>
            </XMath>
          </Math>, based on Eqn. <ref labelref="LABEL:eqn:physics"/>, which enforces that decoded primitives are physically consistent. The KL loss is</p>
        <equationgroup xml:id="S3.E5">
          <tags>
            <tag>(5)</tag>
            <tag role="autoref">Equation 5</tag>
            <tag role="refnum">5</tag>
          </tags>
          <equation xml:id="S3.E5X">
            <MathFork>
              <Math tex="\displaystyle{\mathcal{L}}_{\text{KL}}=\beta\left(\sum_{k=1}^{K}{\text{KL}(%&#10;\text{Encode}({\bm{z}}_{k}^{t}|{\bm{x}}^{t},{\bm{m}}_{k}^{t})\;||\;p(z))}+%&#10;\text{KL}(\text{Encode}({\bm{z}}_{b}^{t}|{\bm{x}}^{t},{\bm{m}}_{b}^{t})\;||\;p%&#10;(z))\right)+" xml:id="S3.E5X.m1">
                <XMath>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post7"/>
                    <XMTok font="caligraphic" role="UNKNOWN">L</XMTok>
                    <XMText><text fontsize="70%">KL</text></XMText>
                  </XMApp>
                  <XMTok meaning="equals" role="RELOP">=</XMTok>
                  <XMTok font="italic" name="beta" role="UNKNOWN">β</XMTok>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="true">(</XMTok>
                    <XMApp scriptpos="mid">
                      <XMTok role="SUPERSCRIPTOP" scriptpos="mid8"/>
                      <XMApp scriptpos="mid">
                        <XMTok role="SUBSCRIPTOP" scriptpos="mid8"/>
                        <XMTok mathstyle="display" meaning="sum" role="SUMOP" scriptpos="mid">∑</XMTok>
                        <XMApp>
                          <XMTok fontsize="70%" meaning="equals" role="RELOP">=</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                          <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                        </XMApp>
                      </XMApp>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">K</XMTok>
                    </XMApp>
                    <XMText>KL</XMText>
                    <XMWrap>
                      <XMTok role="OPEN" stretchy="false">(</XMTok>
                      <XMText>Encode</XMText>
                      <XMWrap>
                        <XMTok role="OPEN" stretchy="false">(</XMTok>
                        <XMApp>
                          <XMTok role="SUPERSCRIPTOP" scriptpos="post9"/>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post9"/>
                            <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                          </XMApp>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                        </XMApp>
                        <XMTok role="VERTBAR" stretchy="false">|</XMTok>
                        <XMApp>
                          <XMTok role="SUPERSCRIPTOP" scriptpos="post9"/>
                          <XMTok font="bold italic" role="UNKNOWN">x</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                        </XMApp>
                        <XMTok role="PUNCT">,</XMTok>
                        <XMApp>
                          <XMTok role="SUPERSCRIPTOP" scriptpos="post9"/>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post9"/>
                            <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                          </XMApp>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                        </XMApp>
                        <XMTok role="CLOSE" rpadding="2.8pt" stretchy="false">)</XMTok>
                      </XMWrap>
                      <XMTok role="VERTBAR" stretchy="false">|</XMTok>
                      <XMTok role="VERTBAR" rpadding="2.8pt" stretchy="false">|</XMTok>
                      <XMTok font="italic" role="UNKNOWN">p</XMTok>
                      <XMWrap>
                        <XMTok role="OPEN" stretchy="false">(</XMTok>
                        <XMTok font="italic" role="UNKNOWN" xml:id="S3.E5X.m1.1">z</XMTok>
                        <XMTok role="CLOSE" stretchy="false">)</XMTok>
                      </XMWrap>
                      <XMTok role="CLOSE" stretchy="false">)</XMTok>
                    </XMWrap>
                    <XMTok meaning="plus" role="ADDOP">+</XMTok>
                    <XMText>KL</XMText>
                    <XMWrap>
                      <XMTok role="OPEN" stretchy="false">(</XMTok>
                      <XMText>Encode</XMText>
                      <XMWrap>
                        <XMTok role="OPEN" stretchy="false">(</XMTok>
                        <XMApp>
                          <XMTok role="SUPERSCRIPTOP" scriptpos="post8"/>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                            <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
                          </XMApp>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                        </XMApp>
                        <XMTok role="VERTBAR" stretchy="false">|</XMTok>
                        <XMApp>
                          <XMTok role="SUPERSCRIPTOP" scriptpos="post8"/>
                          <XMTok font="bold italic" role="UNKNOWN">x</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                        </XMApp>
                        <XMTok role="PUNCT">,</XMTok>
                        <XMApp>
                          <XMTok role="SUPERSCRIPTOP" scriptpos="post8"/>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                            <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                            <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
                          </XMApp>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                        </XMApp>
                        <XMTok role="CLOSE" rpadding="2.8pt" stretchy="false">)</XMTok>
                      </XMWrap>
                      <XMTok role="VERTBAR" stretchy="false">|</XMTok>
                      <XMTok role="VERTBAR" rpadding="2.8pt" stretchy="false">|</XMTok>
                      <XMTok font="italic" role="UNKNOWN">p</XMTok>
                      <XMWrap>
                        <XMTok role="OPEN" stretchy="false">(</XMTok>
                        <XMTok font="italic" role="UNKNOWN">z</XMTok>
                        <XMTok role="CLOSE" stretchy="false">)</XMTok>
                      </XMWrap>
                      <XMTok role="CLOSE" stretchy="false">)</XMTok>
                    </XMWrap>
                    <XMTok role="CLOSE" stretchy="true">)</XMTok>
                  </XMWrap>
                  <XMTok meaning="plus" role="ADDOP">+</XMTok>
                </XMath>
              </Math>
              <MathBranch>
                <td align="right"><Math tex="\displaystyle{\mathcal{L}}_{\text{KL}}=" text="L _ [KL] = absent" xml:id="S3.E5X.m2">
                    <XMath>
                      <XMApp>
                        <XMTok meaning="equals" role="RELOP">=</XMTok>
                        <XMApp>
                          <XMTok role="SUBSCRIPTOP" scriptpos="post7"/>
                          <XMTok font="caligraphic" role="UNKNOWN">L</XMTok>
                          <XMText><text fontsize="70%">KL</text></XMText>
                        </XMApp>
                        <XMTok meaning="absent"/>
                      </XMApp>
                    </XMath>
                  </Math></td>
                <td align="left"><Math tex="\displaystyle\beta\left(\sum_{k=1}^{K}{\text{KL}(\text{Encode}({\bm{z}}_{k}^{t%&#10;}|{\bm{x}}^{t},{\bm{m}}_{k}^{t})\;||\;p(z))}+\text{KL}(\text{Encode}({\bm{z}}_%&#10;{b}^{t}|{\bm{x}}^{t},{\bm{m}}_{b}^{t})\;||\;p(z))\right)+" xml:id="S3.E5X.m3">
                    <XMath>
                      <XMTok font="italic" name="beta" role="UNKNOWN">β</XMTok>
                      <XMWrap>
                        <XMTok role="OPEN" stretchy="true">(</XMTok>
                        <XMApp scriptpos="mid">
                          <XMTok role="SUPERSCRIPTOP" scriptpos="mid8"/>
                          <XMApp scriptpos="mid">
                            <XMTok role="SUBSCRIPTOP" scriptpos="mid8"/>
                            <XMTok mathstyle="display" meaning="sum" role="SUMOP" scriptpos="mid">∑</XMTok>
                            <XMApp>
                              <XMTok fontsize="70%" meaning="equals" role="RELOP">=</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                              <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                            </XMApp>
                          </XMApp>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">K</XMTok>
                        </XMApp>
                        <XMText>KL</XMText>
                        <XMWrap>
                          <XMTok role="OPEN" stretchy="false">(</XMTok>
                          <XMText>Encode</XMText>
                          <XMWrap>
                            <XMTok role="OPEN" stretchy="false">(</XMTok>
                            <XMApp>
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post9"/>
                              <XMApp>
                                <XMTok role="SUBSCRIPTOP" scriptpos="post9"/>
                                <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                              </XMApp>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMTok role="VERTBAR" stretchy="false">|</XMTok>
                            <XMApp>
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post9"/>
                              <XMTok font="bold italic" role="UNKNOWN">x</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMTok role="PUNCT">,</XMTok>
                            <XMApp>
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post9"/>
                              <XMApp>
                                <XMTok role="SUBSCRIPTOP" scriptpos="post9"/>
                                <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                              </XMApp>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMTok role="CLOSE" rpadding="2.8pt" stretchy="false">)</XMTok>
                          </XMWrap>
                          <XMTok role="VERTBAR" stretchy="false">|</XMTok>
                          <XMTok role="VERTBAR" rpadding="2.8pt" stretchy="false">|</XMTok>
                          <XMTok font="italic" role="UNKNOWN">p</XMTok>
                          <XMWrap>
                            <XMTok role="OPEN" stretchy="false">(</XMTok>
                            <XMTok font="italic" role="UNKNOWN" xml:id="S3.E5X.m3.1">z</XMTok>
                            <XMTok role="CLOSE" stretchy="false">)</XMTok>
                          </XMWrap>
                          <XMTok role="CLOSE" stretchy="false">)</XMTok>
                        </XMWrap>
                        <XMTok meaning="plus" role="ADDOP">+</XMTok>
                        <XMText>KL</XMText>
                        <XMWrap>
                          <XMTok role="OPEN" stretchy="false">(</XMTok>
                          <XMText>Encode</XMText>
                          <XMWrap>
                            <XMTok role="OPEN" stretchy="false">(</XMTok>
                            <XMApp>
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post8"/>
                              <XMApp>
                                <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                                <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
                              </XMApp>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMTok role="VERTBAR" stretchy="false">|</XMTok>
                            <XMApp>
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post8"/>
                              <XMTok font="bold italic" role="UNKNOWN">x</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMTok role="PUNCT">,</XMTok>
                            <XMApp>
                              <XMTok role="SUPERSCRIPTOP" scriptpos="post8"/>
                              <XMApp>
                                <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                                <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                                <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
                              </XMApp>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">t</XMTok>
                            </XMApp>
                            <XMTok role="CLOSE" rpadding="2.8pt" stretchy="false">)</XMTok>
                          </XMWrap>
                          <XMTok role="VERTBAR" stretchy="false">|</XMTok>
                          <XMTok role="VERTBAR" rpadding="2.8pt" stretchy="false">|</XMTok>
                          <XMTok font="italic" role="UNKNOWN">p</XMTok>
                          <XMWrap>
                            <XMTok role="OPEN" stretchy="false">(</XMTok>
                            <XMTok font="italic" role="UNKNOWN">z</XMTok>
                            <XMTok role="CLOSE" stretchy="false">)</XMTok>
                          </XMWrap>
                          <XMTok role="CLOSE" stretchy="false">)</XMTok>
                        </XMWrap>
                        <XMTok role="CLOSE" stretchy="true">)</XMTok>
                      </XMWrap>
                      <XMTok meaning="plus" role="ADDOP">+</XMTok>
                    </XMath>
                  </Math></td>
              </MathBranch>
            </MathFork>
          </equation>
          <equation xml:id="S3.E5Xa">
            <MathFork>
              <Math tex="\displaystyle\gamma\left(\sum_{k=1}^{K}\text{KL}(q_{\psi}({\bm{d}}_{k}|{\bm{x}%&#10;})\;||\;p_{\theta}({\bm{d}}_{k}|{\bm{z}}_{k}))+\text{KL}(q_{\psi}({\bm{d}}_{b}%&#10;|{\bm{x}})\;||\;p_{\theta}({\bm{d}}_{b}|{\bm{z}}_{b}))\right)," xml:id="S3.E5Xa.m1">
                <XMath>
                  <XMTok font="italic" name="gamma" role="UNKNOWN">γ</XMTok>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="true">(</XMTok>
                    <XMApp scriptpos="mid">
                      <XMTok role="SUPERSCRIPTOP" scriptpos="mid8"/>
                      <XMApp scriptpos="mid">
                        <XMTok role="SUBSCRIPTOP" scriptpos="mid8"/>
                        <XMTok mathstyle="display" meaning="sum" role="SUMOP" scriptpos="mid">∑</XMTok>
                        <XMApp>
                          <XMTok fontsize="70%" meaning="equals" role="RELOP">=</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                          <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                        </XMApp>
                      </XMApp>
                      <XMTok font="italic" fontsize="70%" role="UNKNOWN">K</XMTok>
                    </XMApp>
                    <XMText>KL</XMText>
                    <XMWrap>
                      <XMTok role="OPEN" stretchy="false">(</XMTok>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                        <XMTok font="italic" role="UNKNOWN">q</XMTok>
                        <XMTok font="italic" fontsize="70%" name="psi" role="UNKNOWN">ψ</XMTok>
                      </XMApp>
                      <XMWrap>
                        <XMTok role="OPEN" stretchy="false">(</XMTok>
                        <XMApp>
                          <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                          <XMTok font="bold italic" role="UNKNOWN">d</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                        </XMApp>
                        <XMTok role="VERTBAR" stretchy="false">|</XMTok>
                        <XMTok font="bold italic" role="UNKNOWN">x</XMTok>
                        <XMTok role="CLOSE" rpadding="2.8pt" stretchy="false">)</XMTok>
                      </XMWrap>
                      <XMTok role="VERTBAR" stretchy="false">|</XMTok>
                      <XMTok role="VERTBAR" rpadding="2.8pt" stretchy="false">|</XMTok>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                        <XMTok font="italic" role="UNKNOWN">p</XMTok>
                        <XMTok font="italic" fontsize="70%" name="theta" role="UNKNOWN">θ</XMTok>
                      </XMApp>
                      <XMWrap>
                        <XMTok role="OPEN" stretchy="false">(</XMTok>
                        <XMApp>
                          <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                          <XMTok font="bold italic" role="UNKNOWN">d</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                        </XMApp>
                        <XMTok role="VERTBAR" stretchy="false">|</XMTok>
                        <XMApp>
                          <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                          <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                        </XMApp>
                        <XMTok role="CLOSE" stretchy="false">)</XMTok>
                      </XMWrap>
                      <XMTok role="CLOSE" stretchy="false">)</XMTok>
                    </XMWrap>
                    <XMTok meaning="plus" role="ADDOP">+</XMTok>
                    <XMText>KL</XMText>
                    <XMWrap>
                      <XMTok role="OPEN" stretchy="false">(</XMTok>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                        <XMTok font="italic" role="UNKNOWN">q</XMTok>
                        <XMTok font="italic" fontsize="70%" name="psi" role="UNKNOWN">ψ</XMTok>
                      </XMApp>
                      <XMWrap>
                        <XMTok role="OPEN" stretchy="false">(</XMTok>
                        <XMApp>
                          <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                          <XMTok font="bold italic" role="UNKNOWN">d</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
                        </XMApp>
                        <XMTok role="VERTBAR" stretchy="false">|</XMTok>
                        <XMTok font="bold italic" role="UNKNOWN">x</XMTok>
                        <XMTok role="CLOSE" rpadding="2.8pt" stretchy="false">)</XMTok>
                      </XMWrap>
                      <XMTok role="VERTBAR" stretchy="false">|</XMTok>
                      <XMTok role="VERTBAR" rpadding="2.8pt" stretchy="false">|</XMTok>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                        <XMTok font="italic" role="UNKNOWN">p</XMTok>
                        <XMTok font="italic" fontsize="70%" name="theta" role="UNKNOWN">θ</XMTok>
                      </XMApp>
                      <XMWrap>
                        <XMTok role="OPEN" stretchy="false">(</XMTok>
                        <XMApp>
                          <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                          <XMTok font="bold italic" role="UNKNOWN">d</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
                        </XMApp>
                        <XMTok role="VERTBAR" stretchy="false">|</XMTok>
                        <XMApp>
                          <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                          <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
                        </XMApp>
                        <XMTok role="CLOSE" stretchy="false">)</XMTok>
                      </XMWrap>
                      <XMTok role="CLOSE" stretchy="false">)</XMTok>
                    </XMWrap>
                    <XMTok role="CLOSE" stretchy="true">)</XMTok>
                  </XMWrap>
                  <XMTok role="PUNCT">,</XMTok>
                </XMath>
              </Math>
              <MathBranch>
                <td/>
                <td align="left"><Math tex="\displaystyle\gamma\left(\sum_{k=1}^{K}\text{KL}(q_{\psi}({\bm{d}}_{k}|{\bm{x}%&#10;})\;||\;p_{\theta}({\bm{d}}_{k}|{\bm{z}}_{k}))+\text{KL}(q_{\psi}({\bm{d}}_{b}%&#10;|{\bm{x}})\;||\;p_{\theta}({\bm{d}}_{b}|{\bm{z}}_{b}))\right)," xml:id="S3.E5Xa.m2">
                    <XMath>
                      <XMTok font="italic" name="gamma" role="UNKNOWN">γ</XMTok>
                      <XMWrap>
                        <XMTok role="OPEN" stretchy="true">(</XMTok>
                        <XMApp scriptpos="mid">
                          <XMTok role="SUPERSCRIPTOP" scriptpos="mid8"/>
                          <XMApp scriptpos="mid">
                            <XMTok role="SUBSCRIPTOP" scriptpos="mid8"/>
                            <XMTok mathstyle="display" meaning="sum" role="SUMOP" scriptpos="mid">∑</XMTok>
                            <XMApp>
                              <XMTok fontsize="70%" meaning="equals" role="RELOP">=</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                              <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
                            </XMApp>
                          </XMApp>
                          <XMTok font="italic" fontsize="70%" role="UNKNOWN">K</XMTok>
                        </XMApp>
                        <XMText>KL</XMText>
                        <XMWrap>
                          <XMTok role="OPEN" stretchy="false">(</XMTok>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                            <XMTok font="italic" role="UNKNOWN">q</XMTok>
                            <XMTok font="italic" fontsize="70%" name="psi" role="UNKNOWN">ψ</XMTok>
                          </XMApp>
                          <XMWrap>
                            <XMTok role="OPEN" stretchy="false">(</XMTok>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                              <XMTok font="bold italic" role="UNKNOWN">d</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                            </XMApp>
                            <XMTok role="VERTBAR" stretchy="false">|</XMTok>
                            <XMTok font="bold italic" role="UNKNOWN">x</XMTok>
                            <XMTok role="CLOSE" rpadding="2.8pt" stretchy="false">)</XMTok>
                          </XMWrap>
                          <XMTok role="VERTBAR" stretchy="false">|</XMTok>
                          <XMTok role="VERTBAR" rpadding="2.8pt" stretchy="false">|</XMTok>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                            <XMTok font="italic" role="UNKNOWN">p</XMTok>
                            <XMTok font="italic" fontsize="70%" name="theta" role="UNKNOWN">θ</XMTok>
                          </XMApp>
                          <XMWrap>
                            <XMTok role="OPEN" stretchy="false">(</XMTok>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                              <XMTok font="bold italic" role="UNKNOWN">d</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                            </XMApp>
                            <XMTok role="VERTBAR" stretchy="false">|</XMTok>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                              <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                            </XMApp>
                            <XMTok role="CLOSE" stretchy="false">)</XMTok>
                          </XMWrap>
                          <XMTok role="CLOSE" stretchy="false">)</XMTok>
                        </XMWrap>
                        <XMTok meaning="plus" role="ADDOP">+</XMTok>
                        <XMText>KL</XMText>
                        <XMWrap>
                          <XMTok role="OPEN" stretchy="false">(</XMTok>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                            <XMTok font="italic" role="UNKNOWN">q</XMTok>
                            <XMTok font="italic" fontsize="70%" name="psi" role="UNKNOWN">ψ</XMTok>
                          </XMApp>
                          <XMWrap>
                            <XMTok role="OPEN" stretchy="false">(</XMTok>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                              <XMTok font="bold italic" role="UNKNOWN">d</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
                            </XMApp>
                            <XMTok role="VERTBAR" stretchy="false">|</XMTok>
                            <XMTok font="bold italic" role="UNKNOWN">x</XMTok>
                            <XMTok role="CLOSE" rpadding="2.8pt" stretchy="false">)</XMTok>
                          </XMWrap>
                          <XMTok role="VERTBAR" stretchy="false">|</XMTok>
                          <XMTok role="VERTBAR" rpadding="2.8pt" stretchy="false">|</XMTok>
                          <XMApp>
                            <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                            <XMTok font="italic" role="UNKNOWN">p</XMTok>
                            <XMTok font="italic" fontsize="70%" name="theta" role="UNKNOWN">θ</XMTok>
                          </XMApp>
                          <XMWrap>
                            <XMTok role="OPEN" stretchy="false">(</XMTok>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                              <XMTok font="bold italic" role="UNKNOWN">d</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
                            </XMApp>
                            <XMTok role="VERTBAR" stretchy="false">|</XMTok>
                            <XMApp>
                              <XMTok role="SUBSCRIPTOP" scriptpos="post8"/>
                              <XMTok font="bold italic" role="UNKNOWN">z</XMTok>
                              <XMTok font="italic" fontsize="70%" role="UNKNOWN">b</XMTok>
                            </XMApp>
                            <XMTok role="CLOSE" stretchy="false">)</XMTok>
                          </XMWrap>
                          <XMTok role="CLOSE" stretchy="false">)</XMTok>
                        </XMWrap>
                        <XMTok role="CLOSE" stretchy="true">)</XMTok>
                      </XMWrap>
                      <XMTok role="PUNCT">,</XMTok>
                    </XMath>
                  </Math></td>
              </MathBranch>
            </MathFork>
          </equation>
        </equationgroup>
        <p>enforcing the variational lower bound on likelihood <cite class="ltx_citemacro_citep">(<bibref bibrefs="Kingma2014Semi" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>, where for brevity, we use <Math mode="inline" tex="q_{\psi}({\bm{d}}_{k}|{\bm{x}})" text="q _ psi * conditional@(d _ k, x)" xml:id="S3.SS3.p1.m4">
            <XMath>
              <XMApp>
                <XMTok meaning="times" role="MULOP">⁢</XMTok>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">q</XMTok>
                  <XMTok font="italic" fontsize="70%" name="psi" role="UNKNOWN">ψ</XMTok>
                </XMApp>
                <XMDual>
                  <XMRef idref="S3.SS3.p1.m4.1"/>
                  <XMWrap>
                    <XMTok role="OPEN" stretchy="false">(</XMTok>
                    <XMApp xml:id="S3.SS3.p1.m4.1">
                      <XMTok meaning="conditional" role="MODIFIEROP" stretchy="false">|</XMTok>
                      <XMApp>
                        <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                        <XMTok font="bold italic" role="UNKNOWN">d</XMTok>
                        <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                      </XMApp>
                      <XMTok font="bold italic" role="UNKNOWN">x</XMTok>
                    </XMApp>
                    <XMTok role="CLOSE" stretchy="false">)</XMTok>
                  </XMWrap>
                </XMDual>
              </XMApp>
            </XMath>
          </Math> to represent the mask generation process in Section <ref labelref="LABEL:sect:inference"/>, and <Math mode="inline" tex="p(z)={\mathcal{N}}(0,1)" text="p * z = N * open-interval@(0, 1)" xml:id="S3.SS3.p1.m5">
            <XMath>
              <XMApp>
                <XMTok meaning="equals" role="RELOP">=</XMTok>
                <XMApp>
                  <XMTok meaning="times" role="MULOP">⁢</XMTok>
                  <XMTok font="italic" role="UNKNOWN">p</XMTok>
                  <XMDual>
                    <XMRef idref="S3.SS3.p1.m5.1"/>
                    <XMWrap>
                      <XMTok role="OPEN" stretchy="false">(</XMTok>
                      <XMTok font="italic" role="UNKNOWN" xml:id="S3.SS3.p1.m5.1">z</XMTok>
                      <XMTok role="CLOSE" stretchy="false">)</XMTok>
                    </XMWrap>
                  </XMDual>
                </XMApp>
                <XMApp>
                  <XMTok meaning="times" role="MULOP">⁢</XMTok>
                  <XMTok font="caligraphic" role="UNKNOWN">N</XMTok>
                  <XMDual>
                    <XMApp>
                      <XMTok meaning="open-interval"/>
                      <XMRef idref="S3.SS3.p1.m5.2"/>
                      <XMRef idref="S3.SS3.p1.m5.3"/>
                    </XMApp>
                    <XMWrap>
                      <XMTok role="OPEN" stretchy="false">(</XMTok>
                      <XMTok meaning="0" role="NUMBER" xml:id="S3.SS3.p1.m5.2">0</XMTok>
                      <XMTok role="PUNCT">,</XMTok>
                      <XMTok meaning="1" role="NUMBER" xml:id="S3.SS3.p1.m5.3">1</XMTok>
                      <XMTok role="CLOSE" stretchy="false">)</XMTok>
                    </XMWrap>
                  </XMDual>
                </XMApp>
              </XMApp>
            </XMath>
          </Math> is the prior.
<!--  %for␣latents␣inferred␣on␣both␣background␣and␣foreground␣components. 
     %\gL(\theta,␣\psi,␣\phi,␣\vx^t)␣&amp;=␣-␣\log␣\sum_{k=1}^K␣(\vm_k^t␣p_\theta(\vx^t|\vz_k^t)␣+␣p(\vc_k^t␣|␣\vm_k^t))␣␣\\
     %&amp;+␣\beta␣\sum_{k=1}^K␣\kld{q_\phi(\vz_k^t␣|\vx^t,␣\vm_k^t)}{p(z)})␣\\
     %&amp;␣-␣\log(m_b^t␣p_\theta(\vx^t|\vz_b^t)␣+␣␣p(\vc_b^t|␣\vm_b^t)␣\\
     %&amp;+␣\beta␣\kld{q_\phi(\vz_b^t␣|\vx^t,␣\vm_b^t)}{p(z)})␣\\
     %&amp;+␣\sum_{k=1}^K␣L_{\text{phys}}␣(\vm_k^t|\vm_k^{t-1},␣\ldots,␣\vm_k^{0}).
     %\end{align*}--></p>
      </para>
      <para xml:id="S3.SS3.p2">
        <p>Our training paradigm consists of two different steps. We first maximize the likelihood of the model under the image generation objective. After qualitatively observing object-like masks (roughly after 100,000 iterations), we switch to maximizing the likelihood of the model under both the generation and physical plausibility objectives. Alternatively, we found that switching at loss convergence also worked well. We find that enforcing physical consistency during early stages of training is detrimental, as the model has not discovered object-like primitives yet. We use the RMSprop optimizer with a learning rate of <Math mode="inline" tex="10^{-4}" text="10 ^ (- 4)" xml:id="S3.SS3.p2.m1">
            <XMath>
              <XMApp>
                <XMTok role="SUPERSCRIPTOP" scriptpos="post1"/>
                <XMTok meaning="10" role="NUMBER">10</XMTok>
                <XMApp>
                  <XMTok fontsize="70%" meaning="minus" role="ADDOP">-</XMTok>
                  <XMTok fontsize="70%" meaning="4" role="NUMBER">4</XMTok>
                </XMApp>
              </XMApp>
            </XMath>
          </Math> within the PyTorch framework <cite class="ltx_citemacro_citep">(<bibref bibrefs="pytorch" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite> to train our models.
<!--  %****␣method.tex␣Line␣100␣**** --></p>
      </para>
<!--  %This␣is␣done␣throu 
     %Let␣$\rvx$␣represent␣a␣video␣of␣moving␣objects␣in␣a␣physical␣scene,␣consisting␣of␣a␣set␣of␣images␣$x^{1:T}$␣for␣each␣time-step␣$1$␣through␣$T$.␣In␣each␣particular␣time␣step␣$t$,␣there␣exists␣a␣set␣of␣different␣object␣states␣$z_{1:K}^t$,␣such␣that␣the␣conditional␣distribution␣of␣each␣object␣state␣$\Phi(z_i^{t+1}|z_0^t,␣\ldots,␣z_K^t)$␣at␣the␣next␣time␣step␣follows␣a␣transition␣function␣that␣approximates␣the␣laws␣of␣physics␣$\Phi$.
     %In␣our␣work,␣we␣seek␣to␣learn␣to␣infer␣the␣underlying␣object␣states␣at␣time␣step␣$t$,␣$z_{1:K}^{t}$,␣given␣only␣image␣observations␣at␣the␣corresponding␣time␣step␣$t$,␣$x^{t}$.
     %To␣accomplish␣this␣goal,␣we␣propose␣\model,␣which␣is␣comprised␣of␣three␣parts:␣(i)␣a␣\textit{generative␣model}␣($p_\theta␣(x,␣Z)$)␣from␣a␣Gaussian␣Mixture␣Model␣(GMM)␣that␣learns␣to␣represent␣an␣image␣as␣a␣mixture␣of␣latent␣components␣$Z$,␣(ii)␣a␣\textit{projection␣model}␣($g(z_i␣|␣Z_i)$)␣that␣transforms␣these␣latent␣components␣into␣an␣explicit␣world␣state,␣and␣(iii)␣a␣\textit{physics␣model}␣($\Phi$)␣that␣predicts␣future␣observations␣as␣a␣way␣to␣regularize␣the␣object␣detection␣to␣be␣consistent␣with␣the␣world␣dynamics.
     %For␣generality␣in␣object␣representations,␣we␣treat␣each␣individual␣object␣state␣as␣a␣cuboid␣with␣scale␣parameter␣$s_i$,␣translation␣parameter␣$t_i$,␣rotations␣parameter␣$q_i$,␣and␣associated␣segmentation␣mask␣$m_i$.␣We␣cast␣the␣learning␣problem␣as␣maximum␣likelihood␣inference␣problem,␣where␣we␣seek␣to␣learn␣a␣latent␣generative␣model␣$p_\theta(x,␣Z)$␣to␣jointly␣maximize␣the␣evidence␣lower␣bound␣and␣plausibility␣of␣physical␣interactions␣given␣by
     %\begin{equation}
     %\begin{split}
     %\sum_i␣\log␣\Phi(g(p(Z_i|x^t))␣|␣z_{1..K}^{1..t-1})␣dZ)␣\\
     %+␣\sum_t␣(\log␣(\int␣p_\theta(x^{t},␣Z^t)␣dZ)
     %\end{split}
     %\end{equation}
     %%where␣$\Phi$␣denotes␣a␣conditional␣likelihood␣distribution␣of␣object␣states␣under␣physics␣(as␣detailed␣in␣\sect{sect:physics}),␣and␣$g$␣is␣a␣mapping␣from␣underlying␣latent␣states␣$Z$␣to␣explicit␣object␣states␣$z$.
     %Since␣the␣above␣expression␣is␣intractable,␣we␣instead␣optimize␣the␣lower␣bound␣on␣the␣likelihood
     %\begin{equation}
     %\sum_t␣\left(\E_{q(z|x^t)}[p_\theta(x^t|z)]␣+␣\sum_i␣\log␣\Phi\left(f(p(Z|x^t))|␣z_{1..K}^{1..t-1}\right)␣dZ\right)
     %\end{equation}
     %using␣a␣fixed␣projection␣operator␣$f$␣(\sect{sect:projection})␣which␣approximates␣$g$␣to␣map␣from␣latent␣states␣$Z$␣to␣object␣states␣$z$.␣Such␣a␣model␣can␣be␣either␣a␣pretrained␣network␣or␣a␣differentiable␣back-projection␣module␣(from␣2D␣to␣3D).
     %%,␣which␣maps␣decoded␣latent␣states␣␣to␣explicit␣object␣states␣states␣and␣variational␣approximation␣to␣approximate␣marginalization␣over␣$z$.%␣We␣detail␣our␣generative␣model␣in␣\sect{sect:inference}.
     %\subsection{Generative␣Model}
     %****␣method.tex␣Line␣125␣****
     %\label{sect:inference}
     %For␣our␣generative␣model,␣we␣represent␣the␣likelihood␣of␣an␣image␣$x^t$␣as␣the␣composition␣of␣a␣series␣of␣$S$␣overlapping␣sub-patches␣$s_i$,␣so␣that
     %\begin{equation}
     %p(x^t)␣=␣␣\prod_{i=1}^S␣p(s_i^t).
     %\end{equation}
     %We␣learn␣a␣generative␣model␣over␣each␣sub-patch␣$s_i$␣so␣that␣the␣overall␣likelihood␣of␣an␣image␣is␣parameterized␣by␣a␣set␣of␣$K$␣different␣$Z_i$␣components,␣each␣of␣which␣represents␣a␣Gaussian␣Mixture␣Model,␣so␣the␣likelihood␣of␣patch␣$s_i$:
     %\begin{equation}
     %p(s_i^t|Z)␣=␣\sum_{i=1}^P␣\sum_{j=1}^K␣m_{ij}␣\gN(s_i,␣\mu_{ij},␣\sigma^2),
     %\end{equation}
     %where␣$m_{ij}$␣denotes␣a␣segmentation␣mask␣over␣an␣image␣and␣$\mu_{ij}$␣corresponds␣pixel␣mean␣predictions,␣such␣that␣$\sum_j␣m_{ij}␣=␣1$
     %Taking␣inspiration␣from␣the␣approach␣in␣\citet{burgess2019monet},␣we␣represent␣$\gN(s_i,␣\mu_{ij},␣\sigma^2)$␣as␣a␣VAE␣\cite{Kingma2014Auto},␣with␣corresponding␣latents␣$y_i$,␣so␣that␣each␣component␣has␣a␣latent␣state␣$Z_i␣=␣(m_i,␣y_i)$.␣Following␣\citet{burgess2019monet},␣we␣decode␣each␣of␣the␣$Z_i$␣for␣each␣of␣the␣$K$␣components␣sequentially.
     %We␣initialize␣context␣$c_0$,␣which␣represents␣the␣amount␣of␣context␣in␣the␣image␣yet␣to␣be␣explained␣to␣be␣1,␣and␣at␣each␣step␣we␣decode␣the␣attention␣mask
     %\begin{equation}
     %m_i␣=␣c_{i-1}␣\alpha_{\psi}(x;c_{i-1}),
     %\end{equation}
     %using␣a␣parametrized␣attention␣network␣$\alpha_{\psi}$.␣We␣correspondingly␣update␣the␣context␣in␣the␣image␣by
     %\begin{equation}
     %c_i␣=␣c_{i-1}␣(1␣-␣\alpha_{\psi}(x;c_{i-1}).
     %\end{equation}
     %Given␣a␣decoded␣mask,␣we␣decode␣variational␣variables␣$y_i$␣via
     %\begin{equation}
     %****␣method.tex␣Line␣150␣****
     %y_i␣\sim␣q_\phi␣(y_i|m_i,␣x)
     %\end{equation}
     %for␣variational␣encoder␣$q_\phi$.␣We␣parameterize␣the␣output␣likelihood␣as
     %\begin{equation}
     %\gN(x,␣\mu_{ij},␣\sigma^2)␣=␣p_{\theta}(x|y_i),
     %\end{equation}
     %using␣a␣variation␣decoder␣$p_{\theta}$.
     %We␣found␣that␣such␣a␣decomposition␣of␣likelihood␣across␣sub-patches␣of␣a␣component␣image␣allowed␣for␣better␣decomposition␣of␣latents␣state␣$Z$,␣as␣patches␣often␣only␣contain␣a␣small␣number␣of␣objects,␣allowing␣POD-Net␣to␣decompose␣all␣instances␣of␣objects.␣Given␣latent␣states␣for␣$Z_i$␣for␣each␣sub-patch␣$s_i$,␣we␣can␣obtain␣latent␣states␣$Z$␣for␣the␣entire␣image␣$x$␣by␣overlaying␣the␣each␣predicted␣mask␣in␣$Z_i$␣of␣each␣patch␣on␣to␣each␣other␣(c.f.␣\fig{fig:overview}).
     %\subsection{Projection␣Model}
     %\label{sect:projection}
     %Our␣projection␣model␣$f$␣maps␣from␣latent␣states␣$Z_i$␣to␣explicit␣object␣states␣$z_i$.␣␣We␣decompose␣the␣projection␣model␣as
     %\begin{equation}
     %g(z_i|Z_i)␣=␣q(z_i|m_i)p(m_i|Z_i),
     %\end{equation}
     %where␣$m_i$␣corresponds␣to␣a␣segmentation␣masks␣from␣the␣latent␣state␣$Z_i$.␣We␣wish␣the␣resultant␣projection␣model␣to␣result␣in␣a␣physically␣meaningful␣representation␣of␣scene,␣and␣thus␣apply␣steps␣to␣bias␣the␣underlying␣projection␣model.␣We␣first␣constrain␣the␣projection␣model,␣$p(m_i|Z_i)$␣to␣directly␣be␣the␣segmentation␣mask␣in␣\sect{sect:inference}.␣We␣further␣set␣$q(z_i|m_i)$␣to␣represent␣a␣perspective␣mapping.␣To␣do␣this,␣we␣can␣either␣use␣a␣pre-trained␣model,␣or␣a␣differentiable␣back-projection␣module␣(from␣2D␣to␣3D).
     %\subsection{Physics␣Model}
     %\label{sect:physics}
     %Our␣physics␣model␣represent␣physics␣$\Phi$␣as␣a␣conditional␣distribution␣of␣object␣state␣$z_i^T$␣consisting␣of␣parameters␣$s_i^T,␣t_i^T,␣q_i^T,␣m_i^T$␣representing␣size,␣translation,␣rotation␣and␣segmentation␣mask␣respectively,␣under␣previous␣observation␣$z_{1..K}^{1..t-1}$.␣We␣decompose␣this␣distribution␣per␣object␣state␣and␣construct␣our␣conditional␣distribution␣based␣off␣an␣inferred␣state␣$z_i^{t’}$␣per␣object␣given␣past␣observations:
     %****␣method.tex␣Line␣175␣****
     %\begin{align}
     %\Phi(z_{1..K}^t␣|␣z_{1..K}^{1..t-1})␣&amp;=␣␣\prod_i␣p(z_i^t|z_{1..K}^{1..t-1})␣\\
     %&amp;=␣␣\prod_i␣p(z_i^t|z_i^{t’}).
     %\end{align}
     %To␣compute␣the␣inferred␣state␣$z_i^{t’}␣=␣(s_i^{t’},␣t_i^{t’},␣q_i^{t’},␣m_i^{t’})$␣from␣past␣observations,␣we␣assume␣objects␣will␣have␣consistent␣size␣and␣translate␣and␣rotate␣smoothly:
     %\begin{equation}
     %s_i^{t’}␣=␣\text{mean}(s_i^{1..t-1}),
     %\end{equation}
     %\begin{equation}
     %t_i^{t’}␣=␣t_i^{t-1}␣+␣\text{mean}(t_i^{k}␣-␣t_i^{k-1}),
     %\end{equation}
     %\begin{equation}
     %q_i^{t’}␣=␣q_i^{t-1}␣+␣\text{mean}(q_i^{k}␣-␣q_i^{k-1}).
     %\end{equation}
     %To␣compute␣the␣inferred␣segmentation␣mask␣$m_i^{t’}$,␣we␣use␣an␣inverse␣perspective␣model␣to␣the␣one␣detailed␣in␣\sect{sect:projection}.␣In␣particular,␣we␣train␣a␣separate␣inverse␣model␣$r(m_i|s_i,␣t_i,␣q_i)$,␣on␣the␣same␣data␣set␣of␣single␣cuboid␣scenes.␣While␣our␣inverse␣model␣generates␣candidate␣segmentation␣masks␣per␣object,␣these␣segmentation␣masks␣may␣overlap␣each␣other.␣To␣obtain␣distinct␣segmentation␣we␣overlay␣these␣masks␣per␣object␣on␣each␣other,␣assigning␣a␣pixel␣to␣a␣particular␣segmentation␣based␣on␣nearest␣depth,␣as␣determined␣from␣the␣absolute␣translation␣coordinates␣of␣an␣object:
     %\begin{equation}
     %m_i^{t’}␣=␣\text{overlay}␣(r(s_i^{t’},␣t_i^{t’},␣q_i^{t’})).
     %\end{equation}
     %Given␣an␣inferred␣state␣$z_i^{t’}$␣we␣decompose␣the␣probability␣distribution␣$p(z_i^t|z_i^{t’})$␣as
     %\begin{equation}
     %p(z_i^t|z_i^{t’})␣=␣p(s_i^t|s_i^{t’})␣p(t_i^t|t_i^{t’})␣p(q_i^t|q_i^{t’})␣p(m_i^t|m_i^{t’}),
     %\end{equation}
     %where␣$p(s_i^t|s_i^{t’})$,␣$p(t_i^t|t_i^{t’})$,␣$p(q_i^t|q_i^{t’})$␣are␣represented␣by␣a␣Gaussian␣distribution.␣To␣represent␣$p(m_i^t|m_i^{t’})$,␣since␣both␣$m_i^t$␣and␣$m_i^{t’}$␣are␣continuous␣variables,␣we␣threshold␣both␣variables␣to␣binary␣values,␣where␣values␣larger␣than␣0.5␣are␣set␣to␣1␣and␣values␣less␣than␣0.5␣are␣set␣to␣0:
     %****␣method.tex␣Line␣200␣****
     %\begin{equation}
     %p(m_i^t|m_i^{t’})␣=␣\text{thresh}(m_i^t)␣\log␣(m_i^{t’})␣+␣\text{thresh}(m_i^{t’})␣\log␣(m_i^{t})
     %\end{equation}
     %We␣note␣that␣since␣the␣generative,␣projection,␣and␣physics␣models␣are␣all␣fully␣differentiable,␣the␣above␣likelihood␣allows␣inference␣from␣physical␣consistency␣to␣directly␣inform␣segmentation␣of␣latent␣steps␣across␣different␣time-steps.␣While␣our␣current␣formulation␣of␣physics␣is␣based␣on␣predefined␣rules,␣a␣differentiable␣neural␣network␣based␣physics␣engine~\cite{sanchez2018graph}␣can␣also␣be␣substituted␣on␣primitive␣states␣to␣represent␣more␣complex␣and␣variable␣physics.
     %\vspace{-5pt}-->    </subsection>
  </section>
  <section inlist="toc" xml:id="S4">
    <tags>
      <tag>4</tag>
      <tag role="autoref">section 4</tag>
      <tag role="refnum">4</tag>
      <tag role="typerefnum">§4</tag>
    </tags>
    <title><tag close=" ">4</tag>Evaluation</title>
<!--  %\vspace{-5pt} -->    <para xml:id="S4.p1">
      <p>We evaluate POD-Net on unsupervised object discovery in two different scenarios: a synthetic dataset consisting of various moving ShapeNet objects, and a real dataset of block towers falling. We also test how inferred 3D primitives can support more advanced physical reasoning.</p>
    </para>
    <subsection inlist="toc" labels="LABEL:sect:shapenet" xml:id="S4.SS1">
      <tags>
        <tag>4.1</tag>
        <tag role="autoref">subsection 4.1</tag>
        <tag role="refnum">4.1</tag>
        <tag role="typerefnum">§4.1</tag>
      </tags>
      <title><tag close=" ">4.1</tag>Moving ShapeNet</title>
<!--  %We␣begin␣with␣a␣video␣data␣set␣of␣moving␣ShapeNet␣objects. -->      <para xml:id="S4.SS1.p1">
        <p>We use ShapeNet objects to explore the ability of POD-Net to learn to segment objects from appearance and motion cues. We also test its ability to generalize to new shapes and textures.</p>
      </para>
      <figure inlist="lof" labels="LABEL:fig:adept_qualitative" placement="t" xml:id="S4.F4">
        <tags>
          <tag><text fontsize="90%">Figure 4</text></tag>
          <tag role="autoref">Figure 4</tag>
          <tag role="refnum">4</tag>
          <tag role="typerefnum">Figure 4</tag>
        </tags>
<!--  %\begin{wrapfigure}{l}{0.5\textwidth} 
     %\vspace{-6mm}-->        <graphics candidates="fig/qual_pred_adept.pdf" class="ltx_centering" graphic="fig/qual_pred_adept.pdf" options="width=433.62pt" xml:id="S4.F4.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">4</tag><text fontsize="90%">Comparisons of unsupervised object segmentation of POD-Net with and without motion and with MONet on scenes with synthetic objects. MONet is unable to separate individual instances of objects, but is capable of getting a foreground mask of objects in a scene. POD-Net (no physics) is able to reliably detect almost all objects, though some instances of objects are merged together into a single object. POD-Net is able to reliably detect separate objects even when they are mostly occluded (zoomed-in images on right).</text></toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 4</text></tag><text fontsize="90%">Comparisons of unsupervised object segmentation of POD-Net with and without motion and with MONet on scenes with synthetic objects. MONet is unable to separate individual instances of objects, but is capable of getting a foreground mask of objects in a scene. POD-Net (no physics) is able to reliably detect almost all objects, though some instances of objects are merged together into a single object. POD-Net is able to reliably detect separate objects even when they are mostly occluded (zoomed-in images on right).</text></caption>
      </figure>
      <paragraph inlist="toc" xml:id="S4.SS1.SSS0.Px1">
        <title>Data.</title>
        <para xml:id="S4.SS1.SSS0.Px1.p1">
          <p>To train models on moving ShapeNet objects, we use the generation code provided in the ADEPT dataset in <cite class="ltx_citemacro_citet"><bibref bibrefs="smith2019modeling" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
                <bibrefphrase>(</bibrefphrase>
                <bibrefphrase>)</bibrefphrase>
              </bibref></cite>. We generate a training set of 1,000 videos, each 100 frames long, of objects (80% of the objects from 44 ShapeNet categories) as well as rectangular occluders. Objects move in either a straight line, back and forth, or rotate, but do not collide with each other.</p>
        </para>
<!--  %The␣ADEPT␣dataset␣consists␣of␣1000␣videos␣each␣100␣frames␣each.␣Each␣video␣consists␣of␣a␣set␣of␣ShapeNet␣objects␣(from␣44␣different␣Shapenet␣categories)␣moving␣as␣well␣as␣an␣associated␣rectangular␣occluder.␣␣We␣generated␣3␣different -->      </paragraph>
      <paragraph inlist="toc" xml:id="S4.SS1.SSS0.Px2">
        <title>Setup.</title>
        <para xml:id="S4.SS1.SSS0.Px2.p1">
          <p>Videos have a resolution of 1024<Math mode="inline" tex="\times" text="*" xml:id="S4.SS1.SSS0.Px2.p1.m1">
              <XMath>
                <XMTok meaning="times" role="MULOP">×</XMTok>
              </XMath>
            </Math>1024 pixels. We apply our model with a patch size of 256<Math mode="inline" tex="\times" text="*" xml:id="S4.SS1.SSS0.Px2.p1.m2">
              <XMath>
                <XMTok meaning="times" role="MULOP">×</XMTok>
              </XMath>
            </Math>256. We use a residual architecture <cite class="ltx_citemacro_citep">(<bibref bibrefs="He2015Deep" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
                <bibrefphrase>, </bibrefphrase>
              </bibref>)</cite> for the attention and VAE components. Our backprojection model is pretrained on scenes of a single ShapeNet object, varied across different locations on a plane, with different rotations, translations, and scales. Our backprojection model only serves as a rough relative map from 2D mask to corresponding 3D position/size, as the dataset it is trained on uses different camera extrinsics/intrinsics than the ADEPT dataset, and also does not exhibit occlusions. To compute the physical plausibility <Math mode="inline" tex="L_{physics}" text="L _ (p * h * y * s * i * c * s)" xml:id="S4.SS1.SSS0.Px2.p1.m3">
              <XMath>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">L</XMTok>
                  <XMApp>
                    <XMTok meaning="times" role="MULOP">⁢</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">p</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">h</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">y</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">s</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">i</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">c</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">s</XMTok>
                  </XMApp>
                </XMApp>
              </XMath>
            </Math> of primitives, we utilize the observations from the last three time steps. For efficiency, we evaluate physical plausibility on each component sub-patch of the image. We train a recurrent model with a total of 5 slots for each image. Image segmentation is trained and evaluated on a per frame basis.</p>
        </para>
      </paragraph>
      <paragraph inlist="toc" xml:id="S4.SS1.SSS0.Px3">
        <title>Metrics.</title>
        <para xml:id="S4.SS1.SSS0.Px3.p1">
          <p>To quantify our results, we measure the intersection over union (IoU) between the predicted segmentation masks and the corresponding ground truth masks. We compute the IoU for each ground truth mask by finding the maximum IoU intersection with a predicted mask. We report the average IoU across all objects in an image, as well as the percentage of objects detected in an image (with IoU <Math mode="inline" tex="&gt;~{}0.5" text="absent &gt; 0.5" xml:id="S4.SS1.SSS0.Px3.p1.m1">
              <XMath>
                <XMApp>
                  <XMTok meaning="greater-than" role="RELOP" rpadding="3.3pt">&gt;</XMTok>
                  <XMTok meaning="absent"/>
                  <XMTok meaning="0.5" role="NUMBER">0.5</XMTok>
                </XMApp>
              </XMath>
            </Math>). To measure 3D inference ability, we report the maximum 3D IoU intersection between each ground truth 3D box and our inferred 3D bounding box. We also report the recall of ground truth 3D objects <cite class="ltx_citemacro_citep">(<bibref bibrefs="georgakis_rgbd" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
                <bibrefphrase>, </bibrefphrase>
              </bibref>)</cite> detected in an image (with 3D IoU threshold <Math mode="inline" tex="&gt;~{}0.1" text="absent &gt; 0.1" xml:id="S4.SS1.SSS0.Px3.p1.m2">
              <XMath>
                <XMApp>
                  <XMTok meaning="greater-than" role="RELOP" rpadding="3.3pt">&gt;</XMTok>
                  <XMTok meaning="absent"/>
                  <XMTok meaning="0.1" role="NUMBER">0.1</XMTok>
                </XMApp>
              </XMath>
            </Math>). We utilize our backprojection model to extract 3D bounding box proposals from 2D segmentations and apply a linear transformation to align coordinate spaces.</p>
        </para>
<!--  %\begin{wraptable}{r}{.6\linewidth} -->        <table inlist="lot" labels="LABEL:tbl:iou_adept" placement="t" xml:id="S4.T1">
          <tags>
            <tag><text fontsize="90%">Table 1</text></tag>
            <tag role="autoref"><text fontsize="90%">Table 1</text></tag>
            <tag role="refnum"><text fontsize="90%">1</text></tag>
            <tag role="typerefnum"><text fontsize="90%">Table 1</text></tag>
          </tags>
          <tabular class="ltx_centering ltx_guessed_headers" colsep="3.0pt" vattach="middle">
            <thead>
              <tr>
                <td align="left" border="tt" thead="column row"><text font="bold" fontsize="90%">Model</text></td>
                <td align="center" border="tt" thead="column"><text font="bold" fontsize="90%">Multi-Scale</text></td>
                <td align="center" border="tt" thead="column"><text font="bold" fontsize="90%">Phys</text></td>
                <td align="center" border="tt" thead="column"><text font="bold" fontsize="90%">IoU</text></td>
                <td align="center" border="tt" thead="column"><text font="bold" fontsize="90%">Detection</text></td>
                <td align="center" border="tt" thead="column"><text font="bold" fontsize="90%">3D IoU</text></td>
                <td align="center" border="tt" thead="column"><text font="bold" fontsize="90%">3D Recall</text></td>
              </tr>
            </thead>
            <tbody>
              <tr>
                <td align="left" border="t" thead="row"><text fontsize="90%">MONet</text></td>
                <td align="center" border="t"><text fontsize="90%">-</text></td>
                <td align="center" border="t"><text fontsize="90%">-</text></td>
                <td align="center" border="t"><text fontsize="90%">0.289 (0.007)</text></td>
                <td align="center" border="t"><text fontsize="90%">0.306 (0.005)</text></td>
                <td align="center" border="t"><text fontsize="90%">0.019 (0.007)</text></td>
                <td align="center" border="t"><text fontsize="90%">0.057 (0.028)</text></td>
              </tr>
              <tr>
                <td align="left" thead="row"><text fontsize="90%">OP3</text></td>
                <td align="center"><text fontsize="90%">-</text></td>
                <td align="center"><text fontsize="90%">-</text></td>
                <td align="center"><text fontsize="90%">0.145 (0.004)</text></td>
                <td align="center"><text fontsize="90%">0.121 (0.007)</text></td>
                <td align="center"><text fontsize="90%">0.001 (0.001)</text></td>
                <td align="center"><text fontsize="90%">0.000 (0.000)</text></td>
              </tr>
              <tr>
                <td align="left" thead="row"><text fontsize="90%">Norm. Cuts</text></td>
                <td align="center"><text fontsize="90%">-</text></td>
                <td align="center"><text fontsize="90%">-</text></td>
                <td align="center"><text fontsize="90%">0.634 (0.020)</text></td>
                <td align="center"><text fontsize="90%">0.768 (0.029)</text></td>
                <td align="center"><text fontsize="90%">0.034 (0.003)</text></td>
                <td align="center"><text fontsize="90%">0.042 (0.005)</text></td>
              </tr>
              <tr>
                <td align="left" thead="row"><text fontsize="90%">UVOD</text></td>
                <td align="center"><text fontsize="90%">-</text></td>
                <td align="center"><text fontsize="90%">-</text></td>
                <td align="center"><text fontsize="90%">0.129 (0.006)</text></td>
                <td align="center"><text fontsize="90%">0.062 (0.006)</text></td>
                <td align="center"><text fontsize="90%">0.001 (0.000)</text></td>
                <td align="center"><text fontsize="90%">0.000 (0.000)</text></td>
              </tr>
              <tr>
                <td align="left" thead="row"><text fontsize="90%">Crisp Boundary Detection</text></td>
                <td align="center"><text fontsize="90%">-</text></td>
                <td align="center"><text fontsize="90%">-</text></td>
                <td align="center"><text fontsize="90%">0.645 (0.012)</text></td>
                <td align="center"><text fontsize="90%">0.727 (0.020)</text></td>
                <td align="center"><text fontsize="90%">0.080 (0.004)</text></td>
                <td align="center"><text fontsize="90%">0.020 (0.001)</text></td>
              </tr>
              <tr>
                <td align="left" border="t" thead="row"><text fontsize="90%">POD-Net</text></td>
                <td align="center" border="t"><text fontsize="90%">No</text></td>
                <td align="center" border="t"><text fontsize="90%">No</text></td>
                <td align="center" border="t"><text fontsize="90%">0.314 (0.010)</text></td>
                <td align="center" border="t"><text fontsize="90%">0.361 (0.007)</text></td>
                <td align="center" border="t"><text fontsize="90%">0.052 (0.012)</text></td>
                <td align="center" border="t"><text fontsize="90%">0.171 (0.012)</text></td>
              </tr>
              <tr>
                <td align="left" thead="row"><text fontsize="90%">POD-Net</text></td>
                <td align="center"><text fontsize="90%">No</text></td>
                <td align="center"><text fontsize="90%">Yes</text></td>
                <td align="center"><text fontsize="90%">0.462 (0.007)</text></td>
                <td align="center"><text fontsize="90%">0.512 (0.009)</text></td>
                <td align="center"><text fontsize="90%">0.071 (0.012)</text></td>
                <td align="center"><text fontsize="90%">0.287 (0.016)</text></td>
              </tr>
              <tr>
                <td align="left" thead="row"><text fontsize="90%">POD-Net</text></td>
                <td align="center"><text fontsize="90%">Yes</text></td>
                <td align="center"><text fontsize="90%">No</text></td>
                <td align="center"><text fontsize="90%">0.649 (0.011)</text></td>
                <td align="center"><text fontsize="90%">0.709 (0.016)</text></td>
                <td align="center"><text fontsize="90%">0.068 (0.011)</text></td>
                <td align="center"><text fontsize="90%">0.251 (0.014)</text></td>
              </tr>
              <tr>
                <td align="left" thead="row"><text fontsize="90%">POD-Net (Manual)</text></td>
                <td align="center"><text fontsize="90%">Yes</text></td>
                <td align="center"><text fontsize="90%">Yes</text></td>
                <td align="center"><text fontsize="90%">0.685 (0.017)</text></td>
                <td align="center"><text fontsize="90%">0.760 (0.016)</text></td>
                <td align="center"><text fontsize="90%">0.090 (0.015)</text></td>
                <td align="center"><text fontsize="90%">0.328 (0.016)</text></td>
              </tr>
              <tr>
                <td align="left" border="bb" thead="row"><text fontsize="90%">POD-Net</text></td>
                <td align="center" border="bb"><text fontsize="90%">Yes</text></td>
                <td align="center" border="bb"><text fontsize="90%">Yes</text></td>
                <td align="center" border="bb"><text font="bold" fontsize="90%">0.739 (0.011)</text></td>
                <td align="center" border="bb"><text font="bold" fontsize="90%">0.821 (0.015)</text></td>
                <td align="center" border="bb"><text font="bold" fontsize="90%">0.095 (0.012)</text></td>
                <td align="center" border="bb"><text font="bold" fontsize="90%">0.374 (0.017)</text></td>
              </tr>
            </tbody>
          </tabular>
<!--  %****␣iou_adept.tex␣Line␣25␣**** -->          <toccaption class="ltx_centering"><tag close=" "><text fontsize="90%">1</text></tag><text fontsize="90%">Average IoU of segmentations on the ADEPT dataset and the proportion of objects detected, where one segmentation mask has greater than 0.5 IoU, as well as average 3D IoU and recall. Standard error in parentheses.</text></toccaption>
          <caption class="ltx_centering" fontsize="90%"><tag close=": ">Table 1</tag>Average IoU of segmentations on the ADEPT dataset and the proportion of objects detected, where one segmentation mask has greater than 0.5 IoU, as well as average 3D IoU and recall. Standard error in parentheses.</caption>
<!--  %\end{wraptable} -->        </table>
      </paragraph>
      <paragraph inlist="toc" xml:id="S4.SS1.SSS0.Px4">
        <title>Baselines.</title>
        <para xml:id="S4.SS1.SSS0.Px4.p1">
          <p>We compare with two recent models of self-supervised object discovery, OP3 <cite class="ltx_citemacro_citep">(<bibref bibrefs="veerapaneni2019entity" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
                <bibrefphrase>, </bibrefphrase>
              </bibref>)</cite> and MONet <cite class="ltx_citemacro_citep">(<bibref bibrefs="burgess2019monet" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
                <bibrefphrase>, </bibrefphrase>
              </bibref>)</cite>, as well as three algorithms for object segmentation, Normalized Cuts <cite class="ltx_citemacro_citep">(<bibref bibrefs="Shi2000Normalized" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
                <bibrefphrase>, </bibrefphrase>
              </bibref>)</cite>, Crisp Boundary  <cite class="ltx_citemacro_citep">(<bibref bibrefs="isola_zoran_krishnan_adelson_2014" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
                <bibrefphrase>, </bibrefphrase>
              </bibref>)</cite>, and the recent UVOD <cite class="ltx_citemacro_citep">(<bibref bibrefs="yang2019unsupervised" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
                <bibrefphrase>, </bibrefphrase>
              </bibref>)</cite>. We train OP3 with 7 slots, with 4 steps of optimization per mask in the first image, and an additional step of optimization per future timestep. Due to memory constraints, we were only able to train the OP3 model on inputs of size 128 by 128. We train MONet on inputs of size 256 by 256. We apply normalized cuts on a region adjacency graph of the 256 by 256 image, and train UVOD on 256 by 256 images. <!--  %We␣use␣an␣public␣reimplementation␣of␣MONet␣on␣Github.\footnote{https://github.com/baudm/MONet-pytorch} -->We also compare with ablations of POD-Net: POD-Net applied directly to an image (single-scale) as opposed to across patches (multi-scale), POD-Net without physics, and POD-Net with a hard-coded backprojection model (‘Manual’).
<!--  %****␣experiments.tex␣Line␣25␣**** --></p>
        </para>
<!--  %\input{figText/motion_generalize.tex} 
     %\begin{figure}[t]
     %%\vspace{-20pt}
     %\begin{minipage}[b]{0.34\textwidth}
     %\centering
     %%␣\begin{wrapfigure}{l}{0.5\textwidth}
     %%␣\vspace{-6mm}
     %\includegraphics[width=\linewidth]{fig/both_3d.pdf}
     %%␣\vspace{-15pt}
     %\caption{Plot␣of␣predicted␣translation␣of␣␣3D␣primitive␣vs␣ground␣truth␣translation␣of␣3D␣primitives␣(top)␣and␣plot␣of␣predicted␣scale␣of␣3D␣primitive␣vs␣ground␣scale␣of␣3D␣primitive.␣}
     %\label{fig:scatter_3d}
     %\end{minipage}\hfill
     %\begin{minipage}[b]{0.64\textwidth}
     %\centering
     %%␣\begin{wrapfigure}{l}{0.5\textwidth}
     %%␣\vspace{-6mm}
     %\includegraphics[width=\linewidth]{fig/3d_primitive_cube.pdf}
     %%␣\vspace{-15pt}
     %\caption{Visualization␣of␣discovered␣3D␣primitive␣in␣two␣different␣scenes␣(top␣and␣bottom)␣through␣time.␣Our␣model␣is␣able␣to␣discover␣a␣3D␣shape,␣that␣is␣consistent␣with␣observed␣inputs␣under␣a␣perspective␣map.␣Furthermore,␣discovered␣primitive␣move␣coherently␣through␣time.}
     %\label{fig:cubes_3d}
     %\end{minipage}
     %\vspace{-10pt}
     %\end{figure}-->        <figure inlist="lof" labels="LABEL:fig:cubes_3d" placement="t" xml:id="S4.F5">
          <tags>
            <tag><text fontsize="90%">Figure 5</text></tag>
            <tag role="autoref">Figure 5</tag>
            <tag role="refnum">5</tag>
            <tag role="typerefnum">Figure 5</tag>
          </tags>
<!--  %****␣3d_primitive_cube.tex␣Line␣25␣**** 
     %\begin{wrapfigure}{l}{0.5\textwidth}
     %\vspace{-6mm}-->          <graphics candidates="fig/3d_primitive_cube.pdf" class="ltx_centering" graphic="fig/3d_primitive_cube.pdf" options="width=433.62pt" xml:id="S4.F5.g1"/>
          <toccaption class="ltx_centering"><tag close=" ">5</tag><text fontsize="90%">Visualization of discovered 3D primitives in two different scenes (top and bottom) through time. Our model is able to discover a 3D shape that is consistent with observed inputs under a perspective map. Furthermore, discovered primitives move coherently through time.</text></toccaption>
          <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 5</text></tag><text fontsize="90%">Visualization of discovered 3D primitives in two different scenes (top and bottom) through time. Our model is able to discover a 3D shape that is consistent with observed inputs under a perspective map. Furthermore, discovered primitives move coherently through time.</text></caption>
        </figure>
      </paragraph>
      <paragraph inlist="toc" xml:id="S4.SS1.SSS0.Px5">
        <title>Results.</title>
        <para xml:id="S4.SS1.SSS0.Px5.p1">
          <p>We quantitatively compare object masks discovered by our model and other baselines in Table <ref labelref="LABEL:tbl:iou_adept"/>. We find that OP3 performs poorly, as it only discovers a limited subset of objects. <!--  %(perhaps␣partially␣due␣to␣reduced␣input␣resolution).␣We␣find␣that -->MONet performs better and is able to discover a single foreground mask of all objects. However, the masks are not decomposed into separate component objects in a scene (Figure <ref labelref="LABEL:fig:adept_qualitative"/>, 2nd row). Our scenes consist of a variable set of objects of vastly different scales, making it hard for MONet to learn to assign individual slots for each object. We find that a baseline based on normalized cuts/crisp boundary detection is also able to segment objects, but is unable to get sharp segmentation boundaries for each object, and often decomposes a single object into multiple subobjects (see Appendix <ref labelref="LABEL:app:nc"/> for details). Finally, UVOD also only segments a single foreground object.</p>
        </para>
        <figure inlist="lof" labels="LABEL:fig:scatter_3d" placement="t" xml:id="S4.F6">
          <tags>
            <tag><text fontsize="90%">Figure 6</text></tag>
            <tag role="autoref">Figure 6</tag>
            <tag role="refnum">6</tag>
            <tag role="typerefnum">Figure 6</tag>
          </tags>
<!--  %\vspace{-6mm} -->          <inline-para align="center" class="ltx_minipage" vattach="middle" width="299.2pt">
            <para xml:id="S4.F6.p1">
              <graphics candidates="fig/both_3d.pdf" graphic="fig/both_3d.pdf" options="width=433.62pt" xml:id="S4.F6.p1.g1"/>
            </para>
          </inline-para>
          <toccaption class="ltx_centering"><tag close=" ">6</tag><text fontsize="90%">Plot of predicted translation of 3D primitives vs ground truth translation of 3D primitives (top) and plot of predicted scale of 3D primitives vs ground truth scale of 3D primitives (bottom). </text></toccaption>
          <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 6</text></tag><text fontsize="90%">Plot of predicted translation of 3D primitives vs ground truth translation of 3D primitives (top) and plot of predicted scale of 3D primitives vs ground truth scale of 3D primitives (bottom). </text></caption>
        </figure>
        <para xml:id="S4.SS1.SSS0.Px5.p2">
          <p>We find that applying POD-Net (single scale, no physics) improves on MONet slightly, discovering several different masks containing multiple objects, albeit sometimes missing other objects. POD-Net (single scale, physics) more reliably segments separate objects, but still misses objects. POD-Net (multi scale, no physics) reliably segments all objects in the scene, but often merges multiple objects into one object, especially when objects are overlapping (e.g., Figure <ref labelref="LABEL:fig:adept_qualitative"/>, 3rd row). Finally, POD-Net obtains the best performance and segments all objects in the scene and individual objects where multiple objects overlap with each other (Figure <ref labelref="LABEL:fig:adept_qualitative"/>, 4th row). Utilizing a manually coded backprojection module, POD-Net (‘Manual’) only leads to slight degradation in performance.</p>
        </para>
<!--  %\jw{Let’s␣make␣sure␣the␣figures␣and␣tables␣appear␣in␣the␣same␣order␣as␣they␣appear␣in␣text.␣Also␣please␣re-arrange␣them␣so␣that␣they␣are␣close␣to␣the␣descriptions.} 
     %\jw{And␣Fig␣4␣(predicted␣translation)␣seems␣never␣discussed?}
     %We␣provide␣qualitative␣visualizations␣of␣model’s␣predictions␣in␣\fig{fig:adept_qualitative}.␣Given␣an␣input␣image,␣we␣find␣that␣MONet␣is␣able␣to␣reliably␣segment␣the␣foreground␣and␣background,␣but␣considers␣all␣foreground␣objects␣as␣a␣single␣object.␣\model␣with␣multiscale␣signal␣is␣able␣to␣segment␣all␣foreground␣objects,␣but␣still␣often␣merges␣several␣foreground␣objects␣as␣a␣single␣object␣when␣they␣are␣overlapping.␣\model␣with␣multiscale␣and␣physical␣consistency␣is␣able␣to␣segment␣all␣foreground␣objects,␣while␣also␣distinguishing␣those␣that␣overlap.␣However,␣\model␣with␣multiscale␣and␣physical␣consistency␣has␣the␣weakness␣that␣it␣exhibits␣over-segmentation,␣and␣sometimes␣segments␣sharp␣shadows␣as␣another␣different␣object.
     %\input{figText/both_3d}-->        <para xml:id="S4.SS1.SSS0.Px5.p3">
          <p>Next we analyze the 3D objects discovered by POD-Net. In Table <ref labelref="LABEL:tbl:iou_adept"/>, POD-Net performs the best, achieving the highest 3D average IoU intersection and recall. Crisp boundary detection obtains a high average IoU but low recall due to a large number of proposals. All IoUs are low due to the challenging nature of the task – obtaining high 3D IoU requires correct regression of size, position, and depth (using only RGB inputs). Even recent supervised 3D reconstruction approaches use 0.25 IoU thresholds for evaluation <cite class="ltx_citemacro_citep">(<bibref bibrefs="dopscvpr" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
                <bibrefphrase>, </bibrefphrase>
              </bibref>)</cite>. Visualizations of the discovered objects in Figure <ref labelref="LABEL:fig:cubes_3d"/> show that POD-Net is able to segment a scene into a set of <text font="italic">temporally consistent</text> 3D cuboid primitives. We further find a high correlation <Math mode="inline" tex="r=0.615" text="r = 0.615" xml:id="S4.SS1.SSS0.Px5.p3.m1">
              <XMath>
                <XMApp>
                  <XMTok meaning="equals" role="RELOP">=</XMTok>
                  <XMTok font="italic" role="UNKNOWN">r</XMTok>
                  <XMTok meaning="0.615" role="NUMBER">0.615</XMTok>
                </XMApp>
              </XMath>
            </Math> between the predicted and ground truth translation of objects and show plots of correlations in Figure <ref labelref="LABEL:fig:scatter_3d"/> of ground truth and predicted objects.
<!--  %analyze␣correlation␣between␣ground␣truth␣and␣inferred␣size␣and␣locations␣of␣primitives␣in␣the␣appendix. --></p>
        </para>
<!--  %\fig{fig:scatter_3d}␣shows␣a␣plot␣of␣predicted␣displacements␣of␣discovered␣3D␣objects␣with␣ground-truth␣object␣displacements.␣It␣shows␣a␣plot␣of␣the␣predicted␣scale␣of␣discovered␣3D␣objects␣with␣ground␣truth.␣␣The␣3D␣objects␣found␣by␣\model␣have␣good␣correlation␣with␣ground␣truth␣3D␣object␣annotations. -->        <figure inlist="lof" labels="LABEL:fig:adept_generalize" placement="t" xml:id="S4.F7">
          <tags>
            <tag><text fontsize="90%">Figure 7</text></tag>
            <tag role="autoref"><text fontsize="90%">Figure 7</text></tag>
            <tag role="refnum"><text fontsize="90%">7</text></tag>
            <tag role="typerefnum"><text fontsize="90%">Figure 7</text></tag>
          </tags>
<!--  %\begin{wrapfigure}{l}{0.5\textwidth} 
     %\vspace{-6mm}-->          <graphics candidates="fig/adept_qual.pdf" class="ltx_centering" graphic="fig/adept_qual.pdf" options="width=433.62pt" xml:id="S4.F7.g1"/>
          <inline-para align="center" class="ltx_minipage" vattach="bottom" width="212.5pt">
            <para align="center" xml:id="S4.F7.p1">
              <tabular class="ltx_guessed_headers" vattach="middle">
                <thead>
                  <tr>
                    <td align="left" border="tt" thead="column row"><text fontsize="90%">Model</text></td>
                    <td align="center" border="tt" thead="column"><text fontsize="90%">Phys</text></td>
                    <td align="center" border="tt" thead="column"><text fontsize="90%">IoU</text></td>
                    <td align="center" border="tt" thead="column"><text fontsize="90%">Detection</text></td>
                  </tr>
                </thead>
                <tbody>
                  <tr>
                    <td align="left" border="bb t" rowspan="2" thead="row"><text fontsize="90%">POD-Net</text></td>
                    <td align="center" border="t"><text fontsize="90%">No</text></td>
                    <td align="center" border="t"><text fontsize="90%">0.768</text></td>
                    <td align="center" border="t"><text fontsize="90%">0.823</text></td>
                  </tr>
                  <tr>
                    <td align="center" border="bb"><text fontsize="90%">Yes</text></td>
                    <td align="center" border="bb"><text fontsize="90%">0.857</text></td>
                    <td align="center" border="bb"><text fontsize="90%">0.922</text></td>
                  </tr>
                </tbody>
              </tabular>
            </para>
          </inline-para>
          <inline-para align="center" class="ltx_minipage" vattach="bottom" width="212.5pt">
            <para align="center" xml:id="S4.F7.p2">
              <tabular class="ltx_guessed_headers" vattach="middle">
                <thead>
                  <tr>
                    <td align="left" border="tt" thead="column row"><text fontsize="90%">Model</text></td>
                    <td align="center" border="tt" thead="column"><text fontsize="90%">Phys</text></td>
                    <td align="center" border="tt" thead="column"><text fontsize="90%">IoU</text></td>
                    <td align="center" border="tt" thead="column"><text fontsize="90%">Detection</text></td>
                  </tr>
                </thead>
                <tbody>
                  <tr>
                    <td align="left" border="bb t" rowspan="2" thead="row"><text fontsize="90%">POD-Net</text></td>
                    <td align="center" border="t"><text fontsize="90%">No</text></td>
                    <td align="center" border="t"><text fontsize="90%">0.658</text></td>
                    <td align="center" border="t"><text fontsize="90%">0.716</text></td>
                  </tr>
                  <tr>
                    <td align="center" border="bb"><text fontsize="90%">Yes</text></td>
                    <td align="center" border="bb"><text fontsize="90%">0.756</text></td>
                    <td align="center" border="bb"><text fontsize="90%">0.843</text></td>
                  </tr>
                </tbody>
              </tabular>
            </para>
          </inline-para>
          <toccaption class="ltx_centering"><tag close=" "><text fontsize="90%">7</text></tag><text fontsize="90%">Generalization to novel objects and colors. Top: POD-Net successfully segments individual objects, except when colors bisect an object (row 2, column 7). Bottom: Evaluation of POD-Net’s generalization with or without physical consistency, measured in average IoUs on segmentations and in the percentage of objects that are detected. Including physics integrates the motion signal and generalizes better in both cases.</text></toccaption>
          <caption class="ltx_centering" fontsize="90%"><tag close=": ">Figure 7</tag>Generalization to novel objects and colors. Top: POD-Net successfully segments individual objects, except when colors bisect an object (row 2, column 7). Bottom: Evaluation of POD-Net’s generalization with or without physical consistency, measured in average IoUs on segmentations and in the percentage of objects that are detected. Including physics integrates the motion signal and generalizes better in both cases.</caption>
        </figure>
      </paragraph>
      <paragraph inlist="toc" xml:id="S4.SS1.SSS0.Px6">
        <title>Generalization.</title>
        <para xml:id="S4.SS1.SSS0.Px6.p1">
          <p>Just as young children can detect and reason about new objects with arbitrary shapes and colors, we test
<!--  %****␣experiments.tex␣Line␣50␣**** 
     %Motion␣with␣physical␣consistency␣is␣a␣ubiquitous␣signal␣used␣in␣early␣childhood␣development␣to␣discover␣objects,␣and␣is␣readily␣transfer␣across␣all␣different␣shapes/textures␣of␣objects.␣We␣thus␣further␣test␣how␣well␣physical␣consistency␣can␣help␣the␣generalization␣of␣our␣model-->how well POD-Net can generalize to scenes with both novel objects and colors. We evaluate the generalization of our model on two datasets: a novel object dataset consisting of 20 new objects and a novel color dataset, where each object is split into two colors.
<!--  %\begin{itemize}[leftmargin=*] 
     %\vspace{-3pt}\item␣Novel␣objects:␣We␣use␣the␣test␣set␣in␣\citet{smith2019modeling},␣consisting␣of␣the␣20\%␣novel␣objects␣from␣44␣ShapeNet␣categories,␣objects␣from␣another␣11␣ShapeNet␣categories␣not␣in␣the␣training␣dataset,␣and␣common␣developmental␣psychology␣objects␣such␣as␣toy␣ducks.
     %\vspace{-3pt}\item␣Novel␣colors:␣We␣generated␣a␣dataset␣with␣object␣distribution␣the␣same␣as␣the␣original␣video␣dataset,␣but␣each␣object␣is␣split␣into␣two␣separate␣colors.
     %\end{itemize}
     %%@Kevin,␣let’s␣avoid␣adding␣too␣much␣text␣as␣we’re␣really␣out␣of␣space.␣@Jiajun:␣I’m␣trying␣to␣confine␣things␣to␣stay␣on␣the␣same␣line␣where␣possible...␣will␣go␣back␣through␣after␣too
     %%thanks!␣sounds␣good.
     %\vspace{-5pt}--></p>
        </para>
<!--  %\model␣can␣provide␣interpretable␣segmentations␣of␣objects. -->        <para xml:id="S4.SS1.SSS0.Px6.p2">
          <p>Figure <ref labelref="LABEL:fig:adept_generalize"/> shows a quantitative analysis of POD-Net applied to both novel objects and colors. We find that in both settings, POD-Net with physical consistency gets better segmentation than without. Performance is higher here compared to that reported on the training set, as both novel datasets contain fewer objects in a single scene. Qualitatively, POD-Net performs well when asked to discover novel objects, although it can mistake a multicolored novel shape to be two objects.</p>
        </para>
<!--  %\vspace{-5pt} -->      </paragraph>
    </subsection>
    <subsection inlist="toc" xml:id="S4.SS2">
      <tags>
        <tag>4.2</tag>
        <tag role="autoref">subsection 4.2</tag>
        <tag role="refnum">4.2</tag>
        <tag role="typerefnum">§4.2</tag>
      </tags>
      <title><tag close=" ">4.2</tag>Real Block Towers</title>
      <figure float="right" inlist="lof" labels="LABEL:fig:cubes_qualitative" xml:id="S4.F8">
        <tags>
          <tag><text fontsize="90%">Figure 8</text></tag>
          <tag role="autoref"><text fontsize="90%">Figure 8</text></tag>
          <tag role="refnum"><text fontsize="90%">8</text></tag>
          <tag role="typerefnum"><text fontsize="90%">Figure 8</text></tag>
        </tags>
        <tabular class="ltx_centering ltx_guessed_headers" colsep="3.5pt" vattach="middle">
          <tbody>
            <tr>
              <td align="left" border="tt" thead="row"><text fontsize="90%">Model</text></td>
              <td align="center" border="tt"><text fontsize="90%">Multi-Scale</text></td>
              <td align="center" border="tt"><text fontsize="90%">Phys</text></td>
              <td align="center" border="tt"><text fontsize="90%">IoU</text></td>
              <td align="center" border="tt"><text fontsize="90%">Detection</text></td>
            </tr>
            <tr>
              <td align="left" border="t" thead="row"><text fontsize="90%">MONet</text></td>
              <td align="center" border="t"><text fontsize="90%">-</text></td>
              <td align="center" border="t"><text fontsize="90%">-</text></td>
              <td align="center" border="t"><text fontsize="90%">0.521 (0.005)</text></td>
              <td align="center" border="t"><text fontsize="90%">0.537 (0.003)</text></td>
            </tr>
            <tr>
              <td align="left" thead="row"><text fontsize="90%">OP3</text></td>
              <td align="center"><text fontsize="90%">-</text></td>
              <td align="center"><text fontsize="90%">-</text></td>
              <td align="center"><text fontsize="90%">0.311 (0.004)</text></td>
              <td align="center"><text fontsize="90%">0.250 (0.007)</text></td>
            </tr>
            <tr>
              <td align="left" thead="row"><text fontsize="90%">Norm. Cuts</text></td>
              <td align="center"><text fontsize="90%">-</text></td>
              <td align="center"><text fontsize="90%">-</text></td>
              <td align="center"><text fontsize="90%">0.652 (0.006)</text></td>
              <td align="center"><text fontsize="90%">0.849 (0.018)</text></td>
            </tr>
            <tr>
              <td align="left" thead="row"><text fontsize="90%">UVOD</text></td>
              <td align="center"><text fontsize="90%">-</text></td>
              <td align="center"><text fontsize="90%">-</text></td>
              <td align="center"><text fontsize="90%">0.029 (0.001)</text></td>
              <td align="center"><text fontsize="90%">0.0 (0.0)</text></td>
            </tr>
            <tr>
              <td align="left" border="t" thead="row"><text fontsize="90%">POD-Net</text></td>
              <td align="center" border="t"><text fontsize="90%">No</text></td>
              <td align="center" border="t"><text fontsize="90%">No</text></td>
              <td align="center" border="t"><text fontsize="90%">0.546 (0.004)</text></td>
              <td align="center" border="t"><text fontsize="90%">0.523 (0.006)</text></td>
            </tr>
            <tr>
              <td align="left" thead="row"><text fontsize="90%">POD-Net</text></td>
              <td align="center"><text fontsize="90%">Yes</text></td>
              <td align="center"><text fontsize="90%">No</text></td>
              <td align="center"><text fontsize="90%">0.734 (0.012)</text></td>
              <td align="center"><text fontsize="90%">0.761 (0.008)</text></td>
            </tr>
            <tr>
              <td align="left" border="bb" thead="row"><text fontsize="90%">POD-Net</text></td>
              <td align="center" border="bb"><text fontsize="90%">Yes</text></td>
              <td align="center" border="bb"><text fontsize="90%">Yes</text></td>
              <td align="center" border="bb"><text font="bold" fontsize="90%">0.837 (0.004)</text></td>
              <td align="center" border="bb"><text font="bold" fontsize="90%">0.908 (0.008)</text></td>
            </tr>
          </tbody>
        </tabular>
        <graphics candidates="fig/qual_pred_block.pdf" class="ltx_centering" graphic="fig/qual_pred_block.pdf" options="width=433.62pt" xml:id="S4.F8.g1"/>
        <toccaption class="ltx_centering"><tag close=" "><text fontsize="90%">8</text></tag><text fontsize="90%">Top: IoU of segmentation results on the real blocks dataset and the percentage of objects detected. Bottom: Qualitative comparisons of unsupervised object segmentation of POD-Net with and without physics and with MONet on realistic block towers. MONet often groups two blocks of similar color (dark blue/green) together and sometimes misses particular blocks. POD-Net without physics reliably detects all blocks, but still groups similar blocks (dark blue/green) into one. POD-Net with physics detects all objects and assigns different masks to each. Standard error in parentheses.</text></toccaption>
        <caption class="ltx_centering" fontsize="90%"><tag close=": ">Figure 8</tag>Top: IoU of segmentation results on the real blocks dataset and the percentage of objects detected. Bottom: Qualitative comparisons of unsupervised object segmentation of POD-Net with and without physics and with MONet on realistic block towers. MONet often groups two blocks of similar color (dark blue/green) together and sometimes misses particular blocks. POD-Net without physics reliably detects all blocks, but still groups similar blocks (dark blue/green) into one. POD-Net with physics detects all objects and assigns different masks to each. Standard error in parentheses.</caption>
<!--  %****␣iou_cube_tbl_qual.tex␣Line␣25␣**** -->      </figure>
      <para xml:id="S4.SS2.p1">
        <p>Next, we evaluate how POD-Net segments and detects objects in real videos.
<!--  %on␣a␣realistic␣video␣dataset␣consisting␣of␣block␣towers.␣We␣explore␣the␣ability␣of␣our␣model␣to␣learn␣to␣segment␣objects␣and␣detect␣underlying␣primitives␣in␣the␣realistic␣scenario␣also. --></p>
      </para>
      <paragraph inlist="toc" xml:id="S4.SS2.SSS0.Px1">
        <title>Data.</title>
        <para xml:id="S4.SS2.SSS0.Px1.p1">
          <p>We use the dataset in <cite class="ltx_citemacro_citet"><bibref bibrefs="Lerer2016Learning" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
                <bibrefphrase>(</bibrefphrase>
                <bibrefphrase>)</bibrefphrase>
              </bibref></cite> with 492 videos of real block towers, which may or may not be falling. Each frame contains 2 to 4 blocks of red, yellow, or green color. Each block has the same 3D shape, although the 2D projections on the camera differ.</p>
        </para>
<!--  %****␣experiments.tex␣Line␣75␣**** -->      </paragraph>
      <paragraph inlist="toc" xml:id="S4.SS2.SSS0.Px2">
        <title>Setup.</title>
        <para xml:id="S4.SS2.SSS0.Px2.p1">
          <p>For our backprojection model, we use a pretrained neural network on scenes of a single block at different heights, sizes, and varying distances. Similar to Section <ref labelref="LABEL:sect:shapenet"/>, the backprojection model serves as a rough 2D to 3D model and is trained with different camera and perspective parameters without occlusion. All other settings are the same as in Section <ref labelref="LABEL:sect:shapenet"/>.</p>
        </para>
      </paragraph>
      <paragraph inlist="toc" xml:id="S4.SS2.SSS0.Px3">
        <title>Results.</title>
        <para xml:id="S4.SS2.SSS0.Px3.p1">
          <p>We compare masks discovered by POD-Net and baselines in Figure <ref labelref="LABEL:fig:cubes_qualitative"/>. We find that OP3 and MONet often miss blocks and also group two blocks into a single object, leading to floating blocks in the air (Figure <ref labelref="LABEL:fig:cubes_qualitative"/>, 2nd row). Normalized cuts suffers from a similar issue of grouping blocks, and additionally from oversegmentation (see Appendix <ref labelref="LABEL:app:nc"/>). UVOD fails to predict a segmentation due to limited motion in the videos. POD-Net (single scale, no physics) is able to segment all blocks, but treats the entire stack as a single object. POD-Net (multi-scale, no physics) does better and is able to reliably segment all blocks, though it still groups blocks of similar colors together (Figure <ref labelref="LABEL:fig:cubes_qualitative"/>, 3rd row). Finally, POD-Net with multiple scales and physical consistency performs the best, reliably separating individual blocks in a tower (Figure <ref labelref="LABEL:fig:cubes_qualitative"/>, 4th row).</p>
        </para>
<!--  %Qualitatively,␣\fig{fig:cubes_qualitative}␣shows␣a␣qualitative␣comparison␣between␣different␣methods.␣Without␣physical␣constancy,␣both␣MONet␣and␣\model␣multiscale␣tend␣to␣group␣dark␣green␣and␣blue␣cubes␣together␣into␣a␣single␣block,␣while␣\model␣multiscale␣with␣physical␣consistency␣is␣able␣to␣reliably␣separate␣all␣different␣blocks. 
     %\input{figText/iou_cubes.tex}
     %\input{figText/cubes_qualitative.tex}
     %\vspace{-5pt}-->      </paragraph>
    </subsection>
    <subsection inlist="toc" xml:id="S4.SS3">
      <tags>
        <tag>4.3</tag>
        <tag role="autoref">subsection 4.3</tag>
        <tag role="refnum">4.3</tag>
        <tag role="typerefnum">§4.3</tag>
      </tags>
      <title><tag close=" ">4.3</tag>Judging Physical Plausibility</title>
<!--  %We␣investigate␣the␣ability␣of␣our␣framework␣to␣estimate␣the␣physical␣plausibility␣of␣scenes␣using␣discovered␣3D␣objects␣in␣line␣with␣human␣judgement. -->      <para xml:id="S4.SS3.p1">
        <p>We test whether POD-Net can discover objects reliably enough to perform the physical violation detection task of <cite class="ltx_citemacro_citet"><bibref bibrefs="smith2019modeling" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
              <bibrefphrase>(</bibrefphrase>
              <bibrefphrase>)</bibrefphrase>
            </bibref></cite>, in which videos that have non-physical events (objects disappearing or teleporting) must be differentiated from plausible videos.</p>
      </para>
      <para xml:id="S4.SS3.p2">
        <p>We consider two separate tasks: the Overturn (Long) task, which consists of a plane overlaying an object, and the Block task, which consists of physical scenes with a solid wall and an object moving towards the wall, where it may either appear to hit the wall and stop or appear on the other side. To successfully perform the Overturn task, POD-Net must reason about object permanence, while to accomplish the Block task, the system must remember object states across a large number of timesteps and understand both spatial continuity and object permanence.
<!--  %\myparagraph{Data.}␣␣\citet{smith2019modeling}␣introduced␣a␣test␣set␣of␣videos␣representing␣common␣psychologically␣surprising␣scenes␣to␣humans. --></p>
      </para>
<!--  %Such␣scenes␣evaluate␣core␣object␣properties␣such␣as␣permanence␣(objects␣do␣not␣appear␣or␣disappear␣for␣no␣reason),␣continuity␣(objects␣move␣along␣connected␣trajectories),␣and␣solidity␣(objects␣can␣not␣move␣through␣each␣other).␣To␣test␣a␣combination␣of␣all␣these␣concepts,␣we␣evaluate␣how␣well␣a␣model␣with␣\model␣in␣the␣loop␣performs␣prediction␣on␣the␣␣‘Overturn␣(Long)’␣and␣‘Block’␣tasks␣in␣the␣ADEPT␣benchmark. -->      <paragraph inlist="toc" xml:id="S4.SS3.SSS0.Px1">
        <title>Setup.</title>
        <para xml:id="S4.SS3.SSS0.Px1.p1">
          <p>We use POD-Net trained in Section <ref labelref="LABEL:sect:shapenet"/> to obtain a set of physical objects (represented as cuboids) describing an underlying scene.
<!--  %Given␣a␣set␣of␣discovered␣physical␣objects␣(as␣parameterized␣by␣size,␣position,␣rotation),␣to␣compute␣the␣physical␣feasibility␣of␣a␣scene,␣we -->The extracted objects are provided as a scene description to the stochastic physics engine and particle filter described in <cite class="ltx_citemacro_citet"><bibref bibrefs="smith2019modeling" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
                <bibrefphrase>(</bibrefphrase>
                <bibrefphrase>)</bibrefphrase>
              </bibref></cite>. We evaluate our models using a relative accuracy metric <cite class="ltx_citemacro_citep">(<bibref bibrefs="Riochet2018IntPhys" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
                <bibrefphrase>, </bibrefphrase>
              </bibref>)</cite>.
<!--  %****␣experiments.tex␣Line␣100␣**** --></p>
        </para>
<!--  %:␣given␣$n$␣pairs␣of␣videos␣with␣surprising␣scenes␣$\textbf{x}^+$␣and␣control␣scenes␣$\textbf{x}^-$,␣we␣report␣the␣proportion␣of␣correctly␣ordered␣scene␣pairs␣such␣that␣the␣violation␣scene␣is␣judged␣more␣surprising␣than␣a␣matched␣control␣scene␣without␣a␣violation␣$\sum_{i,j}␣[c(\textbf{x}_i^+)␣&gt;␣c(\textbf{x}_j^-)]␣␣/␣n$.␣We␣evaluate␣our␣model␣on␣189␣scene␣pairs. 
     %We␣use␣a␣particle␣filter␣to␣maintain␣a␣set␣of␣beliefs␣states␣over␣the␣physical␣objects,␣and␣measure␣surprisal␣between␣current␣observations␣from␣\model␣with␣those␣in␣the␣belief␣state␣following~\citet{smith2019modeling}.-->        <figure inlist="lof" labels="LABEL:fig:surprisal" placement="t" xml:id="S4.F9">
          <tags>
            <tag><text fontsize="90%">Figure 9</text></tag>
            <tag role="autoref">Figure 9</tag>
            <tag role="refnum">9</tag>
            <tag role="typerefnum">Figure 9</tag>
          </tags>
          <inline-para align="center" class="ltx_minipage" vattach="middle" width="294.9pt">
            <para xml:id="S4.F9.p1">
              <graphics candidates="fig/surprisal.pdf" graphic="fig/surprisal.pdf" options="width=433.62pt" xml:id="S4.F9.p1.g1"/>
            </para>
          </inline-para>
          <toccaption class="ltx_centering"><tag close=" ">9</tag>Surprise over time in a ‘Block’ scene. POD-Net has relatively low surprisal throughout most of the video. But when the occluder falls and the object appears to ‘teleport’ across the wall, POD-Net recognizes this abnormal shift in position and becomes surprised.</toccaption>
          <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 9</text></tag><text fontsize="90%">Surprise over time in a ‘Block’ scene. POD-Net has relatively low surprisal throughout most of the video. But when the occluder falls and the object appears to ‘teleport’ across the wall, POD-Net recognizes this abnormal shift in position and becomes surprised.</text></caption>
        </figure>
      </paragraph>
      <paragraph inlist="toc" xml:id="S4.SS3.SSS0.Px2">
        <title>Results.</title>
        <para xml:id="S4.SS3.SSS0.Px2.p1">
          <p>On the Block task, we find that our model achieves a relative accuracy of 0.622. Its performance on a single video can be seen in Figure <ref labelref="LABEL:fig:surprisal"/>, where it has learned to localize the block well enough that the model is surprised when it appears on the other side of the wall. The model in <cite class="ltx_citemacro_citet"><bibref bibrefs="smith2019modeling" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
                <bibrefphrase>(</bibrefphrase>
                <bibrefphrase>)</bibrefphrase>
              </bibref></cite> scores a relative accuracy of 0.680; this acts as an upper bound for the performance of our model, since supervised training is used to discover the object masks and recover object properties. In contrast, POD-Net discovers 3D objects in an unsupervised manner, outperforming the baseline generative models studied by <cite class="ltx_citemacro_citet"><bibref bibrefs="smith2019modeling" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
                <bibrefphrase>(</bibrefphrase>
                <bibrefphrase>)</bibrefphrase>
              </bibref></cite> that do not encode biases for objecthood (GAN: 0.44, Encoder-Decoder: 0.52, LSTM: 0.44). On the Overturn (Long) task – the one task where the ADEPT model underperforms baselines – our model obtains a performance of 0.77, outperforming <cite class="ltx_citemacro_citet"><bibref bibrefs="smith2019modeling" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
                <bibrefphrase>(</bibrefphrase>
                <bibrefphrase>)</bibrefphrase>
              </bibref></cite> (0.73), and exceeding all but one of the models that do not encode biases for objects (GAN: 0.81, Encoder-Decoder: 0.61, LSTM: 0.63).</p>
        </para>
        <para xml:id="S4.SS3.SSS0.Px2.p2">
          <p>A limitation of our approach towards discovering 3D object primitives is that across a long video (over 100 timesteps), there may be several spurious extraneous objects discovered. The model in <cite class="ltx_citemacro_citet"><bibref bibrefs="smith2019modeling" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
                <bibrefphrase>(</bibrefphrase>
                <bibrefphrase>)</bibrefphrase>
              </bibref></cite> does not deal well with such spurious detections either, requiring us to tune separate hyper-parameters for each task.</p>
        </para>
<!--  %while␣a␣temporal␣GAN,␣Encoder-Decoder␣and␣LSTM␣obtain␣performances␣0.44,␣0.52,␣and␣0.44␣\citep{smith2019modeling}.␣The␣approach␣in␣\citet{smith2019modeling}␣can␣be␣seen␣as␣an␣upper␣bound␣for␣the␣performance␣of␣our␣model,␣since␣they␣use␣a␣segmentation␣model␣trained␣on␣supervised␣labels␣of␣masks␣of␣individual␣objects,␣as␣well␣as␣a␣projection␣labels␣trained␣on␣ground␣truth␣segmentation␣masks␣to␣underlying␣physical␣primitives.␣In␣contrast,␣we␣discover␣3D␣physical␣objects␣in␣an␣unsupervised␣manner. 
     %We␣provide␣a␣qualitative␣illustration␣of␣surprisal␣in␣\fig{fig:surprisal}.␣Our␣approach␣exhibits␣a␣large␣spike␣of␣surprisal␣when␣the␣cube␣unexpectedly␣appears␣across␣the␣wall.
     %****␣experiments.tex␣Line␣125␣****
     %\vspace{1mm}
     %\vspace{-5pt}-->      </paragraph>
    </subsection>
  </section>
  <section inlist="toc" xml:id="S5">
    <tags>
      <tag>5</tag>
      <tag role="autoref">section 5</tag>
      <tag role="refnum">5</tag>
      <tag role="typerefnum">§5</tag>
    </tags>
    <title><tag close=" ">5</tag>Conclusion</title>
<!--  %\vspace{-5pt} -->    <para xml:id="S5.p1">
      <p>We have proposed POD-Net, a model that discovers 3D physical objects from video using self-supervision. We show that by retaining principles of core knowledge in our architecture – that objects exist and move smoothly – and by factorizing object segmentation across sub-patches, we can learn to segment and discover objects in a generalizable fashion.
<!--  %advantages␣of␣using␣such␣a␣representation,␣such␣as␣being␣to␣enforce␣physical␣consistency,␣and␣show␣that␣it␣leads␣to␣better␣generalization␣in␣segmentation. -->We further show how these discovered objects can be utilized in downstream tasks to judge physical plausibility. We believe further exploration in this direction, such as integration of more flexible representation of physical dynamics <cite class="ltx_citemacro_citep">(<bibref bibrefs="mrowca2018flexible,sanchez2020physics" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, is a promising approach towards more robust object discovery and a richer understanding of the physical world around us.</p>
    </para>
    <paragraph inlist="toc" xml:id="S5.SS0.SSS0.Px1">
      <title>Acknowledgements.</title>
      <para xml:id="S5.SS0.SSS0.Px1.p1">
        <p>This work is in part supported by ONR MURI N00014-16-1-2007, the Center for Brains, Minds, and Machines (CBMM, funded by NSF STC award CCF-1231216), the Samsung Global Research Outreach (GRO) Program, Toyota Research Institute, and Autodesk. Yilun Du is supported in part by an NSF graduate research fellowship.</p>
      </para>
<!--  %\input{text/broader_impacts} -->    </paragraph>
  </section>
  <bibliography citestyle="authoryear" files="reference,object_motion" xml:id="bib">
    <title>References</title>
  </bibliography>
<!--  %****␣iclr2021_conference.tex␣Line␣75␣**** -->  <pagination role="newpage"/>
  <appendix inlist="toc" labels="LABEL:appendix" xml:id="A1">
    <tags>
      <tag>Appendix A</tag>
      <tag role="autoref">Appendix A</tag>
      <tag role="refnum">A</tag>
      <tag role="typerefnum">Appendix A</tag>
    </tags>
    <title><tag close=" ">Appendix A</tag>Appendix</title>
    <toctitle><tag close=" ">A</tag>Appendix</toctitle>
    <subsection inlist="toc" labels="LABEL:app:nc" xml:id="A1.SS1">
      <tags>
        <tag>A.1</tag>
        <tag role="autoref">subsection A.1</tag>
        <tag role="refnum">A.1</tag>
        <tag role="typerefnum">§A.1</tag>
      </tags>
      <title><tag close=" ">A.1</tag>Normalized Cut/ Crisp Boundary Detection Qualitative Visualization</title>
      <para xml:id="A1.SS1.p1">
        <p>We provide visualizations of segmentations using normalized cuts in Figure <ref labelref="LABEL:fig:normalized_cut"/>. Normalized cuts relies on local color similarity and spatial locality to determine segments of objects. Since our objects are relatively similar in color, it is able to segment the rough shape of objects. However, compared to POD-Net, normalized cuts results in significantly less sharp boundaries around each shape and leads to over-segmentation of an individual object into multiple separate objects. This is due to the fact that shapes still exhibit variations in object color from lighting that cause normalized cuts to incorrectly segment the object.</p>
      </para>
      <figure inlist="lof" labels="LABEL:fig:normalized_cut" placement="H" xml:id="A1.F1">
        <tags>
          <tag><text fontsize="90%">Figure A1</text></tag>
          <tag role="autoref">Figure A1</tag>
          <tag role="refnum">A1</tag>
          <tag role="typerefnum">Figure A1</tag>
        </tags>
<!--  %\vspace{-6mm} -->        <graphics candidates="fig/adept_normalized.pdf" graphic="fig/adept_normalized.pdf" options="width=433.62pt" xml:id="A1.F1.g1"/>
<!--  %\vspace{-15pt} -->        <toccaption><tag close=" ">A1</tag>Illustration of segmentation using normalized cuts. Normalized cuts often over-segments objects and sometimes misses segmentation of small objects.</toccaption>
        <caption><tag close=": "><text fontsize="90%">Figure A1</text></tag><text fontsize="90%">Illustration of segmentation using normalized cuts. Normalized cuts often over-segments objects and sometimes misses segmentation of small objects.</text></caption>
      </figure>
      <para xml:id="A1.SS1.p2">
        <p>We further provide visualizations of segmentations using crisp boundary detection in Figure <ref labelref="LABEL:fig:crisp_boundary"/>. Crisp boundary detection detects the set of edges that determine an object. This enables the approach to segment large objects in a scene, where edges are not ambiguous, but fails to accurately segment smaller objects in a scene, which have more ambiguous edges. Furthermore, detected edges are also sensitive to the lighting of an object, sometimes segmenting an object into multiple separate pieces.</p>
      </para>
      <figure inlist="lof" labels="LABEL:fig:crisp_boundary" placement="H" xml:id="A1.F2">
        <tags>
          <tag><text fontsize="90%">Figure A2</text></tag>
          <tag role="autoref">Figure A2</tag>
          <tag role="refnum">A2</tag>
          <tag role="typerefnum">Figure A2</tag>
        </tags>
<!--  %\vspace{-6mm} -->        <graphics candidates="fig/crisp_boundary.pdf" graphic="fig/crisp_boundary.pdf" options="width=433.62pt" xml:id="A1.F2.g1"/>
<!--  %\vspace{-15pt} -->        <toccaption><tag close=" ">A2</tag>Illustration of segmentation using crisp boundary detection. Crisp boundary detection is able to segment large objects in a scene, but fails to accurately segment smaller objects in a scene, and sometimes segments objects into multiple small pieces.</toccaption>
        <caption><tag close=": "><text fontsize="90%">Figure A2</text></tag><text fontsize="90%">Illustration of segmentation using crisp boundary detection. Crisp boundary detection is able to segment large objects in a scene, but fails to accurately segment smaller objects in a scene, and sometimes segments objects into multiple small pieces.</text></caption>
      </figure>
<!--  %\subsection{Scatter␣Plot␣of␣Predicted␣3D␣Primitives} 
     %\input{figText/both_3d}
     %We␣provide␣scatter␣plots␣of␣inferred␣and␣ground␣truth␣3D␣scale␣and␣translations␣on␣the␣Moving␣ShapeNet␣dataset␣in␣\fig{fig:scatter_3d}.␣\model␣is␣able␣infer␣3D␣primitive␣translations␣and␣scales␣with␣correlation␣with␣ground␣truth␣3d␣primitives␣scale␣and␣translation.-->    </subsection>
    <subsection inlist="toc" labels="LABEL:app:details" xml:id="A1.SS2">
      <tags>
        <tag>A.2</tag>
        <tag role="autoref">subsection A.2</tag>
        <tag role="refnum">A.2</tag>
        <tag role="typerefnum">§A.2</tag>
      </tags>
      <title><tag close=" ">A.2</tag>Details on Manually Designed Projection Modules</title>
      <para xml:id="A1.SS2.p1">
        <p>To manually design a backprojection and projection model, we assume each physical object <Math mode="inline" tex="({\bm{t}}_{k},{\bm{s}}_{k},{\bm{q}}_{k})" text="vector@(t _ k, s _ k, q _ k)" xml:id="A1.SS2.p1.m1">
            <XMath>
              <XMDual>
                <XMApp>
                  <XMTok meaning="vector"/>
                  <XMRef idref="A1.SS2.p1.m1.1"/>
                  <XMRef idref="A1.SS2.p1.m1.2"/>
                  <XMRef idref="A1.SS2.p1.m1.3"/>
                </XMApp>
                <XMWrap>
                  <XMTok role="OPEN" stretchy="false">(</XMTok>
                  <XMApp xml:id="A1.SS2.p1.m1.1">
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                  </XMApp>
                  <XMTok role="PUNCT">,</XMTok>
                  <XMApp xml:id="A1.SS2.p1.m1.2">
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="bold italic" role="UNKNOWN">s</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                  </XMApp>
                  <XMTok role="PUNCT">,</XMTok>
                  <XMApp xml:id="A1.SS2.p1.m1.3">
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="bold italic" role="UNKNOWN">q</XMTok>
                    <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                  </XMApp>
                  <XMTok role="CLOSE" stretchy="false">)</XMTok>
                </XMWrap>
              </XMDual>
            </XMath>
          </Math> has a fixed rotation <Math mode="inline" tex="q_{k}" text="q _ k" xml:id="A1.SS2.p1.m2">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">q</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
              </XMApp>
            </XMath>
          </Math> and z-axis length <Math mode="inline" tex="s_{zk}" text="s _ (z * k)" xml:id="A1.SS2.p1.m3">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">s</XMTok>
                <XMApp>
                  <XMTok meaning="times" role="MULOP">⁢</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">z</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
                </XMApp>
              </XMApp>
            </XMath>
          </Math>, and regress other size and position parameters. For our backprojection model, given a segmentation mask <Math mode="inline" tex="{\bm{m}}_{k}" text="m _ k" xml:id="A1.SS2.p1.m4">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="bold italic" role="UNKNOWN">m</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
              </XMApp>
            </XMath>
          </Math> of our object, we determine the 2D bounds of the mask <Math mode="inline" tex="x_{\min}" text="x _ minimum" xml:id="A1.SS2.p1.m5">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">x</XMTok>
                <XMTok fontsize="70%" meaning="minimum" role="OPFUNCTION" scriptpos="post">min</XMTok>
              </XMApp>
            </XMath>
          </Math>, <Math mode="inline" tex="x_{\max}" text="x _ maximum" xml:id="A1.SS2.p1.m6">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">x</XMTok>
                <XMTok fontsize="70%" meaning="maximum" role="OPFUNCTION" scriptpos="post">max</XMTok>
              </XMApp>
            </XMath>
          </Math>, <Math mode="inline" tex="y_{\min}" text="y _ minimum" xml:id="A1.SS2.p1.m7">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">y</XMTok>
                <XMTok fontsize="70%" meaning="minimum" role="OPFUNCTION" scriptpos="post">min</XMTok>
              </XMApp>
            </XMath>
          </Math>, <Math mode="inline" tex="y_{\max}" text="y _ maximum" xml:id="A1.SS2.p1.m8">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="italic" role="UNKNOWN">y</XMTok>
                <XMTok fontsize="70%" meaning="maximum" role="OPFUNCTION" scriptpos="post">max</XMTok>
              </XMApp>
            </XMath>
          </Math> in a differentiable manner, by taking the segmentation-mask-weighted mean of boundary pixels (as defined by the 200 most extreme pixels in each direction). We then set <Math mode="inline" tex="{\bm{t}}_{z}=1+\alpha y_{\min}" text="t _ z = 1 + alpha * y _ minimum" xml:id="A1.SS2.p1.m9">
            <XMath>
              <XMApp>
                <XMTok meaning="equals" role="RELOP">=</XMTok>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                  <XMTok font="italic" fontsize="70%" role="UNKNOWN">z</XMTok>
                </XMApp>
                <XMApp>
                  <XMTok meaning="plus" role="ADDOP">+</XMTok>
                  <XMTok meaning="1" role="NUMBER">1</XMTok>
                  <XMApp>
                    <XMTok meaning="times" role="MULOP">⁢</XMTok>
                    <XMTok font="italic" name="alpha" role="UNKNOWN">α</XMTok>
                    <XMApp>
                      <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                      <XMTok font="italic" role="UNKNOWN">y</XMTok>
                      <XMTok fontsize="70%" meaning="minimum" role="OPFUNCTION" scriptpos="post">min</XMTok>
                    </XMApp>
                  </XMApp>
                </XMApp>
              </XMApp>
            </XMath>
          </Math>, corresponding to the assumption that higher segmentation masks correspond to further away objects. We compute the remaining coordinates of <Math mode="inline" tex="{\bm{t}}_{k}" text="t _ k" xml:id="A1.SS2.p1.m10">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="bold italic" role="UNKNOWN">t</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
              </XMApp>
            </XMath>
          </Math> and <Math mode="inline" tex="{\bm{s}}_{k}" text="s _ k" xml:id="A1.SS2.p1.m11">
            <XMath>
              <XMApp>
                <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                <XMTok font="bold italic" role="UNKNOWN">s</XMTok>
                <XMTok font="italic" fontsize="70%" role="UNKNOWN">k</XMTok>
              </XMApp>
            </XMath>
          </Math> by inverting a camera intrinsic matrix on the 2D bounds of the mask and setting the center of an object to be halfway between the extremes of the resultant coordinates. We note that our backprojection model is fully differentiable. For our projection model, we utilize the camera matrix to explicitly project inferred primitives back to the segmentation mask.</p>
      </para>
    </subsection>
    <subsection inlist="toc" xml:id="A1.SS3">
      <tags>
        <tag>A.3</tag>
        <tag role="autoref">subsection A.3</tag>
        <tag role="refnum">A.3</tag>
        <tag role="typerefnum">§A.3</tag>
      </tags>
      <title><tag close=" ">A.3</tag>Per Category Segmentation Performance</title>
<!--  %****␣appendix.tex␣Line␣25␣**** -->      <para xml:id="A1.SS3.p1">
        <p>We report per category segmentation performance on ShapeNet objects below:
pillow: 0.554, mug: 0.450, rocket: 0.347, earphone: 0.643, computer keyboard: 0.696, bus: 0.694, camera: 0.725, bowl: 0.565, bookshelf: 0.648, stove: 0.587, birdhouse: 0.544, wine bottle: 0.695, bench: 0.404, microwave: 0.314, lamp: 0.300, pistol: 0.566, chair: 0.465, cabinet: 0.646, bag: 0.602, rifle: 0.497, file: 0.467, faucet: 0.367, car: 0.594, bathtub: 0.500, microphone: 0.530, ashcan: 0.729, basket: 0.752, knife: 0.676, mailbox: 0.578, table: 0.565, printer: 0.660, cap: 0.559, sofa: 0.404, vessel: 0.404, display: 0.771, loudspeaker: 0.646, bicycle: 0.598, remote: 0.720, helmet: 0.368, train: 0.567, telephone: 0.601, jar: 0.663, piano: 0.734, washer: 0.358.</p>
      </para>
      <para xml:id="A1.SS3.p2">
        <p>We find that despite having an internal physics representation of a cube, there is relatively little correlation between well-segmented objects and cubeness, with POD-Net performing well on non-cuboid classes such as piano while performing poorly on cuboid classes such as washer. Our physics loss instead encourages segmented objects across time to translate uniformly as well as maintain size.</p>
      </para>
    </subsection>
    <subsection inlist="toc" xml:id="A1.SS4">
      <tags>
        <tag>A.4</tag>
        <tag role="autoref">subsection A.4</tag>
        <tag role="refnum">A.4</tag>
        <tag role="typerefnum">§A.4</tag>
      </tags>
      <title><tag close=" ">A.4</tag>Model Architecture</title>
      <para xml:id="A1.SS4.p1">
        <p>We detail our attention model in Table <ref labelref="LABEL:fig:attention"/> and our component VAE model in Table <ref labelref="LABEL:fig:vae"/>. In contrast to <cite class="ltx_citemacro_citet"><bibref bibrefs="burgess2019monet" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
              <bibrefphrase>(</bibrefphrase>
              <bibrefphrase>)</bibrefphrase>
            </bibref></cite>, we use a residual architecture for both attention and component VAE networks, with up-sampling of the spatial broadcast layer.</p>
      </para>
      <table inlist="lot" labels="LABEL:fig:architecture" placement="H" xml:id="A1.T1">
        <tags>
          <tag><text fontsize="90%">Table A1</text></tag>
          <tag role="autoref">Table A1</tag>
          <tag role="refnum">A1</tag>
          <tag role="typerefnum">Table A1</tag>
        </tags>
        <figure inlist="lof" labels="LABEL:fig:attention" placement="t" xml:id="A1.F2.sf1">
          <tags>
            <tag><text fontsize="90%">(a)</text></tag>
            <tag role="autoref">(a)</tag>
            <tag role="refnum">2(a)</tag>
          </tags>
          <tabular class="ltx_centering ltx_guessed_headers" vattach="middle">
            <thead>
              <tr>
                <td align="center" border="tt tt" thead="column">7x7 Conv2D, 32</td>
              </tr>
            </thead>
            <tbody>
              <tr>
                <td align="center" border="t">BatchNorm</td>
              </tr>
              <tr>
                <td align="center" border="t">3x3 Max Pool (Stride 2)</td>
              </tr>
              <tr>
                <td align="center" border="t">ResBlock Down 32</td>
              </tr>
              <tr>
                <td align="center" border="t">ResBlock Down 64</td>
              </tr>
              <tr>
                <td align="center" border="t">ResBlock Down 128</td>
              </tr>
              <tr>
                <td align="center" border="t">ResBlock Up 256</td>
              </tr>
              <tr>
                <td align="center" border="t">ResBlock Up 128</td>
              </tr>
              <tr>
                <td align="center" border="t">ResBlock Up 64</td>
              </tr>
              <tr>
                <td align="center" border="t">ResBlock Up 32</td>
              </tr>
              <tr>
                <td align="center" border="t">ResBlock Up 32</td>
              </tr>
              <tr>
                <td align="center" border="bb t">3x3 Conv2D, Output Channels</td>
              </tr>
            </tbody>
          </tabular>
          <toccaption class="ltx_centering"><tag close=" ">(a)</tag>Attention Model (<Math mode="inline" tex="\alpha_{\psi}" text="alpha _ psi" xml:id="A1.F2.sf1.m1">
              <XMath>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" name="alpha" role="UNKNOWN">α</XMTok>
                  <XMTok font="italic" fontsize="70%" name="psi" role="UNKNOWN">ψ</XMTok>
                </XMApp>
              </XMath>
            </Math>)</toccaption>
          <caption class="ltx_centering"><tag close=" "><text fontsize="90%">(a)</text></tag><text fontsize="90%">Attention Model (<Math mode="inline" tex="\alpha_{\psi}" text="alpha _ psi" xml:id="A1.F2.sf1.m2">
                <XMath>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="italic" name="alpha" role="UNKNOWN">α</XMTok>
                    <XMTok font="italic" fontsize="70%" name="psi" role="UNKNOWN">ψ</XMTok>
                  </XMApp>
                </XMath>
              </Math>)</text></caption>
        </figure>
        <figure inlist="lof" labels="LABEL:fig:vae" placement="t" xml:id="A1.F2.sf2">
          <tags>
            <tag><text fontsize="90%">(b)</text></tag>
            <tag role="autoref">(b)</tag>
            <tag role="refnum">2(b)</tag>
          </tags>
          <tabular class="ltx_centering" vattach="middle">
            <tbody>
              <tr>
                <td align="center" border="tt tt">7x7 Conv2D, 32</td>
              </tr>
              <tr>
                <td align="center" border="t">BatchNorm</td>
              </tr>
              <tr>
                <td align="center" border="t">3x3 Max Pool (Stride 2)</td>
              </tr>
              <tr>
                <td align="center" border="t">ResBlock Down 16</td>
              </tr>
              <tr>
                <td align="center" border="t">ResBlock Down 32</td>
              </tr>
              <tr>
                <td align="center" border="t">ResBlock Down 64</td>
              </tr>
              <tr>
                <td align="center" border="t">Global Average Pool</td>
              </tr>
              <tr>
                <td align="center" border="t">Dense <Math mode="inline" tex="\rightarrow" text="rightarrow" xml:id="A1.F2.sf2.m1">
                    <XMath>
                      <XMTok name="rightarrow" role="ARROW">→</XMTok>
                    </XMath>
                  </Math> 256</td>
              </tr>
              <tr>
                <td align="center" border="t">256 <Math mode="inline" tex="\rightarrow" text="rightarrow" xml:id="A1.F2.sf2.m2">
                    <XMath>
                      <XMTok name="rightarrow" role="ARROW">→</XMTok>
                    </XMath>
                  </Math> 32 (<Math mode="inline" tex="\mu" text="mu" xml:id="A1.F2.sf2.m3">
                    <XMath>
                      <XMTok font="italic" name="mu" role="UNKNOWN">μ</XMTok>
                    </XMath>
                  </Math>, <Math mode="inline" tex="\sigma" text="sigma" xml:id="A1.F2.sf2.m4">
                    <XMath>
                      <XMTok font="italic" name="sigma" role="UNKNOWN">σ</XMTok>
                    </XMath>
                  </Math>)</td>
              </tr>
              <tr>
                <td align="center" border="t">z <Math mode="inline" tex="\leftarrow" text="leftarrow" xml:id="A1.F2.sf2.m5">
                    <XMath>
                      <XMTok name="leftarrow" role="ARROW">←</XMTok>
                    </XMath>
                  </Math> <Math mode="inline" tex="\mathcal{N}" text="N" xml:id="A1.F2.sf2.m6">
                    <XMath>
                      <XMTok font="caligraphic" role="UNKNOWN">N</XMTok>
                    </XMath>
                  </Math>(<Math mode="inline" tex="\mu" text="mu" xml:id="A1.F2.sf2.m7">
                    <XMath>
                      <XMTok font="italic" name="mu" role="UNKNOWN">μ</XMTok>
                    </XMath>
                  </Math>, <Math mode="inline" tex="\sigma" text="sigma" xml:id="A1.F2.sf2.m8">
                    <XMath>
                      <XMTok font="italic" name="sigma" role="UNKNOWN">σ</XMTok>
                    </XMath>
                  </Math>)</td>
              </tr>
              <tr>
                <td align="center" border="t">Spatial Broadcast <Math mode="inline" tex="z" text="z" xml:id="A1.F2.sf2.m9">
                    <XMath>
                      <XMTok font="italic" role="UNKNOWN">z</XMTok>
                    </XMath>
                  </Math> (8x)</td>
              </tr>
              <tr>
                <td align="center" border="t">3x3 Conv2D, 256</td>
              </tr>
              <tr>
                <td align="center" border="t">ResBlock Up 128</td>
              </tr>
              <tr>
                <td align="center" border="t">ResBlock Up 64</td>
              </tr>
              <tr>
                <td align="center" border="t">ResBlock Up 32</td>
              </tr>
              <tr>
                <td align="center" border="t">ResBlock Up 16</td>
              </tr>
              <tr>
                <td align="center" border="t">ResBlock Up 16</td>
              </tr>
              <tr>
                <td align="center" border="bb t">3x3 Conv2D, Output Channels</td>
              </tr>
            </tbody>
          </tabular>
          <toccaption class="ltx_centering"><tag close=" ">(b)</tag>VAE Component Model. (<Math mode="inline" tex="q_{\phi}" text="q _ phi" xml:id="A1.F2.sf2.m10">
              <XMath>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">q</XMTok>
                  <XMTok font="italic" fontsize="70%" name="phi" role="UNKNOWN">ϕ</XMTok>
                </XMApp>
              </XMath>
            </Math>, <Math mode="inline" tex="p_{\theta}" text="p _ theta" xml:id="A1.F2.sf2.m11">
              <XMath>
                <XMApp>
                  <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                  <XMTok font="italic" role="UNKNOWN">p</XMTok>
                  <XMTok font="italic" fontsize="70%" name="theta" role="UNKNOWN">θ</XMTok>
                </XMApp>
              </XMath>
            </Math>) </toccaption>
          <caption class="ltx_centering"><tag close=" "><text fontsize="90%">(b)</text></tag><text fontsize="90%">VAE Component Model. (<Math mode="inline" tex="q_{\phi}" text="q _ phi" xml:id="A1.F2.sf2.m12">
                <XMath>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="italic" role="UNKNOWN">q</XMTok>
                    <XMTok font="italic" fontsize="70%" name="phi" role="UNKNOWN">ϕ</XMTok>
                  </XMApp>
                </XMath>
              </Math>, <Math mode="inline" tex="p_{\theta}" text="p _ theta" xml:id="A1.F2.sf2.m13">
                <XMath>
                  <XMApp>
                    <XMTok role="SUBSCRIPTOP" scriptpos="post1"/>
                    <XMTok font="italic" role="UNKNOWN">p</XMTok>
                    <XMTok font="italic" fontsize="70%" name="theta" role="UNKNOWN">θ</XMTok>
                  </XMApp>
                </XMath>
              </Math>) </text></caption>
        </figure>
        <toccaption><tag close=" ">A1</tag>Overall Model Architectures used in POD-Net </toccaption>
        <caption><tag close=": "><text fontsize="90%">Table A1</text></tag><text fontsize="90%">Overall Model Architectures used in POD-Net </text></caption>
      </table>
      <para xml:id="A1.SS4.p2">
        <p>We detail the architecture of our Backprojection and Projection Models in Table <ref labelref="LABEL:fig:proj"/>.</p>
      </para>
      <table inlist="lot" labels="LABEL:fig:proj" placement="H" xml:id="A1.T2">
        <tags>
          <tag><text fontsize="90%">Table A2</text></tag>
          <tag role="autoref">Table A2</tag>
          <tag role="refnum">A2</tag>
          <tag role="typerefnum">Table A2</tag>
        </tags>
        <figure inlist="lof" labels="LABEL:fig:unproject" placement="t" xml:id="A1.F2.sf3">
          <tags>
            <tag><text fontsize="90%">(c)</text></tag>
            <tag role="autoref">(c)</tag>
            <tag role="refnum">2(c)</tag>
          </tags>
<!--  %****␣appendix.tex␣Line␣125␣**** -->          <tabular class="ltx_centering" vattach="middle">
            <tbody>
              <tr>
                <td align="center" border="tt tt">ResNet 18</td>
              </tr>
              <tr>
                <td align="center" border="bb t">Dense <Math mode="inline" tex="\rightarrow" text="rightarrow" xml:id="A1.F2.sf3.m1">
                    <XMath>
                      <XMTok name="rightarrow" role="ARROW">→</XMTok>
                    </XMath>
                  </Math> 7</td>
              </tr>
            </tbody>
          </tabular>
          <toccaption class="ltx_centering"><tag close=" ">(c)</tag>Architecture of Backprojection Model</toccaption>
          <caption class="ltx_centering"><tag close=" "><text fontsize="90%">(c)</text></tag><text fontsize="90%">Architecture of Backprojection Model</text></caption>
        </figure>
        <figure inlist="lof" labels="LABEL:fig:project" placement="t" xml:id="A1.F2.sf4">
          <tags>
            <tag><text fontsize="90%">(d)</text></tag>
            <tag role="autoref">(d)</tag>
            <tag role="refnum">2(d)</tag>
          </tags>
          <tabular class="ltx_centering ltx_guessed_headers" vattach="middle">
            <thead>
              <tr>
                <td align="center" border="tt tt" thead="column">Dense <Math mode="inline" tex="\rightarrow" text="rightarrow" xml:id="A1.F2.sf4.m1">
                    <XMath>
                      <XMTok name="rightarrow" role="ARROW">→</XMTok>
                    </XMath>
                  </Math> 512</td>
              </tr>
            </thead>
            <tbody>
              <tr>
                <td align="center" border="t">512 <Math mode="inline" tex="\rightarrow" text="rightarrow" xml:id="A1.F2.sf4.m2">
                    <XMath>
                      <XMTok name="rightarrow" role="ARROW">→</XMTok>
                    </XMath>
                  </Math> 1024</td>
              </tr>
              <tr>
                <td align="center" border="t">View 64 x 4 x 4</td>
              </tr>
              <tr>
                <td align="center" border="t">ResBlock Up 64</td>
              </tr>
              <tr>
                <td align="center" border="t">ResBlock Up 64</td>
              </tr>
              <tr>
                <td align="center" border="t">ResBlock Up 64</td>
              </tr>
              <tr>
                <td align="center" border="t">ResBlock Up 64</td>
              </tr>
              <tr>
                <td align="center" border="t">ResBlock Up 32</td>
              </tr>
              <tr>
                <td align="center" border="bb t">3x3 Conv2D, Output Channels</td>
              </tr>
            </tbody>
          </tabular>
          <toccaption class="ltx_centering"><tag close=" ">(d)</tag>Architecture of Projection Model</toccaption>
          <caption class="ltx_centering"><tag close=" "><text fontsize="90%">(d)</text></tag><text fontsize="90%">Architecture of Projection Model</text></caption>
        </figure>
        <toccaption><tag close=" ">A2</tag>Backprojection and Projection Model Architectures used in POD-Net </toccaption>
        <caption><tag close=": "><text fontsize="90%">Table A2</text></tag><text fontsize="90%">Backprojection and Projection Model Architectures used in POD-Net </text></caption>
      </table>
<!--  %\subsection{Source␣Code} 
     %We␣attach␣anonymous␣source␣code␣used␣to␣train␣models␣in␣the␣CMT␣submission␣portal.-->    </subsection>
    <subsection inlist="toc" labels="LABEL:lastpage" xml:id="A1.SS5">
      <tags>
        <tag>A.5</tag>
        <tag role="autoref">subsection A.5</tag>
        <tag role="refnum">A.5</tag>
        <tag role="typerefnum">§A.5</tag>
      </tags>
      <title><tag close=" ">A.5</tag>Comparison on Partially Occluded Objects</title>
<!--  %****␣appendix.tex␣Line␣175␣**** -->      <para xml:id="A1.SS5.p1">
        <p>We further explicitly compare the performance of POD-Net on segmenting objects that occlude each other. We evaluate on the ADEPT dataset, but only consider objects whose bounding boxes intersect. We find that on this subset of objects, POD-Net (multi-scale, physics) obtains a detection rate of 0.734 with an average IoU of 0.701, while POD-Net (multi-scale, no physics) obtains a detection rate of 0.601 (IoU threshold 0.5) with an average IoU of 0.576. This indicates that our approach of incorporating physics is able to learn to effectively separate objects that partially occlude each other.

<!--  %\appendix 
     %\section{Appendix}
     %You␣may␣include␣other␣additional␣sections␣here.--></p>
      </para>
    </subsection>
  </appendix>
</document>
