<?xml version="1.0" encoding="UTF-8"?>
<?latexml searchpaths="/home/japhy/scienceReplication.artiswrong.com/paper_files/arxiv/2407.17773/latex_extracted"?>
<?latexml class="article"?>
<?latexml package="iclr2025_conference,times"?>
<?latexml package="amsmath,amsfonts,bm"?>
<!--  %**** math˙commands.tex Line 25 **** --><!--  %**** math˙commands.tex Line 50 **** --><!--  %**** math˙commands.tex Line 75 **** --><!--  %**** math˙commands.tex Line 100 **** --><!--  %**** math˙commands.tex Line 125 **** --><!--  %**** math˙commands.tex Line 150 **** --><!--  %**** math˙commands.tex Line 175 **** --><!--  %**** math˙commands.tex Line 200 **** --><!--  %**** math˙commands.tex Line 225 **** --><!--  %**** math˙commands.tex Line 250 **** --><!--  %**** math˙commands.tex Line 275 **** --><!--  %**** math˙commands.tex Line 300 **** --><!--  %**** math˙commands.tex Line 325 **** --><!--  %**** math˙commands.tex Line 350 **** --><!--  %**** math˙commands.tex Line 375 **** --><!--  %**** math˙commands.tex Line 400 **** --><!--  %**** math˙commands.tex Line 425 **** --><!--  %**** math˙commands.tex Line 450 **** --><?latexml package="color,xcolor"?>
<?latexml package="epsfig"?>
<?latexml package="graphicx"?>
<?latexml package="adjustbox"?>
<?latexml package="array"?>
<?latexml package="booktabs"?>
<?latexml package="colortbl"?>
<?latexml package="wrapfig"?>
<?latexml package="hhline"?>
<?latexml package="multirow"?>
<?latexml package="inputenc" options="utf8"?>
<?latexml package="fontenc" options="T1"?>
<?latexml package="amsmath,amsfonts,amssymb"?>
<?latexml package="bm"?>
<?latexml package="nicefrac"?>
<?latexml package="microtype"?>
<?latexml package="mathtools"?>
<?latexml package="changepage"?>
<?latexml package="extramarks"?>
<?latexml package="fancyhdr"?>
<?latexml package="lastpage"?>
<?latexml package="setspace"?>
<?latexml package="soul"?>
<?latexml package="xspace"?>
<?latexml package="hyperref" options="pagebackref=true,breaklinks=true,colorlinks,citecolor=gray"?>
<?latexml package="url"?>
<?latexml package="enumerate"?>
<?latexml package="enumitem"?>
<?latexml package="makecell"?>
<?latexml package="pifont"?>
<?latexml package="algorithm,algpseudocode"?>
<?latexml package="amsthm"?>
<?latexml package="float"?>
<?latexml package="footmisc" options="bottom"?>
<!--  %****␣macros.tex␣Line␣25␣**** --><!--  %****␣macros.tex␣Line␣50␣**** --><!--  %****␣macros.tex␣Line␣75␣**** --><?latexml package="soul"?>
<!--  %****␣macros.tex␣Line␣100␣**** --><?latexml package="multirow"?>
<?latexml package="subcaption"?>
<?latexml package="wrapfig"?>
<?latexml package="duckuments"?>
<?latexml package="graphicx"?>
<?latexml package="tikz"?>
<?latexml package="microtype"?>
<!--  %****␣iclr2025_conference.tex␣Line␣25␣**** --><!--  %****␣iclr2025_conference.tex␣Line␣50␣**** --><?latexml RelaxNGSchema="LaTeXML"?>
<document xmlns="http://dlmf.nist.gov/LaTeXML" class="ltx_authors_1line">
  <resource src="LaTeXML.css" type="text/css"/>
  <resource src="ltx-article.css" type="text/css"/>
  <title>KiVA: Kid-inspired Visual Analogies <break/>for Testing Large Multimodal Models</title>
  <creator role="author">
    <personname>Eunice Yiu<Math mode="inline" tex="{}^{1}" text="^1" xml:id="m1">
        <XMath>
          <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
            <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
          </XMApp>
        </XMath>
      </Math>
 Maan Qraitem<Math mode="inline" tex="{}^{2}" text="^2" xml:id="m2">
        <XMath>
          <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
            <XMTok fontsize="70%" meaning="2" role="NUMBER">2</XMTok>
          </XMApp>
        </XMath>
      </Math>
 Anisa Noor Majhi<Math mode="inline" tex="{}^{1}" text="^1" xml:id="m3">
        <XMath>
          <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
            <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
          </XMApp>
        </XMath>
      </Math>
 Charlie Wong<Math mode="inline" tex="{}^{1}" text="^1" xml:id="m4">
        <XMath>
          <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
            <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
          </XMApp>
        </XMath>
      </Math>
 Yutong Bai<Math mode="inline" tex="{}^{1}" text="^1" xml:id="m5">
        <XMath>
          <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
            <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
          </XMApp>
        </XMath>
      </Math> <break/><text font="bold">Shiry Ginosar<Math mode="inline" tex="{}^{3,4}" text="^list@(3, 4)" xml:id="m6">
          <XMath>
            <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
              <XMDual>
                <XMApp>
                  <XMTok meaning="list"/>
                  <XMRef idref="m6.1"/>
                  <XMRef idref="m6.2"/>
                </XMApp>
                <XMWrap>
                  <XMTok font="medium" fontsize="70%" meaning="3" role="NUMBER" xml:id="m6.1">3</XMTok>
                  <XMTok font="medium" fontsize="70%" role="PUNCT">,</XMTok>
                  <XMTok font="medium" fontsize="70%" meaning="4" role="NUMBER" xml:id="m6.2">4</XMTok>
                </XMWrap>
              </XMDual>
            </XMApp>
          </XMath>
        </Math></text>
 <text font="bold">Alison Gopnik<Math mode="inline" tex="{}^{1}" text="^1" xml:id="m7">
          <XMath>
            <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
              <XMTok font="medium" fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
            </XMApp>
          </XMath>
        </Math></text>
 <text font="bold">Kate Saenko<Math mode="inline" tex="{}^{2}" text="^2" xml:id="m8">
          <XMath>
            <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
              <XMTok font="medium" fontsize="70%" meaning="2" role="NUMBER">2</XMTok>
            </XMApp>
          </XMath>
        </Math></text> <break/><Math mode="inline" tex="{}^{1}" text="^1" xml:id="m9">
        <XMath>
          <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
            <XMTok fontsize="70%" meaning="1" role="NUMBER">1</XMTok>
          </XMApp>
        </XMath>
      </Math> University of California, Berkeley
 <Math mode="inline" tex="{}^{2}" text="^2" xml:id="m10">
        <XMath>
          <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
            <XMTok fontsize="70%" meaning="2" role="NUMBER">2</XMTok>
          </XMApp>
        </XMath>
      </Math> Boston University
 <Math mode="inline" tex="{}^{3}" text="^3" xml:id="m11">
        <XMath>
          <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
            <XMTok fontsize="70%" meaning="3" role="NUMBER">3</XMTok>
          </XMApp>
        </XMath>
      </Math> Google DeepMind<break/><Math mode="inline" tex="{}^{4}" text="^4" xml:id="m12">
        <XMath>
          <XMApp role="FLOATSUPERSCRIPT" scriptpos="1">
            <XMTok fontsize="70%" meaning="4" role="NUMBER">4</XMTok>
          </XMApp>
        </XMath>
      </Math> Toyota Technological Institute at Chicago
</personname>
  </creator>
  <abstract name="Abstract">
    <p>This paper investigates visual analogical reasoning in large multimodal models (LMMs) compared to human adults and children. A “visual analogy” is an abstract rule inferred from one image and applied to another.
While benchmarks exist for testing visual reasoning in LMMs, they require advanced skills and omit basic visual analogies that even young children can make. Inspired by developmental psychology, we propose a new benchmark of 4,300 visual transformations of everyday objects to test LMMs on visual analogical reasoning and compare them to children (ages three to five) and to adults. We structure the evaluation into three stages: identifying <text font="italic">what</text> changed (e.g., color, number, etc.), <text font="italic">how</text> it changed (e.g., added one object), and <text font="italic">applying the rule</text> to new scenarios. Our findings show that while GPT-o1, GPT-4V, LLaVA-1.5, and MANTIS identify the “what” effectively, they struggle with quantifying the “how” and extrapolating this rule to new objects. In contrast, children and adults exhibit much stronger analogical reasoning at all three stages. Additionally, the strongest tested model, GPT-o1, performs better in tasks involving simple surface-level visual attributes like color and size, correlating with quicker human adult response times. Conversely, more complex tasks such as number, rotation, and reflection, which necessitate extensive cognitive processing and understanding of extrinsic spatial properties in the physical world, present more significant challenges. Altogether, these findings highlight the limitations of training models on data that primarily consists of 2D images and text. <note mark="1" role="footnote" xml:id="footnote1"><tags>
          <tag>1</tag>
          <tag role="autoref">footnote 1</tag>
          <tag role="refnum">1</tag>
          <tag role="typerefnum">footnote 1</tag>
        </tags>Benchmark (code, data, models) is available at: <ref class="ltx_url" font="typewriter" href="https://github.com/ey242/KiVA">https://github.com/ey242/KiVA</ref></note></p>
  </abstract>
  <ERROR class="undefined">\iclrfinalcopy</ERROR>
  <section inlist="toc" labels="LABEL:sec:intro" xml:id="S1">
    <tags>
      <tag>1</tag>
      <tag role="autoref">section 1</tag>
      <tag role="refnum">1</tag>
      <tag role="typerefnum">§1</tag>
    </tags>
    <title><tag close=" ">1</tag>Introduction</title>
    <para xml:id="S1.p1">
      <p>What is visual cognition? Humans make countless visual inferences every day from observing objects and scenes, quickly detecting even subtle visual changes. We generalize common patterns about changes from different observations and use these insights to solve new problems. If we put a wool sweater in the washing machine and it comes out smaller, we might infer that the wash shrinks wool and avoid washing wool coats in the future. If cookies disappear, we might infer that someone is eating our treats and proceed to hide the chocolate elsewhere. This ability to draw parallels between situations and apply learned patterns to a new scenario is known as <text font="italic">analogical reasoning</text>. Formally defined, an analogy is a systematic comparison between structures that uses the properties and relations of objects in a source structure to infer properties and relations of objects in a target structure <cite class="ltx_citemacro_citep">(<bibref bibrefs="mitchell2021abstraction,schunn1996priming" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. Analogical reasoning is a hallmark of human intelligence and learning <cite class="ltx_citemacro_citep">(<bibref bibrefs="gentner1983structure,holyoak2012analogy,mitchell2021abstraction,sternberg1977component" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. It is what enables us to be flexible, adaptive and robust learners across a wide variety of settings, finding meaning in patterns and making out-of-distribution generalizations <cite class="ltx_citemacro_citep">(<bibref bibrefs="chollet2019measure,mitchell2021abstraction" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>.
Analogical reasoning is already available to young children <cite class="ltx_citemacro_citep">(<bibref bibrefs="goddu2020transformations,goswami2013analogical,sternberg1979development" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, and is crucial for human problem-solving in various contexts, from building scientific models to appreciating metaphors to formulating legal arguments.</p>
    </para>
    <figure inlist="lof" labels="LABEL:fig:summary" placement="t" xml:id="S1.F1">
      <tags>
        <tag><text fontsize="90%">Figure 1</text></tag>
        <tag role="autoref">Figure 1</tag>
        <tag role="refnum">1</tag>
        <tag role="typerefnum">Figure 1</tag>
      </tags>
      <figure align="center" inlist="lof" placement="b" xml:id="S1.F0.sf1">
        <tags>
          <tag><text fontsize="90%">(a)</text></tag>
          <tag role="autoref">(a)</tag>
          <tag role="refnum">1(a)</tag>
        </tags>
        <graphics candidates="figures/figure1_analogies.pdf" graphic="figures/figure1_analogies.pdf" options="width=433.62pt" xml:id="S1.F0.sf1.g1"/>
        <toccaption><tag close=" ">(a)</tag>Visual analogy domains.</toccaption>
        <caption><tag close=" "><text fontsize="90%">(a)</text></tag><text fontsize="90%">Visual analogy domains.</text></caption>
      </figure>
      <figure align="center" inlist="lof" placement="b" xml:id="S1.F0.sf2">
        <tags>
          <tag><text fontsize="90%">(b)</text></tag>
          <tag role="autoref">(b)</tag>
          <tag role="refnum">1(b)</tag>
        </tags>
        <graphics candidates="figures/figure1_spider.pdf" graphic="figures/figure1_spider.pdf" options="width=433.62pt" xml:id="S1.F0.sf2.g1"/>
        <toccaption><tag close=" ">(b)</tag>Extrapolation accuracy.</toccaption>
        <caption><tag close=" "><text fontsize="90%">(b)</text></tag><text fontsize="90%">Extrapolation accuracy.</text></caption>
      </figure>
      <toccaption class="ltx_centering"><tag close=" ">1</tag><text font="bold">KiVA: Kid-inspired Visual Analogies.</text> <text font="bold">(a)</text> 5 visual analogy domains examined in KiVA and KiVA-adults (see Figure <ref labelref="LABEL:fig:pipeline"/> for the full task format). Unlike KiVA, the starting color, size, orientation and number of test objects in KiVA-adults further differ from the starting values of the given transformations. <text font="bold">(b)</text> Performance of children, adults &amp; LMMs in extrapolating a transformation rule to a novel object in KiVA (top) and KiVA-adults (bottom).
</toccaption>
      <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 1</text></tag><text font="bold" fontsize="90%">KiVA: Kid-inspired Visual Analogies.<text font="medium"> </text>(a)<text font="medium"> 5 visual analogy domains examined in KiVA and KiVA-adults (see Figure <ref labelref="LABEL:fig:pipeline"/> for the full task format). Unlike KiVA, the starting color, size, orientation and number of test objects in KiVA-adults further differ from the starting values of the given transformations. </text>(b)<text font="medium"> Performance of children, adults &amp; LMMs in extrapolating a transformation rule to a novel object in KiVA (top) and KiVA-adults (bottom).
</text></text></caption>
    </figure>
<!--  %****␣1_intro.tex␣Line␣25␣**** -->    <para xml:id="S1.p2">
      <p>Today, large multimodal models (LMMs) have made significant progress, but they remain data-hungry and require substantial human effort to adapt to new contexts <cite class="ltx_citemacro_citep">(<bibref bibrefs="chollet2019measure,reizinger2024understanding" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. As analogical reasoning is instrumental for general-purpose and adaptive machines, it is crucial to examine whether current models have such capabilities. Critically, examining analogical capabilities does not permit models to “cheat” by merely depending on their training data because it requires context-dependent abstraction beyond general object recognition. In KiVA, the same object may undergo different kinds of transformations, requiring models to combine familiar elements in new, trial-specific ways. Reasoning about analogies involves first classifying <text font="italic">relationships</text> between object characteristics, specifying similarities and differences, then extrapolating the <text font="italic">same relationship</text> to new objects. This paper focuses on visual analogies, testing models’ ability to reason abstractly about visual observations. See Figure <ref labelref="LABEL:fig:summary"/> for a summary of the KiVA benchmark and results.</p>
    </para>
    <para xml:id="S1.p3">
      <p>There is a growing body of work examining visual reasoning and generalization capabilities in large multimodal models <cite class="ltx_citemacro_citep">(<bibref bibrefs="ahrabian2024curious,huang2024language,moskvichev2023conceptarc,petersen2023can,webb2023emergent" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. Existing benchmarks of visual analogies include (a) ARC <cite class="ltx_citemacro_citep">(<bibref bibrefs="chollet2019measure" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> and ConceptARC <cite class="ltx_citemacro_citep">(<bibref bibrefs="mitchell2023comparing,moskvichev2023conceptarc" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, (b) variations of Raven’s Progressive Matrices <cite class="ltx_citemacro_citep">(<bibref bibrefs="huang2024language" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> and (c) abstract spatial reasoning <cite class="ltx_citemacro_citep">(<bibref bibrefs="ahrabian2024curious" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> (see prior benchmarks in Figure <ref labelref="LABEL:fig:betterbenchmark"/>). These prior benchmarks all have several critical limitations. First, they rely on abstract shapes and grids, lacking real-world relevance. This abstraction of stimuli neither aligns with the training data of large multimodal models nor effectively mimics the complexity and variability found in everyday visual tasks, making it less suitable for assessing how well AI models can perform analogical reasoning in practical contexts. Second, the transformations examined involve conjunctions of visual concepts such as extracting <text font="italic">and</text> transposing pixels according to some arbitrary rule, which do not tap into basic visual cognition. Humans do not require the ability to solve these specific tasks to function effectively in their daily lives nor to demonstrate their capacity for visual analogical reasoning. Third, while we know that models often perform poorly on these benchmarks, where they fail in the reasoning process needs to be clarified since existing evaluations focus solely on prediction accuracy rather than the reasoning approach or what is perceived.</p>
    </para>
    <figure inlist="lof" labels="LABEL:fig:betterbenchmark" xml:id="S1.F2">
      <tags>
        <tag><text fontsize="90%">Figure 2</text></tag>
        <tag role="autoref">Figure 2</tag>
        <tag role="refnum">2</tag>
        <tag role="typerefnum">Figure 2</tag>
      </tags>
      <figure align="center" inlist="lof" xml:id="S1.F1.sf1">
        <tags>
          <tag><text fontsize="90%">(a)</text></tag>
          <tag role="autoref">(a)</tag>
          <tag role="refnum">2(a)</tag>
        </tags>
        <graphics candidates="figures/figure2_other.pdf" graphic="figures/figure2_other.pdf" options="width=433.62pt" xml:id="S1.F1.sf1.g1"/>
        <toccaption><tag close=" ">(a)</tag>Prior benchmarks.</toccaption>
        <caption><tag close=" "><text fontsize="90%">(a)</text></tag><text fontsize="90%">Prior benchmarks.</text></caption>
      </figure>
      <figure align="center" inlist="lof" xml:id="S1.F1.sf2">
        <tags>
          <tag><text fontsize="90%">(b)</text></tag>
          <tag role="autoref">(b)</tag>
          <tag role="refnum">2(b)</tag>
        </tags>
        <graphics candidates="figures/figure2_ours.pdf" graphic="figures/figure2_ours.pdf" options="width=433.62pt" xml:id="S1.F1.sf2.g1"/>
        <toccaption><tag close=" ">(b)</tag>Our benchmark.</toccaption>
        <caption><tag close=" "><text fontsize="90%">(b)</text></tag><text fontsize="90%">Our benchmark.</text></caption>
      </figure>
      <toccaption class="ltx_centering"><tag close=" ">2</tag><text font="bold">Prior benchmarks versus KiVA for visual analogies.</text> <text font="bold">(a)</text> Prior benchmarks like <text font="bold">I.</text> ConceptARC, <text font="bold">II.</text> Raven’s Progressive Matrices, and <text font="bold">III.</text> CCSE Reasoning involve arbitrary changes of abstract shapes and grids. <text font="bold">(b)</text> KiVA examines basic changes that even three-year-olds can solve.</toccaption>
      <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 2</text></tag><text font="bold" fontsize="90%">Prior benchmarks versus KiVA for visual analogies.<text font="medium"> </text>(a)<text font="medium"> Prior benchmarks like </text>I.<text font="medium"> ConceptARC, </text>II.<text font="medium"> Raven’s Progressive Matrices, and </text>III.<text font="medium"> CCSE Reasoning involve arbitrary changes of abstract shapes and grids. </text>(b)<text font="medium"> KiVA examines basic changes that even three-year-olds can solve.</text></text></caption>
    </figure>
    <para xml:id="S1.p4">
      <p>We propose a Kid-inspired Visual Analogies (KiVA) benchmark founded on developmental psychology (Figure <ref labelref="LABEL:fig:summary"/> (left)) <cite class="ltx_citemacro_citep">(<bibref bibrefs="goddu2020transformations,lehmann2014correlation" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. We focus our analysis on basic visual analogical capabilities that are present early in human development and are important for understanding the physical world. <text font="italic">KiVA</text> isolates the following fundamental capabilities that emerge early in human development:
<!--  %****␣1_intro.tex␣Line␣50␣**** -->detecting changes in <text font="bold">color</text>  <cite class="ltx_citemacro_citep">(<bibref bibrefs="ross2003development,wang2016infants" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> and <text font="bold">size</text>  <cite class="ltx_citemacro_citep">(<bibref bibrefs="day1981infant,wang2016infants" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, changes that involve <text font="bold">rotation</text> and <text font="bold">reflection</text>  <cite class="ltx_citemacro_citep">(<bibref bibrefs="frick2013development,quaiser2003mental" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, and changes in small <text font="bold">numbers</text> of objects  <cite class="ltx_citemacro_citep">(<bibref bibrefs="cherian2023deep,levine1992development" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. KiVA is solvable by a three-year-old child. <text font="italic">KiVA-adults</text> serves as a more challenging version of KiVA that is solvable by adults but not by young children, requiring deeper generalization from given transformations (the starting values of objects in the given and test transformations are not aligned) and featuring more variations in the above visual domains (see details in Section <ref labelref="LABEL:sec:va-dataset"/>). Refer to Figure <ref labelref="LABEL:fig:summary"/> for sample test trials of KiVA and KiVA-adults. KiVA stands out in the following ways:</p>
    </para>
    <para xml:id="S1.p5">
      <p>First, our dataset utilizes <text font="italic">real-world</text>, <text font="italic">physically grounded</text> objects curated from established 3D datasets of common household items <cite class="ltx_citemacro_citep">(<bibref bibrefs="downs2022google" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> and toys that are familiar to human children <cite class="ltx_citemacro_citep">(<bibref bibrefs="stojanov2021using" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, which align more closely with the training distribution of computer vision models and visual data of humans than other visual analogical reasoning datasets do (Figure <ref labelref="LABEL:fig:betterbenchmark"/>).</p>
    </para>
    <para xml:id="S1.p6">
      <p>Second, our approach is inspired by <text font="italic">developmental psychology</text>, specifically how children learn to perform analogical reasoning not abstractly, but from simple objects in grounded contexts <cite class="ltx_citemacro_citep">(<bibref bibrefs="christie2010hypotheses,gentner1983structure,goddu2020transformations" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. We propose a similar approach for large multimodal models, investigating if they can perform like children on basic visual analogical reasoning tasks related to color, size, orientation, and number – as already reported in child development journals <cite class="ltx_citemacro_cite"><bibref bibrefs="coates2023representations,goddu2020transformations,goddu2025causal" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
            <bibrefphrase>(</bibrefphrase>
            <bibrefphrase>)</bibrefphrase>
          </bibref></cite>. Starting with simple, real-world relevant tasks in child development allows models to develop robust reasoning abilities before tackling more advanced tasks, providing a clearer pathway for evaluating and improving cognitive functions in AI.</p>
    </para>
    <para xml:id="S1.p7">
      <p>Third, we break down our evaluation to examine the <text font="italic">different steps</text> involved in analogical reasoning to determine which steps a model can perform and where it may fail: <text font="italic">1)</text> classifying the domain of a visual transformation, <text font="italic">2)</text> specifying the transformation rule, and <text font="italic">3)</text> extrapolating the inferred rule to a new item. This three-stage evaluation (Figure <ref labelref="LABEL:fig:pipeline"/>) gives us insights into models’ reasoning processes beyond simply selecting a correct or incorrect response at the end.</p>
    </para>
    <para xml:id="S1.p8">
      <p>Results from KiVA and KiVA-adults demonstrate that state-of-the-art large multimodal models, i.e., GPT-o1 <cite class="ltx_citemacro_citep">(<bibref bibrefs="GPT-o1" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, GPT-4V <cite class="ltx_citemacro_citep">(<bibref bibrefs="GPT-4V" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, LLaVA-1.5 <cite class="ltx_citemacro_citep">(<bibref bibrefs="liu2024visual" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> and MANTIS <cite class="ltx_citemacro_citep">(<bibref bibrefs="jiang2024mantis" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, still cannot solve visual analogies like humans can. These models do not match even the capabilities of a three-year-old child in reasoning about number and reflection (Figure <ref labelref="LABEL:fig:summary"/>). While LMMs can categorize some transformations, they still struggle to extrapolate those transformations to new objects. In particular, GPT-o1 and GPT-4V outperform LLaVA-1.5 and MANTIS but also demonstrate weaker performance in orientation and number changes than in size and color changes, which are processed more quickly by humans, at an earlier age <cite class="ltx_citemacro_citep">(<bibref bibrefs="slater1990size,wang2016infants" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, and in a more primary region of the visual cortex <cite class="ltx_citemacro_citep">(<bibref bibrefs="zeki1991direct,zeng2020visual" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>.</p>
    </para>
    <para xml:id="S1.p9">
      <p>Taken together, KiVA and KiVA-adults not only mirror the natural progression of human cognitive development, but also provide a more structured and comprehensive framework for evaluating the capabilities and growth of LMMs. We also release code on our project page for <text font="italic">KiVA-compositionality</text>, which combines multiple object transformations to probe even more complex compositional reasoning. This serves as the next benchmark for models to surpass after KiVA and KiVA-adults.</p>
    </para>
  </section>
  <section inlist="toc" xml:id="S2">
    <tags>
      <tag>2</tag>
      <tag role="autoref">section 2</tag>
      <tag role="refnum">2</tag>
      <tag role="typerefnum">§2</tag>
    </tags>
    <title><tag close=" ">2</tag>Related Work</title>
    <para class="ltx_noindent" xml:id="S2.p1">
      <p><text font="bold">Evaluating human visual analogical reasoning.</text> There is a variety of tasks designed in Developmental Psychology to examine human visual analogical reasoning early on in life. Children are asked to compare simple object and relational matches <cite class="ltx_citemacro_citep">(<bibref bibrefs="christie2010hypotheses,goddu2020transformations,kuwabara2012cross" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> along dimensions such as color <cite class="ltx_citemacro_citep">(<bibref bibrefs="milewski1975discrimination,ross2003development" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, number <cite class="ltx_citemacro_citep">(<bibref bibrefs="cherian2023deep,levine1992development" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, size <cite class="ltx_citemacro_citep">(<bibref bibrefs="day1981infant,slater1990size" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> and spatial orientation <cite class="ltx_citemacro_citep">(<bibref bibrefs="frick2013development,quaiser2003mental" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. Older children and adults are evaluated on Raven’s Progressive Matrices (RPMs) <cite class="ltx_citemacro_citep">(<bibref bibrefs="carpenter1990one,lovett2017modeling,raven1938raven" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> and Bongard Problems <cite class="ltx_citemacro_citep">(<bibref bibrefs="bongard1970pattern,weitnauer2023perception" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. Even though they tend to be the most representative and largest testbeds for testing advanced visual analogical reasoning, RPMs and Bongard problems use abstract geometric shapes and test recognition of arbitrary patterns that (1) cannot be solved by children before the age of 6 and (2) are not critical to everyday visual processing.
KiVA is the first visual analogical reasoning benchmark that includes common real-world objects and more natural visual cognition skills such as counting and spatial transformations — tasks that even a three-year-old child can handle <cite class="ltx_citemacro_citep">(<bibref bibrefs="goddu2020transformations" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. We also examine where people and models fail with more fine-grained evaluation.</p>
    </para>
    <para class="ltx_noindent" xml:id="S2.p2">
      <p><text font="bold">Evaluating visuo-linguistic reasoning in AI models.</text>
Several proposals for evaluating modern AI systems’ visuo-linguistic reasoning capabilities followed the recent successes of large multimodal models. Many concentrate on a narrow, isolated set of tasks for detecting object properties like size estimation <cite class="ltx_citemacro_citep">(<bibref bibrefs="chen2024spatialvlm,liu2022benchmark" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, color perception <cite class="ltx_citemacro_citep">(<bibref bibrefs="abdou2021can,samin2024colorfoil" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, counting objects <cite class="ltx_citemacro_citep">(<bibref bibrefs="liang2023crowdclip,paiss2023teaching" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, object viewpoint/pose and chirality <cite class="ltx_citemacro_citep">(<bibref bibrefs="kapelyukh2023dream2real,lin2020visual,chen2024spatialvlm" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> and visuo-linguistic compositionality <cite class="ltx_citemacro_citep">(<bibref bibrefs="thrush2022winoground,kamath2023s,liu2023visual" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. Typically, the objective of these tasks is to evaluate models’ ability to report a correct property about objects in an image. They lack the depth to probe pattern abstraction and generalization involved in visual analogical reasoning.</p>
    </para>
    <para xml:id="S2.p3">
      <p>Broader benchmarks, such as visual question answering setups <cite class="ltx_citemacro_citep">(<bibref bibrefs="antol2015vqa,goyal2017making" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, attempt to investigate the models’ understanding of various visual concepts. One approach taken by <cite class="ltx_citemacro_citep">(<bibref bibrefs="bubeck2023sparks,yang2023dawn" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> was to try and push the envelope on various tasks to capture anecdotal and qualitative observations regarding the performance of GPT-4. Perception Test <cite class="ltx_citemacro_citep">(<bibref bibrefs="puatruaucean2023perception" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> proposed a second approach: a visual video-based benchmark including developmentally-inspired tasks such as object permanence, object tracking, spatial relations, etc. Recently, the BLINK benchmark was introduced to show that core visual perception tasks, easily solvable by humans "within a blink," remain challenging for large multimodal models due to their resistance to language-based mediation <cite class="ltx_citemacro_citep">(<bibref bibrefs="fu2024blink" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. However, all these benchmarks fall short in evaluating the deeper, more complex aspects of visual analogical reasoning and generalization.</p>
    </para>
    <para xml:id="S2.p4">
      <p>Another specific class of benchmarks tests generalization and reasoning within abstract puzzle grids. These include the Abstraction and Reasoning Corpus (ARC) <cite class="ltx_citemacro_citep">(<bibref bibrefs="chollet2019measure" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> and ConceptARC <cite class="ltx_citemacro_citep">(<bibref bibrefs="moskvichev2023conceptarc,mitchell2023comparing" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>; a direct translation of RPMs-based human evaluation has previously been applied to models by <cite class="ltx_citemacro_citep">(<bibref bibrefs="ahrabian2024curious" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> and <cite class="ltx_citemacro_citep">(<bibref bibrefs="huang2024language" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> (see these prior benchmarks in Figure <ref labelref="LABEL:fig:betterbenchmark"/>). However, the stimuli are simple, monotonic shapes like squares and circles, lacking real-world complexity and variability. Moreover, they emphasize complex pattern recognition and logical sequencing without real-world context—neglecting basic visual cognition skills even children possess—and this limited scope may render them unsuitable for training data that typically covers a much broader range of real-world visuals.</p>
    </para>
    <para xml:id="S2.p5">
      <p>In summary, although many benchmarks assess advanced visual capabilities in large multimodal models, none evaluate visual cognition that is clearly exhibited by young children—such as predicting simple transformations of real-world objects—or use children as a baseline for comparison.</p>
    </para>
  </section>
  <section inlist="toc" labels="LABEL:sec:visual-analogies" xml:id="S3">
    <tags>
      <tag>3</tag>
      <tag role="autoref">section 3</tag>
      <tag role="refnum">3</tag>
      <tag role="typerefnum">§3</tag>
    </tags>
    <title><tag close=" ">3</tag>The KiVA Benchmark for Visual Analogical Reasoning</title>
    <para xml:id="S3.p1">
      <p>We introduce KiVA, a Kid-inspired Visual Analogies benchmark, wherein real-world objects undergo common transformations necessary for everyday visual cognition. We focus on isolating and testing basic visual transformations that even a three-year-old child understands <cite class="ltx_citemacro_citep">(<bibref bibrefs="goddu2020transformations" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. As we show in Figure <ref labelref="LABEL:fig:summary"/>, we examine noticing <text font="bold">color changes</text> <cite class="ltx_citemacro_citep">(<bibref bibrefs="ross2003development,milewski1975discrimination" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, <text font="bold">size
changes</text> <cite class="ltx_citemacro_citep">(<bibref bibrefs="day1981infant,slater1990size" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, <text font="bold">rotation</text>, <text font="bold">reflection</text> <cite class="ltx_citemacro_citep">(<bibref bibrefs="quaiser2003mental,frick2013development" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, and <text font="bold">number changes</text> such as addition and subtraction of a small number of objects <cite class="ltx_citemacro_citep">(<bibref bibrefs="cherian2023deep,levine1992development" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>. We then build upon this benchmark by proposing KiVA-adults, which involves a greater variety of transformations and demands more abstract forms of generalization. It is solvable by adults but not by children under five.</p>
    </para>
    <figure inlist="lof" labels="LABEL:fig:pipeline" xml:id="S3.F3">
      <tags>
        <tag><text fontsize="90%">Figure 3</text></tag>
        <tag role="autoref">Figure 3</tag>
        <tag role="refnum">3</tag>
        <tag role="typerefnum">Figure 3</tag>
      </tags>
      <graphics candidates="figures/figure3_pipeline.pdf" class="ltx_centering" graphic="figures/figure3_pipeline.pdf" options="width=433.62pt" xml:id="S3.F3.g1"/>
      <toccaption class="ltx_centering"><tag close=" ">3</tag><text font="bold">An example of a trial in KiVA.</text> Models and humans are first asked to classify a given transformation (left). If the classification is correct (green arrow), humans and models are further evaluated on their verbal specification of the transformation (middle) and then on visual extrapolation (right). Otherwise, humans and models skip to make a visual extrapolation (yellow arrow).</toccaption>
      <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 3</text></tag><text font="bold" fontsize="90%">An example of a trial in KiVA.<text font="medium"> Models and humans are first asked to classify a given transformation (left). If the classification is correct (green arrow), humans and models are further evaluated on their verbal specification of the transformation (middle) and then on visual extrapolation (right). Otherwise, humans and models skip to make a visual extrapolation (yellow arrow).</text></text></caption>
    </figure>
    <subsection inlist="toc" xml:id="S3.SS1">
      <tags>
        <tag>3.1</tag>
        <tag role="autoref">subsection 3.1</tag>
        <tag role="refnum">3.1</tag>
        <tag role="typerefnum">§3.1</tag>
      </tags>
      <title><tag close=" ">3.1</tag>A Three-Stage Experimental Paradigm</title>
      <para xml:id="S3.SS1.p1">
        <p>We use our proposed dataset to benchmark computational models’ and human subjects’ visual analogical reasoning capabilities. We utilize the same testing procedure (Figure <ref labelref="LABEL:fig:pipeline"/>) for both kinds of subjects. In each trial, we start by presenting a given transformation of an object that changes by a specific rule, following the experimental paradigm of other analogical reasoning benchmarks for humans and computational models <cite class="ltx_citemacro_citep">(<bibref bibrefs="moskvichev2023conceptarc,bongard1970pattern,goddu2020transformations" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>. Inspired by the component processes model of analogical reasoning <cite class="ltx_citemacro_citep">(<bibref bibrefs="sternberg1977component" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>, we evaluate the subject’s ability to determine <text font="italic">what</text> changed (<text font="italic">Verbal Classification</text>), <text font="italic">how</text> it changed (<text font="italic">Verbal Specification</text>), and apply the same transformation rule to predict the outcome of a new object—i.e., a <text font="italic">Visual Extrapolation</text>. We break the question down into these three steps to test the different cognitive processes involved in analogical reasoning. The first two assess the necessary prerequisites for accurate analogical reasoning, while the last step represents the core visual analogy task. Critically, KiVA retains the core nonverbal extrapolation task (last step) from previous benchmarks and the verbal questions <text font="italic">do not replace</text> the core nonverbal tasks. Even without correct verbal responses, humans and models can still tackle the independently-assessed visual extrapolation tasks. Thus, KiVA doesn’t require specific language skills but provides a window into the analogical reasoning process of humans and models in reaching their final solutions.
The first two verbal questions were further paraphrased by developmental psychologists so that they are comprehensible to a three-year-old child (Appendix <ref labelref="LABEL:sec:child-prompt"/>); models and adults did not benefit from the child-appropriate prompting so the original prompt in Figure 3 was preserved. We pose all questions in a multiple-choice format for human children, adults and models, which enables automatic scoring. Option labels for correct responses were randomized such that LMMs’ option label bias does not correlate with task accuracy. Furthermore, we provided the opportunity to select “Doesn’t apply” to accommodate responses that the provided choices may not cover. Excluding the “Doesn’t apply” option, chance level is 25% for Verbal Classification (4 choices) and 33% for Verbal Specification and Visual Extrapolation (3 choices). Refer to Figure <ref labelref="LABEL:fig:pipeline"/> for the three-stage query pipeline and Appendix <ref labelref="LABEL:sec:prompteg"/> for specific prompts.</p>
      </para>
      <para class="ltx_noindent" xml:id="S3.SS1.p2">
        <p><text font="bold">Verbal classification of transformation (“what”).</text>
We first evaluate if the model or human can detect what changed in a given transformation and classify it in the correct visual domain, such as size or number (see Figure <ref labelref="LABEL:fig:pipeline"/>). We randomly sample incorrect multiple-choice options from other possible transformation domains. “No change” and “Doesn’t apply” are always included as options to accommodate alternative forms of reasoning that are not covered by the choices. If the model fails to identify basic changes, such as distinguishing a numerical change from a color change, it will be unable to predict how new objects change based on the given transformations. This is an inadequacy of existing visual analogical reasoning benchmarks <cite class="ltx_citemacro_citep">(<bibref bibrefs="moskvichev2023conceptarc,mitchell2023comparing,ahrabian2024curious,huang2024language" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>, which focus solely on advanced predictions without ensuring fundamental change detection capabilities.
<!--  %****␣3_experiments.tex␣Line␣25␣**** --></p>
      </para>
      <para class="ltx_noindent" xml:id="S3.SS1.p3">
        <p><text font="bold">Verbal specification of transformation (“how”).</text>
If a subject correctly classifies the transformation, we ask them to further specify the transformation, also in multiple-choice format (see green arrow in Figure <ref labelref="LABEL:fig:pipeline"/>). This step is crucial because it ensures the subject can accurately specify the rule governing the transformation before extrapolating it to a new object. If they fail to identify the specific change, any attempt at extrapolation would more likely be incorrect (see Figure <ref labelref="LABEL:fig:correct-incorrect-verbal"/> in Appendix <ref labelref="LABEL:sec:conditional"/> for evidence in models). By pinpointing where reasoning fails, we can better understand models’ and humans’ limitations and improve their analogical reasoning capabilities.</p>
      </para>
      <para class="ltx_noindent" xml:id="S3.SS1.p4">
        <p><text font="bold">Visual Extrapolation of transformation.</text>
Finally, we proceed to the step captured by other benchmarks: presenting a new image and asking the model to extrapolate how it will change based on the previously identified transformation (see Figure <ref labelref="LABEL:fig:pipeline"/> and other extrapolation examples of other visual domains in Appendix <ref labelref="LABEL:sec:Visual_Extrapolation"/>). We ask models to visually extrapolate independent of their performance in verbal change identification to account for the possibility that models may engage in visual analogical reasoning separately from verbal reasoning and can, therefore, perform well in visual tasks even if they struggle with the prior verbal descriptions. This approach helps us determine if a model’s visual reasoning can function independently of its verbal reasoning skills. It provides a more nuanced evaluation of its cognitive capabilities and identifies specific areas for improvement.</p>
      </para>
    </subsection>
    <subsection inlist="toc" labels="LABEL:sec:va-dataset" xml:id="S3.SS2">
      <tags>
        <tag>3.2</tag>
        <tag role="autoref">subsection 3.2</tag>
        <tag role="refnum">3.2</tag>
        <tag role="typerefnum">§3.2</tag>
      </tags>
      <title><tag close=" ">3.2</tag>A Dataset of Visual Analogies</title>
      <para xml:id="S3.SS2.p1">
        <p>We create a dataset of stimuli using everyday objects that better represent real-world visual data and better match the training data of computer vision models (and humans). We take 3D models of household objects from <cite class="ltx_citemacro_cite"><bibref bibrefs="downs2022google" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
              <bibrefphrase>(</bibrefphrase>
              <bibrefphrase>)</bibrefphrase>
            </bibref></cite> and objects commonly encountered by infants and children from <cite class="ltx_citemacro_cite"><bibref bibrefs="stojanov2021using" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
              <bibrefphrase>(</bibrefphrase>
              <bibrefphrase>)</bibrefphrase>
            </bibref></cite>. To set up the dataset, we apply transformations from five basic visual domains: changing the size, color, and number of objects, and rotating and reflecting the objects along different axes (see Figure <ref labelref="LABEL:fig:summary"/> for the transformation domains examined). Our project page includes code allowing users to perform these transformations on any object image, enabling infinite expansion of the benchmark. Our five types of object transformations are crucial for object and scene recognition (e.g., <cite class="ltx_citemacro_cite"><bibref bibrefs="diwadkar1997viewpoint,gevers1999color" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
              <bibrefphrase>(</bibrefphrase>
              <bibrefphrase>)</bibrefphrase>
            </bibref></cite>), scene segmentation (e.g., <cite class="ltx_citemacro_cite"><bibref bibrefs="chattopadhyay2017counting" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
              <bibrefphrase>(</bibrefphrase>
              <bibrefphrase>)</bibrefphrase>
            </bibref></cite>), and detecting significant changes in the environment <cite class="ltx_citemacro_citep">(<bibref bibrefs="hatfield2012visual,duh2014infants" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>. Other visual properties, such as depth <cite class="ltx_citemacro_citep">(<bibref bibrefs="chen2016single" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>, spatial compositionality <cite class="ltx_citemacro_citep">(<bibref bibrefs="jiang2022bongard,thrush2022winoground" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>, and physical affordances <cite class="ltx_citemacro_citep">(<bibref bibrefs="jiang2023bongard,sawatzky2019object" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite> are also crucial for such purposes, but we prioritized these five domains for our benchmark in particular because young children can solve these visual analogies, as already shown in developmental psychology literature <cite class="ltx_citemacro_citep">(<bibref bibrefs="goddu2020transformations,harris2013understanding" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>. Below, we outline the five visual transformation domains. There are 100 object transformations for each subdomain of transformation, totaling 1,400 object transformations in KiVA and 2,900 in KiVA-adults.</p>
      </para>
      <para class="ltx_noindent" xml:id="S3.SS2.p2">
        <p><text font="bold">Color changes.</text>
Noticing color changes can signal alterations in an object’s state or presence, which is essential for tasks like identifying ripe fruit or detecting hazards <cite class="ltx_citemacro_citep">(<bibref bibrefs="maule2023development" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>. The general transformation rule for color is that input objects change to a single color <cite class="ltx_citemacro_citep">(<bibref bibrefs="goddu2020transformations" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>, namely red, green or blue. KiVA-adults also includes yellow and grey.</p>
      </para>
      <para class="ltx_noindent" xml:id="S3.SS2.p3">
        <p><text font="bold">Size changes.</text>
Size perception allows individuals to understand and interact with their environment accurately, guiding tasks like identifying objects, planning actions, navigating spaces, and avoiding obstacles  <cite class="ltx_citemacro_citep">(<bibref bibrefs="giudice2018navigating" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>. In KiVA, objects undergo transformations in two subdomains: they turn bigger or smaller (in both height and width) as in <cite class="ltx_citemacro_citep">(<bibref bibrefs="goddu2020transformations" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite> by a factor of 2. KiVA-adults also includes object stretching (changing height or width independently by a factor of 2).</p>
      </para>
      <para class="ltx_noindent" xml:id="S3.SS2.p4">
        <p><text font="bold">Number changes.</text>
Accurately monitoring and comparing quantities is essential in economics and science; it is also important in daily life activities like shopping, cooking, caching and rationing <cite class="ltx_citemacro_citep">(<bibref bibrefs="chattopadhyay2017counting,cohen2005triumph" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>. Transformations in this domain reflect basic mathematical operations over the number of objects in an image. KiVA contains object addition <Math mode="inline" tex="(+1,+2)" text="open-interval@(+ 1, + 2)" xml:id="S3.SS2.p4.m1">
            <XMath>
              <XMDual>
                <XMApp>
                  <XMTok meaning="open-interval"/>
                  <XMRef idref="S3.SS2.p4.m1.1"/>
                  <XMRef idref="S3.SS2.p4.m1.2"/>
                </XMApp>
                <XMWrap>
                  <XMTok role="OPEN" stretchy="false">(</XMTok>
                  <XMApp xml:id="S3.SS2.p4.m1.1">
                    <XMTok meaning="plus" role="ADDOP">+</XMTok>
                    <XMTok meaning="1" role="NUMBER">1</XMTok>
                  </XMApp>
                  <XMTok role="PUNCT">,</XMTok>
                  <XMApp xml:id="S3.SS2.p4.m1.2">
                    <XMTok meaning="plus" role="ADDOP">+</XMTok>
                    <XMTok meaning="2" role="NUMBER">2</XMTok>
                  </XMApp>
                  <XMTok role="CLOSE" stretchy="false">)</XMTok>
                </XMWrap>
              </XMDual>
            </XMath>
          </Math> and subtraction <Math mode="inline" tex="(-1,-2)" text="open-interval@(- 1, - 2)" xml:id="S3.SS2.p4.m2">
            <XMath>
              <XMDual>
                <XMApp>
                  <XMTok meaning="open-interval"/>
                  <XMRef idref="S3.SS2.p4.m2.1"/>
                  <XMRef idref="S3.SS2.p4.m2.2"/>
                </XMApp>
                <XMWrap>
                  <XMTok role="OPEN" stretchy="false">(</XMTok>
                  <XMApp xml:id="S3.SS2.p4.m2.1">
                    <XMTok meaning="minus" role="ADDOP">-</XMTok>
                    <XMTok meaning="1" role="NUMBER">1</XMTok>
                  </XMApp>
                  <XMTok role="PUNCT">,</XMTok>
                  <XMApp xml:id="S3.SS2.p4.m2.2">
                    <XMTok meaning="minus" role="ADDOP">-</XMTok>
                    <XMTok meaning="2" role="NUMBER">2</XMTok>
                  </XMApp>
                  <XMTok role="CLOSE" stretchy="false">)</XMTok>
                </XMWrap>
              </XMDual>
            </XMath>
          </Math>, whereas KiVA-adults includes multiplication <Math mode="inline" tex="(\times 2,\times 3)" xml:id="S3.SS2.p4.m3">
            <XMath>
              <XMTok role="OPEN" stretchy="false">(</XMTok>
              <XMTok meaning="times" role="MULOP">×</XMTok>
              <XMTok meaning="2" role="NUMBER">2</XMTok>
              <XMTok role="PUNCT">,</XMTok>
              <XMTok meaning="times" role="MULOP">×</XMTok>
              <XMTok meaning="3" role="NUMBER">3</XMTok>
              <XMTok role="CLOSE" stretchy="false">)</XMTok>
            </XMath>
          </Math> and division <Math mode="inline" tex="(\div 2,\div 3)" xml:id="S3.SS2.p4.m4">
            <XMath>
              <XMTok role="OPEN" stretchy="false">(</XMTok>
              <XMTok meaning="divide" name="div" role="MULOP">÷</XMTok>
              <XMTok meaning="2" role="NUMBER">2</XMTok>
              <XMTok role="PUNCT">,</XMTok>
              <XMTok meaning="divide" name="div" role="MULOP">÷</XMTok>
              <XMTok meaning="3" role="NUMBER">3</XMTok>
              <XMTok role="CLOSE" stretchy="false">)</XMTok>
            </XMath>
          </Math> as well. We restrict the number of objects in an input or output image to under 8.</p>
      </para>
      <para class="ltx_noindent" xml:id="S3.SS2.p5">
        <p><text font="bold">Rotation.</text>
Mental rotation is the ability to recognize and map different views of the same object <cite class="ltx_citemacro_citep">(<bibref bibrefs="shepard1971mental" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>. This is essential for object manipulation, spatial orientation and navigation <cite class="ltx_citemacro_citep">(<bibref bibrefs="pinto2008real" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>. KiVA adapts from human psychometric studies (e.g., <cite class="ltx_citemacro_citep">(<bibref bibrefs="bodner1997purdue,quaiser2003mental" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>), featuring 2D rotation by <Math mode="inline" tex="90" text="90" xml:id="S3.SS2.p5.m1">
            <XMath>
              <XMTok meaning="90" role="NUMBER">90</XMTok>
            </XMath>
          </Math> degrees (clockwise or counterclockwise) or <Math mode="inline" tex="180" text="180" xml:id="S3.SS2.p5.m2">
            <XMath>
              <XMTok meaning="180" role="NUMBER">180</XMTok>
            </XMath>
          </Math> degrees. KiVA-adults also includes <Math mode="inline" tex="45" text="45" xml:id="S3.SS2.p5.m3">
            <XMath>
              <XMTok meaning="45" role="NUMBER">45</XMTok>
            </XMath>
          </Math>-degree and <Math mode="inline" tex="135" text="135" xml:id="S3.SS2.p5.m4">
            <XMath>
              <XMTok meaning="135" role="NUMBER">135</XMTok>
            </XMath>
          </Math>-degree rotations.</p>
      </para>
      <para class="ltx_noindent" xml:id="S3.SS2.p6">
        <p><text font="bold">Reflection.</text>
Reflection aids in appreciating object symmetry and chirality, essential for distinguishing left and right shoes or gloves, etc. Chiral objects cannot be rotated or translated to align with their reflections, making them non-superimposable <cite class="ltx_citemacro_citep">(<bibref bibrefs="lin2020visual" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>. Chiral objects are reflected along the x-axis or y-axis <cite class="ltx_citemacro_citep">(<bibref bibrefs="goddu2020transformations" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite> in KiVA and along both in KiVA-adults.
<!--  %****␣3_experiments.tex␣Line␣50␣**** --></p>
      </para>
    </subsection>
  </section>
  <section inlist="toc" xml:id="S4">
    <tags>
      <tag>4</tag>
      <tag role="autoref">section 4</tag>
      <tag role="refnum">4</tag>
      <tag role="typerefnum">§4</tag>
    </tags>
    <title><tag close=" ">4</tag>Comparing Analogical Reasoning in LMMs and Humans</title>
    <para class="ltx_noindent" xml:id="S4.p1">
      <p><text font="bold">Evaluating Large Multimodal Models.</text>
We test several LMMs: 1) GPT-o1 (o1-2024-12-17), 2) GPT-4V (gpt-4-vision-preview) <cite class="ltx_citemacro_citep">(<bibref bibrefs="GPT-4V" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, 3) LLaVA-1.5 <cite class="ltx_citemacro_citep">(<bibref bibrefs="liu2024visual" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>: an open-source model that integrates a vision encoder with a language model, specifically designed to enhance general-purpose visual and language understanding, 4) MANTIS <cite class="ltx_citemacro_citep">(<bibref bibrefs="jiang2024mantis" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> which builds on modified architectures from notable models like LLaVA to support interleaved multi-image input.
We combine the given transformation with the three choices of new object transformations at the extrapolation step into a single composite image for LLaVA-1.5 (limited to processing a single image), but present the given transformation and three choice transformations as four separate images to MANTIS and the GPT models as proposed in <cite class="ltx_citemacro_cite"><bibref bibrefs="campbell2025understanding" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
            <bibrefphrase>(</bibrefphrase>
            <bibrefphrase>)</bibrefphrase>
          </bibref></cite> to reduce the chance of visual binding errors. For all models, the temperature is set to 1 and the maximum token size is set to 300 (no cap for GPT-o1).
We randomize each experiment over three seeds and run each trial (Figure <ref labelref="LABEL:fig:pipeline"/>) on a model three times with test choices shuffled in order. We score correct choices as 1 and incorrect choices as 0. For each trial, we calculate the mean score across its three seeds. To evaluate the performance per transformation domain, we calculate the overall mean and standard error for the average scores of all trials. GPT-o1, GPT-4V, LLaVA-1.5, and MANTIS complete the entire KiVA and KiVA-adults benchmarks. Open-source models ran on a single A6000 48 GB GPU for under 12 hours.</p>
    </para>
    <para class="ltx_noindent" xml:id="S4.p2">
      <p><text font="bold">Evaluating Humans.</text>
A corresponding visual analogies task, developed using JsPsych <cite class="ltx_citemacro_citep">(<bibref bibrefs="de2015jspsych" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>, was administered to two groups of human participants. All methods were approved by IRB (protocol 2020-10-13755) prior to testing both child and adult participants.
We recruited 250 adults (21 to 40 years old) on Prolific <cite class="ltx_citemacro_citep">(<bibref bibrefs="prolific" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> to complete the benchmark such that every trial was annotated by 3-13 adults. We recruited 42 children (aged 3 to 5 years, <Math mode="inline" tex="mean" text="m * e * a * n" xml:id="S4.p2.m1">
          <XMath>
            <XMApp>
              <XMTok meaning="times" role="MULOP">⁢</XMTok>
              <XMTok font="italic" role="UNKNOWN">m</XMTok>
              <XMTok font="italic" role="UNKNOWN">e</XMTok>
              <XMTok font="italic" role="UNKNOWN">a</XMTok>
              <XMTok font="italic" role="UNKNOWN">n</XMTok>
            </XMApp>
          </XMath>
        </Math> = 4.07 years, <Math mode="inline" tex="se" text="s * e" xml:id="S4.p2.m2">
          <XMath>
            <XMApp>
              <XMTok meaning="times" role="MULOP">⁢</XMTok>
              <XMTok font="italic" role="UNKNOWN">s</XMTok>
              <XMTok font="italic" role="UNKNOWN">e</XMTok>
            </XMApp>
          </XMath>
        </Math> = 0.11 years) from early childhood centers and ChildrenHelpingScience <cite class="ltx_citemacro_citep">(<bibref bibrefs="childrenhelpingscience" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite> to complete a random subset of 10 trials (2 trials per transformation domain), totaling 420 responses. We evaluated an additional 10 children and 40 adults on KiVA-adults and found that none of the children performed better than chance. All participants completed a practice trial with an “unrelated” transformation (adding a dot to geometric shapes) and received feedback to ensure understanding. Participants who failed within three attempts were excluded. Those who succeeded proceeded to test trials without feedback, and were told that rewards depended on their performance. Adults were paid at least $12/hour with a bonus of $0.01 per correct response, while children received stickers based on their performance.</p>
    </para>
    <subsection inlist="toc" xml:id="S4.SS1">
      <tags>
        <tag>4.1</tag>
        <tag role="autoref">subsection 4.1</tag>
        <tag role="refnum">4.1</tag>
        <tag role="typerefnum">§4.1</tag>
      </tags>
      <title><tag close=" ">4.1</tag>Results</title>
      <figure inlist="lof" labels="LABEL:fig:result LABEL:fig:simple-child LABEL:fig:simple-full" placement="b" xml:id="S4.F4">
        <tags>
          <tag><text fontsize="90%">Figure 4</text></tag>
          <tag role="autoref">Figure 4</tag>
          <tag role="refnum">4</tag>
          <tag role="typerefnum">Figure 4</tag>
        </tags>
        <figure align="center" placement="b" xml:id="S4.F4.fig1">
          <graphics candidates="figures/figure4_data.pdf" class="ltx_centering" graphic="figures/figure4_data.pdf" options="width=433.62pt" xml:id="S4.F4.g1"/>
        </figure>
        <figure align="center" placement="b" xml:id="S4.F4.fig2">
          <graphics candidates="figures/figure5_data.pdf" class="ltx_centering" graphic="figures/figure5_data.pdf" options="width=433.62pt" xml:id="S4.F4.g2"/>
        </figure>
        <toccaption class="ltx_centering"><tag close=" ">4</tag><text font="bold">Human and model performance in <text font="italic">KiVA</text> sorted by Transformation Domain and color coded by Question Type</text> in samples annotated by children (top figure) and in the full benchmark annotated by adults (bottom figure). Error bars represent standard errors across object variations. Chance level is <Math mode="inline" tex="25\%" text="25percent" xml:id="S4.F4.m1">
            <XMath>
              <XMApp>
                <XMTok meaning="percent" role="POSTFIX">%</XMTok>
                <XMTok meaning="25" role="NUMBER">25</XMTok>
              </XMApp>
            </XMath>
          </Math> for Verbal Classification; <Math mode="inline" tex="33\%" text="33percent" xml:id="S4.F4.m2">
            <XMath>
              <XMApp>
                <XMTok meaning="percent" role="POSTFIX">%</XMTok>
                <XMTok meaning="33" role="NUMBER">33</XMTok>
              </XMApp>
            </XMath>
          </Math> for Verbal Specification and Visual Extrapolation.</toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 4</text></tag><text font="bold" fontsize="90%">Human and model performance in <text font="italic">KiVA</text> sorted by Transformation Domain and color coded by Question Type<text font="medium"> in samples annotated by children (top figure) and in the full benchmark annotated by adults (bottom figure). Error bars represent standard errors across object variations. Chance level is <Math mode="inline" tex="25\%" text="25percent" xml:id="S4.F4.m3">
                <XMath>
                  <XMApp>
                    <XMTok meaning="percent" role="POSTFIX">%</XMTok>
                    <XMTok meaning="25" role="NUMBER">25</XMTok>
                  </XMApp>
                </XMath>
              </Math> for Verbal Classification; <Math mode="inline" tex="33\%" text="33percent" xml:id="S4.F4.m4">
                <XMath>
                  <XMApp>
                    <XMTok meaning="percent" role="POSTFIX">%</XMTok>
                    <XMTok meaning="33" role="NUMBER">33</XMTok>
                  </XMApp>
                </XMath>
              </Math> for Verbal Specification and Visual Extrapolation.</text></text></caption>
      </figure>
      <figure inlist="lof" labels="LABEL:fig:hard-adults-models" placement="t" xml:id="S4.F5">
        <tags>
          <tag><text fontsize="90%">Figure 5</text></tag>
          <tag role="autoref">Figure 5</tag>
          <tag role="refnum">5</tag>
          <tag role="typerefnum">Figure 5</tag>
        </tags>
<!--  %****␣4_analysis.tex␣Line␣25␣**** -->        <graphics candidates="figures/figure5_kivaadults.pdf" class="ltx_centering" graphic="figures/figure5_kivaadults.pdf" options="width=433.62pt" xml:id="S4.F5.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">5</tag><text font="bold">Adult and model performance in <text font="italic">KiVA-adults</text> sorted by Transformation Domain and color coded by Question Type</text>. Error bars and chance levels are as described in Figure <ref labelref="LABEL:fig:result"/>.</toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 5</text></tag><text font="bold" fontsize="90%">Adult and model performance in <text font="italic">KiVA-adults</text> sorted by Transformation Domain and color coded by Question Type<text font="medium">. Error bars and chance levels are as described in Figure <ref labelref="LABEL:fig:result"/>.</text></text></caption>
      </figure>
      <para class="ltx_noindent" xml:id="S4.SS1.p1">
        <p><text font="bold">Models get worse with increasing reasoning complexity from verbal description to visual extrapolation, unlike humans.</text>
Overall, LMMs can detect transformations and identify the general visual domain of the transformations (e.g., color vs. size), as indicated by the blue bars labeled “Verbal Classification” in Figure <ref labelref="LABEL:fig:result"/> for KiVA and in Figure <ref labelref="LABEL:fig:hard-adults-models"/> for KiVA-adults. In KiVA, GPT-o1, GPT-4V and MANTIS even outperform children in categorizing rotation and color changes. However, performance generally declines when the models are asked to further specify the transformation within the correctly identified visual domain (e.g., rotating 90 degrees or 180 degrees if spatial orientation is the correctly identified domain), as reflected by the orange bars labeled “Verbal Specification.” Performance for visual extrapolation declines even more, as illustrated by the green bars labeled “Visual Extrapolation.” In other words, models’ success in verbally describing transformations does not guarantee their success in extrapolation. Part of the models’ failure in analogical reasoning is an inability to correctly recognize the given transformation. Another part of the models’ failure lies in extrapolating the correctly identified transformation to a novel object and predicting the corresponding outcome. Even when given the correct verbal specification of the transformation, models still fail to solve extrapolation in different visual domains (Appendix <ref labelref="LABEL:sec:given"/>). By contrast, even young children tested in KiVA can verbally describe the transformations as reflected by their significantly-above-chance performance in verbal classification and verbal specification, and can then use their selected verbal descriptions to extrapolate the visual transformations to new objects. Adults show near-perfect performance from verbal classification to visual extrapolation in both KiVA and KiVA-adults.</p>
      </para>
      <figure inlist="lof" labels="LABEL:fig:responsetimes" placement="b" xml:id="S4.F6">
        <tags>
          <tag><text fontsize="90%">Figure 6</text></tag>
          <tag role="autoref">Figure 6</tag>
          <tag role="refnum">6</tag>
          <tag role="typerefnum">Figure 6</tag>
        </tags>
        <graphics candidates="figures/Corr.pdf" class="ltx_centering" graphic="figures/Corr.pdf" options="width=433.62pt" xml:id="S4.F6.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">6</tag><text font="bold">Positive correlations between mean error scores of GPT-o1 and mean error scores of children (left) and mean response times of adults (right) in KiVA visual extrapolation.</text></toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 6</text></tag><text font="bold" fontsize="90%">Positive correlations between mean error scores of GPT-o1 and mean error scores of children (left) and mean response times of adults (right) in KiVA visual extrapolation.</text></caption>
      </figure>
      <para class="ltx_noindent" xml:id="S4.SS1.p2">
        <p><text font="bold">Model performance depends on the visual domain and correlates with human performance.</text>
Overall, models are better at classifying and describing color and size transformations than transformations in other domains (Figures <ref labelref="LABEL:fig:result"/> and <ref labelref="LABEL:fig:hard-adults-models"/>), which involve more discrete and local processing than the other domains <cite class="ltx_citemacro_citep">(<bibref bibrefs="zeki1991direct,zeng2020visual" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>. In KiVA-adults, the best-performing model GPT-o1 nears adult performance only in the color domain (Figure <ref labelref="LABEL:fig:hard-adults-models"/>). Models are less able to specify what changed within the visual domains of rotation, reflection, and number and consequently also did not perform well in extrapolations for those domains. In contrast, children and adults generally show similar performance across visual domains, with children performing slightly worse on rotation compared to other domains. Children’s error scores (1-Accuracy) and adults’ response times correlate with GPT-o1’s error scores in the visual extrapolation of KiVA, as demonstrated in Figure <ref labelref="LABEL:fig:responsetimes"/>.
What is cognitively demanding to humans is also more computationally challenging for GPT-o1.</p>
      </para>
      <para class="ltx_noindent" xml:id="S4.SS1.p3">
        <p><text font="bold">Models hallucinate where there is no change.</text>
For each type of transformation, we randomly sample 10% of the positive transformation trials and reassign them to transformations that involve no change. Only GPT-o1 correctly selects "no change" in both classification and specification across all visual domains, though it struggles to extrapolate this to new objects when distractors involve reflection or number change (Figure <ref labelref="LABEL:fig:no-change"/>). GPT-4V only accurately identifies "no change" in the verbal classification stage in the size domain. That said, when it does classify a trial as having no change, it consistently specifies that no change is involved (as reflected by the tall orange bars). In contrast, LLaVA-1.5 and MANTIS "hallucinate" a change in 100% of the no-change trials during verbal classification; although they can visually extrapolate the absence of change to some new objects, they are no better than chance.
<!--  %****␣4_analysis.tex␣Line␣50␣**** --></p>
      </para>
      <figure inlist="lof" labels="LABEL:fig:no-change" xml:id="S4.F7">
        <tags>
          <tag><text fontsize="90%">Figure 7</text></tag>
          <tag role="autoref">Figure 7</tag>
          <tag role="refnum">7</tag>
          <tag role="typerefnum">Figure 7</tag>
        </tags>
        <graphics candidates="figures/NoChange.pdf" class="ltx_centering" graphic="figures/NoChange.pdf" options="width=433.62pt, trim=0 0.2cm 0 0.3cm, clip=true" xml:id="S4.F7.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">7</tag><text font="bold">Model performance on trials involving no change.</text> Error bars and chance levels are as described in Figure <ref labelref="LABEL:fig:result"/>.</toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 7</text></tag><text font="bold" fontsize="90%">Model performance on trials involving no change.<text font="medium"> Error bars and chance levels are as described in Figure <ref labelref="LABEL:fig:result"/>.</text></text></caption>
      </figure>
      <para class="ltx_noindent" xml:id="S4.SS1.p4">
        <p><text font="bold">Models are inconsistent within the same trials and across reasoning steps.</text>
We measured model choice inconsistency by quantifying how often a model selects different responses in identical repeated trials in KiVA (Figure <ref labelref="LABEL:fig:consistencywithin"/>). Models are the most consistent in Verbal Classification and least consistent in Visual Extrapolation, particularly when reasoning about number, rotation and reflection. GPT-o1 and GPT-4V, but not LLaVA-1.5 and MANTIS, show higher extrapolation performance when they can verbally identify the transformation (Appendix <ref labelref="LABEL:sec:conditional"/>). When models are given the correct verbal specification in their weaker domains (number, rotation and reflection), they still fail visual extrapolation (Appendix <ref labelref="LABEL:sec:given"/>). This underscores a key limitation in the visual analogical reasoning of LMMs: knowing the correct rule does not reliably translate to extending that rule to a new context.</p>
      </para>
      <figure inlist="lof" labels="LABEL:fig:consistencywithin" placement="b" xml:id="S4.F8">
        <tags>
          <tag><text fontsize="90%">Figure 8</text></tag>
          <tag role="autoref">Figure 8</tag>
          <tag role="refnum">8</tag>
          <tag role="typerefnum">Figure 8</tag>
        </tags>
        <graphics candidates="figures/ConsistencyWithin.pdf" class="ltx_centering" graphic="figures/ConsistencyWithin.pdf" options="width=433.62pt" xml:id="S4.F8.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">8</tag><text font="bold">Proportion of inconsistent model responses within repeated trials.</text> Each model was evaluated on how many times out of the three repeated trials it did not select the same choice. The heat map shows choice inconsistency broken down by model, visual domain, and question type.</toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 8</text></tag><text font="bold" fontsize="90%">Proportion of inconsistent model responses within repeated trials.<text font="medium"> Each model was evaluated on how many times out of the three repeated trials it did not select the same choice. The heat map shows choice inconsistency broken down by model, visual domain, and question type.</text></text></caption>
      </figure>
      <para class="ltx_noindent" xml:id="S4.SS1.p5">
        <p><text font="bold">Verbal questions facilitate visual extrapolation in humans but the effects are less clear in GPT-o1.</text>
<!--  %****␣4_analysis.tex␣Line␣75␣**** -->We included verbal questions to reveal the step in the reasoning process where models might fail when making visual analogies. To assess the effects of verbal questions on subsequent visual extrapolation, we tested another 200 adults, 20 children and the best-performing model, GPT-o1, on a visual-extrapolation-only version of KiVA, removing the verbal questions to replicate previous visual analogy benchmarks. We focused on testing the three more challenging visual domains of KiVA: number, rotation and reflection. Without verbal questions, adults demonstrated similar accuracy but significantly slower response times, whereas children performed worse in extrapolation. The effects are less clear for GPT-o1: it performed equally well in the number domain, but was better at extrapolating object rotations and worse at extrapolating reflections when asked beforehand to reason about what changed and how it changed (Figure <ref labelref="LABEL:fig:extra-only"/>). While our verbal questions facilitate humans’ visual extrapolation performance, it is possible that reasoning models like GPT-o1 already reason about “what changed” and “how it changed” independently of our verbal queries. Future work should further explore the effects of guiding questions and chain-of-thought on reasoning models. At the same time, it may also be possible to solve KiVA without language, as in the case of a Large Vision Model <cite class="ltx_citemacro_citep">(<bibref bibrefs="bai2024sequential" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite> that is trained in the complete absence of linguistic data (see Section <ref labelref="LABEL:sec:LVM"/>).</p>
      </para>
      <figure inlist="lof" labels="LABEL:fig:extra-only" placement="t" xml:id="S4.F9">
        <tags>
          <tag><text fontsize="90%">Figure 9</text></tag>
          <tag role="autoref">Figure 9</tag>
          <tag role="refnum">9</tag>
          <tag role="typerefnum">Figure 9</tag>
        </tags>
        <graphics candidates="figures/VE.pdf" class="ltx_centering" graphic="figures/VE.pdf" options="width=433.62pt" xml:id="S4.F9.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">9</tag><text font="bold">Adults’ Mean Response Times, Children’s and GPT-o1’s Mean Accuracy in Visual Extrapolation with and without the three-step query.</text></toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 9</text></tag><text font="bold" fontsize="90%">Adults’ Mean Response Times, Children’s and GPT-o1’s Mean Accuracy in Visual Extrapolation with and without the three-step query.</text></caption>
      </figure>
      <para class="ltx_noindent" xml:id="S4.SS1.p6">
        <p><text font="bold">In-context learning and prompt engineering did not improve model performance.</text>
We explore whether model performance improves through careful prompt engineering (Appendix <ref labelref="LABEL:sec:appdx_prompts"/>), which has shown promising results on various tasks  <cite class="ltx_citemacro_citep">(<bibref bibrefs="wei2022chain,qin2021learning" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>. We consider four different prompt engineering methods: <text font="italic">1) Reasoning through code</text> <cite class="ltx_citemacro_citep">(<bibref bibrefs="sharma2024vision" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>: We first prompt the model to generate code snippets describing each transformation in the task, then rephrase the task question to incorporate the generated code. <text font="italic">2) Reasoning after Reflection</text> <cite class="ltx_citemacro_citep">(<bibref bibrefs="valmeekam2023can" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>: We ask the model to reflect on its answers two times for each question in the task. <text font="italic">3) Reasoning through instruction</text>: inspired by <cite class="ltx_citemacro_cite"><bibref bibrefs="wei2022chain" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
              <bibrefphrase>(</bibrefphrase>
              <bibrefphrase>)</bibrefphrase>
            </bibref></cite>, which shows that chain-of-thought reasoning is more effective on several benchmarks, we prompt the model to generate step-by-step instructions on how to answer each question, then use the instructions to generate an answer. <text font="italic">4) In-Context Learning</text> <cite class="ltx_citemacro_citep">(<bibref bibrefs="dong2022survey" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>: We give the model two randomly sampled examples with solutions for each concept before displaying the task. Apart from text prompt engineering, we experiment with different visual prompting for LLaVA-1.5. Recent works <cite class="ltx_citemacro_citep">(<bibref bibrefs="lvm,bar2022visual,painter" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite> show that visual model performance is sensitive to the alterations in color and size of the visual input. We apply two visual prompting approaches: <text font="italic">1) Color</text>: we alter the image background color (initially transparent) into black and white <cite class="ltx_citemacro_citep">(<bibref bibrefs="lvm" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>. <text font="italic">2) Size</text>: we apply a center crop to the images, varying the image size between 0.9 and 1. None of these approaches improve performance, which points to the challenging nature of our benchmark.</p>
      </para>
    </subsection>
    <subsection inlist="toc" xml:id="S4.SS2">
      <tags>
        <tag>4.2</tag>
        <tag role="autoref">subsection 4.2</tag>
        <tag role="refnum">4.2</tag>
        <tag role="typerefnum">§4.2</tag>
      </tags>
      <title><tag close=" ">4.2</tag>Discussion</title>
      <para xml:id="S4.SS2.p1">
        <p>Despite extensive training on image and text data, GPT-o1, GPT-4V, LLaVA-1.5, and MANTIS still cannot reason about spatial and numerical visual analogies like young children can. Although GPT‑o1 outperforms the other models, it falls short of child performance in reflection and number domains in KiVA and remains far from adult performance—except in the color domain of KiVA‑adults. Moreover, unlike human performance, model performance declines markedly from verbal description to visual extrapolation: for models, even correct transformation recognition does not guarantee successful extrapolation to a new object. This points to a fundamental challenge: mapping a transformation from a source object to a target while preserving relational structure <cite class="ltx_citemacro_citep">(<bibref bibrefs="gentner1983structure" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>. Future research should explore how vision and language each contribute to visual analogical reasoning.</p>
      </para>
      <para xml:id="S4.SS2.p2">
        <p>Human perception of feature-level changes like color or size is relatively straightforward, whereas appreciating reflection, rotation, and numerical changes requires active engagement, sequential tracking, and mental manipulation. Our findings align with prior studies showing that LMMs struggle with spatial reasoning <cite class="ltx_citemacro_citep">(<bibref bibrefs="rahmanzadehgervi2024vision,wang2024picture,wu2024surprising" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite> and counting <cite class="ltx_citemacro_citep">(<bibref bibrefs="jiang2024effectiveness,rahmanzadehgervi2024vision" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>. For humans, size and color changes are processed earlier in the visual pathway and in development <cite class="ltx_citemacro_citep">(<bibref bibrefs="zeki1991direct,zeng2020visual,day1981infant,milewski1975discrimination,ross2003development,slater1990size" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>, while spatial and numerical changes are more cognitively demanding. Although this convergence in performance between LMMs and human children does not imply that they are built or function identically, it is intriguing that similar trends emerge from such fundamentally different systems.</p>
      </para>
      <para xml:id="S4.SS2.p3">
        <p>KiVA is designed to assess visual change detection and analogical reasoning—the kinds of skills that children as young as three demonstrate. Our results show that LMMs underperform compared to humans, even with in-context learning and prompting, and future improvements may require approaches such as symbolic visual vocabularies and Bayesian inference <cite class="ltx_citemacro_citep">(<bibref bibrefs="depeweg2024solving" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>.</p>
      </para>
    </subsection>
  </section>
  <section inlist="toc" xml:id="S5">
    <tags>
      <tag>5</tag>
      <tag role="autoref">section 5</tag>
      <tag role="refnum">5</tag>
      <tag role="typerefnum">§5</tag>
    </tags>
    <title><tag close=" ">5</tag>Conclusion</title>
    <para xml:id="S5.p1">
      <p>Overall, large multimodal models remain less capable than humans at visual analogical reasoning. They can classify changes in images, but their ability to specify and extrapolate these changes to novel objects diminishes sharply. GPT‑o1 performs best—especially for color and size, which are surface features—but struggles with spatial and numerical analogies that likely require a deeper understanding of the 3D world. In contrast, humans excel at interpreting diverse object relations and transformations <cite class="ltx_citemacro_citep">(<bibref bibrefs="goddu2020transformations,mitchell2023comparing" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
            <bibrefphrase>, </bibrefphrase>
          </bibref>)</cite>.</p>
    </para>
    <para xml:id="S5.p2">
      <p>As models improve, our extended benchmarks (KiVA‑adults and KiVA‑compositionality) will probe more advanced analogical reasoning. So far, GPT‑o1 only reaches adult-level reasoning in the color domain, highlighting the need for further research into the complexities of visual cognition.</p>
    </para>
    <subsubsection xml:id="S5.SS0.SSSx1">
      <title>Acknowledgments</title>
      <para xml:id="S5.SS0.SSSx1.p1">
        <p>We are grateful to Joe Heyward, Viorica Patraucean, Mariel Goddu, Jefferson Ortega and the participants of the AI, Psychology, and Neuroscience workshop at the Simons Institute for discussion, to participants and their parents, local early childhood centers, and UC Berkeley undergrads who assisted in human data collection: Alexis Davis, Janna Umagat, Kate Choi, Kaydee Manikhong, Linda Marie Trevino, Nitya Sriram, Nora Chen, Ray Huang, Shivalika Jhabua and Weiyin Gao. This project was supported by Meta-BAIR Commons, CIFAR Catalyst Award: Causally guided exploration in children &amp; AI, and ONR MURI Self-Learning Perception Through Real-World Interaction.</p>
      </para>
    </subsubsection>
  </section>
  <bibliography citestyle="authoryear" files="iclr2025_conference" xml:id="bib">
    <title>References</title>
  </bibliography>
<!--  %****␣iclr2025_conference.tex␣Line␣75␣**** -->  <appendix inlist="toc" labels="LABEL:sec:appdx_prompts" xml:id="A1">
    <tags>
      <tag>Appendix A</tag>
      <tag role="autoref">Appendix A</tag>
      <tag role="refnum">A</tag>
      <tag role="typerefnum">Appendix A</tag>
    </tags>
    <title><tag close=" ">Appendix A</tag>Visual Analogical Reasoning Prompts</title>
    <toctitle><tag close=" ">A</tag>Visual Analogical Reasoning Prompts</toctitle>
    <subsection inlist="toc" labels="LABEL:sec:Visual_Extrapolation" xml:id="A1.SS1">
      <tags>
        <tag>A.1</tag>
        <tag role="autoref">subsection A.1</tag>
        <tag role="refnum">A.1</tag>
        <tag role="typerefnum">§A.1</tag>
      </tags>
      <title><tag close=" ">A.1</tag>Stitched visual extrapolation examples for each domain</title>
      <para xml:id="A1.SS1.p1">
        <p><text font="bold">Visual Extrapolation.</text> As the final step of the querying process, we present an image of a new object and ask the model to predict what the object will look like if it goes through the same change as the given transformation.</p>
      </para>
      <figure inlist="lof" labels="LABEL:fig:combined_extrapolation_examples" placement="H" xml:id="A1.F10">
        <tags>
          <tag><text fontsize="90%">Figure 10</text></tag>
          <tag role="autoref">Figure 10</tag>
          <tag role="refnum">10</tag>
          <tag role="typerefnum">Figure 10</tag>
        </tags>
        <figure align="center" inlist="lof" labels="LABEL:fig:testegReflect" placement="b" xml:id="A1.F9.sf1">
          <tags>
            <tag><text fontsize="90%">(a)</text></tag>
            <tag role="autoref">(a)</tag>
            <tag role="refnum">9(a)</tag>
          </tags>
          <graphics candidates="figures/StitchedExtrapolationEgReflect.jpg" class="ltx_centering" graphic="figures/StitchedExtrapolationEgReflect.jpg" options="width=433.62pt" xml:id="A1.F9.sf1.g1"/>
          <toccaption class="ltx_centering"><tag close=" ">(a)</tag>Example of a visual extrapolation trial involving a <text font="bold">reflection</text>.</toccaption>
          <caption class="ltx_centering"><tag close=" "><text fontsize="90%">(a)</text></tag><text fontsize="90%">Example of a visual extrapolation trial involving a <text font="bold">reflection</text>.</text></caption>
        </figure>
        <figure align="center" inlist="lof" labels="LABEL:fig:testeg2DRotation" placement="b" xml:id="A1.F9.sf2">
          <tags>
            <tag><text fontsize="90%">(b)</text></tag>
            <tag role="autoref">(b)</tag>
            <tag role="refnum">9(b)</tag>
          </tags>
          <graphics candidates="figures/StitchedExtrapolationEg2DRotation.jpg" class="ltx_centering" graphic="figures/StitchedExtrapolationEg2DRotation.jpg" options="width=433.62pt" xml:id="A1.F9.sf2.g1"/>
          <toccaption class="ltx_centering"><tag close=" ">(b)</tag>Example of a visual extrapolation trial involving an angular <text font="bold">rotation</text>.</toccaption>
          <caption class="ltx_centering"><tag close=" "><text fontsize="90%">(b)</text></tag><text fontsize="90%">Example of a visual extrapolation trial involving an angular <text font="bold">rotation</text>.</text></caption>
        </figure>
        <figure align="center" inlist="lof" labels="LABEL:fig:testegResize" placement="b" xml:id="A1.F9.sf3">
          <tags>
            <tag><text fontsize="90%">(c)</text></tag>
            <tag role="autoref">(c)</tag>
            <tag role="refnum">9(c)</tag>
          </tags>
<!--  %****␣Appendix.tex␣Line␣25␣**** -->          <graphics candidates="figures/StitchedExtrapolationEgSize.jpg" class="ltx_centering" graphic="figures/StitchedExtrapolationEgSize.jpg" options="width=433.62pt" xml:id="A1.F9.sf3.g1"/>
          <toccaption class="ltx_centering"><tag close=" ">(c)</tag>Example of a visual extrapolation trial involving a <text font="bold">size change</text>.</toccaption>
          <caption class="ltx_centering"><tag close=" "><text fontsize="90%">(c)</text></tag><text fontsize="90%">Example of a visual extrapolation trial involving a <text font="bold">size change</text>.</text></caption>
        </figure>
        <figure align="center" inlist="lof" labels="LABEL:fig:testegCounting" placement="b" xml:id="A1.F9.sf4">
          <tags>
            <tag><text fontsize="90%">(d)</text></tag>
            <tag role="autoref">(d)</tag>
            <tag role="refnum">9(d)</tag>
          </tags>
          <graphics candidates="figures/StitchedExtrapolationEgCounting.jpg" class="ltx_centering" graphic="figures/StitchedExtrapolationEgCounting.jpg" options="width=433.62pt" xml:id="A1.F9.sf4.g1"/>
          <toccaption class="ltx_centering"><tag close=" ">(d)</tag>Example of a visual extrapolation trial involving a <text font="bold">number change</text>.</toccaption>
          <caption class="ltx_centering"><tag close=" "><text fontsize="90%">(d)</text></tag><text fontsize="90%">Example of a visual extrapolation trial involving a <text font="bold">number change</text>.</text></caption>
        </figure>
        <figure align="center" inlist="lof" labels="LABEL:fig:testegColour" placement="b" xml:id="A1.F9.sf5">
          <tags>
            <tag><text fontsize="90%">(e)</text></tag>
            <tag role="autoref">(e)</tag>
            <tag role="refnum">9(e)</tag>
          </tags>
          <graphics candidates="figures/StitchedExtrapolationEgColour.jpg" class="ltx_centering" graphic="figures/StitchedExtrapolationEgColour.jpg" options="width=433.62pt" xml:id="A1.F9.sf5.g1"/>
          <toccaption class="ltx_centering"><tag close=" ">(e)</tag>Example of a visual extrapolation trial involving a <text font="bold">color change</text>.</toccaption>
          <caption class="ltx_centering"><tag close=" "><text fontsize="90%">(e)</text></tag><text fontsize="90%">Example of a visual extrapolation trial involving a <text font="bold">color change</text>.</text></caption>
        </figure>
        <toccaption class="ltx_centering"><tag close=" ">10</tag>Examples of visual extrapolation trials for different transformations.</toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 10</text></tag><text fontsize="90%">Examples of visual extrapolation trials for different transformations.</text></caption>
      </figure>
    </subsection>
    <subsection inlist="toc" labels="LABEL:sec:prompteg" xml:id="A1.SS2">
      <tags>
        <tag>A.2</tag>
        <tag role="autoref">subsection A.2</tag>
        <tag role="refnum">A.2</tag>
        <tag role="typerefnum">§A.2</tag>
      </tags>
      <title><tag close=" ">A.2</tag>Prompting of Models and Human Adults</title>
<!--  %****␣Appendix.tex␣Line␣50␣**** -->      <para xml:id="A1.SS2.p1">
        <p>We first include a system prompt to orient the models for visual analogical reasoning.
<text font="italic">You are an excellent visual puzzle solver! You will be given a visual puzzle that requires using visual analogical reasoning.</text>
For models, we include a chain-of-thought prompt.
<text font="italic">You will think "step-by-step" and carefully examine the visual evidence before providing an answer.</text>
For human adults, we additionally include the following prompt to motivate their participation. <text font="italic">At the end of the experiment, you will see the total number of correct answers you provided. Each correct answer will convert to $0.01 additional compensation for your study participation.</text>
Then we provide an initial instruction prompt:
<text font="italic">You are given a visual puzzle. The puzzle features a left-to-right transformation of an object on top and three left-to-right transformations of a different object on the bottom marked by (A) or (B) or (C). The transformations involve a change of either the size, orientation, number, or color of an object.</text></p>
      </para>
      <para xml:id="A1.SS2.p2">
        <enumerate xml:id="A1.I1">
          <item xml:id="A1.I1.i1">
            <tags>
              <tag>1.</tag>
              <tag role="autoref">item 1</tag>
              <tag role="refnum">1</tag>
              <tag role="typerefnum">item 1</tag>
            </tags>
            <para xml:id="A1.I1.i1.p1">
              <p><text font="bold">Verbal Classification (“<text font="italic">what</text>”).</text></p>
            </para>
            <para xml:id="A1.I1.i1.p2">
              <p><text font="italic">“Which one of the following rules  best describes the left-to-right transformation on top of the puzzle where the picture on the left transforms to the picture on the right? Answer with the correct rule number surrounded by parentheses, then provide a "step-by-step" reasoning for your choice."</text></p>
            </para>
          </item>
          <item xml:id="A1.I1.i2">
            <tags>
              <tag>2.</tag>
              <tag role="autoref">item 2</tag>
              <tag role="refnum">2</tag>
              <tag role="typerefnum">item 2</tag>
            </tags>
            <para xml:id="A1.I1.i2.p1">
              <p><text font="bold">Verbal Specification (“<text font="italic">how</text>”).</text></p>
            </para>
            <para xml:id="A1.I1.i2.p2">
              <p><text font="italic">“Which one of the following rules  best describes the left-to-right transformation in the top of the puzzle where the picture on the left transforms to the picture on the right? Answer with the correct rule number surrounded by parentheses. Then provide a "step-by-step" reasoning for your choice."</text></p>
            </para>
          </item>
          <item xml:id="A1.I1.i3">
            <tags>
              <tag>3.</tag>
              <tag role="autoref">item 3</tag>
              <tag role="refnum">3</tag>
              <tag role="typerefnum">item 3</tag>
            </tags>
            <para xml:id="A1.I1.i3.p1">
              <p><text font="bold">Visual Extrapolation.</text></p>
            </para>
            <para xml:id="A1.I1.i3.p2">
              <p><text font="italic">“Which one of the three left-to-right object transformations (marked by either (A), (B) or (C)) on the bottom of the puzzle is the same as the left-to-right transformation on the top of the puzzle? Answer with the correct letter surrounded by parentheses (or (D) if none of the options apply), then provide a "step-by-step" reasoning for your choice."</text></p>
            </para>
          </item>
        </enumerate>
      </para>
<!--  %****␣Appendix.tex␣Line␣75␣**** -->    </subsection>
    <subsection inlist="toc" labels="LABEL:sec:child-prompt" xml:id="A1.SS3">
      <tags>
        <tag>A.3</tag>
        <tag role="autoref">subsection A.3</tag>
        <tag role="refnum">A.3</tag>
        <tag role="typerefnum">§A.3</tag>
      </tags>
      <title><tag close=" ">A.3</tag>Prompting Human Children</title>
      <para xml:id="A1.SS3.p1">
        <p>All verbal instructions are read out loud to children by a human experimenter. We first provide a context to motivate children’s participation in the experiment.
<text font="italic">You are on a mission as a picture detective. You will see how different pictures change. Your job as a picture detective is to figure out how the pictures change, and to guess how a new picture would change based on that. These pictures can change in size, where they face, number, or color. Every time you answer correctly, you will get a coin. You won’t find out how many coins you get until the end of the game. At the end of the game, you will see the total number of coins you win. The more coins you get, the more stickers you win.</text></p>
      </para>
      <para xml:id="A1.SS3.p2">
        <enumerate xml:id="A1.I2">
          <item xml:id="A1.I2.i1">
            <tags>
              <tag>1.</tag>
              <tag role="autoref">item 1</tag>
              <tag role="refnum">1</tag>
              <tag role="typerefnum">item 1</tag>
            </tags>
            <para xml:id="A1.I2.i1.p1">
              <p><text font="bold">Verbal Classification (“<text font="italic">what</text>”).</text></p>
            </para>
            <para xml:id="A1.I2.i1.p2">
              <p><text font="italic">“Here are two pictures separated by a black line in the middle. The picture on the left turns into the picture on the right. Do you think there is a change? What do you think the change is?"</text></p>
            </para>
          </item>
          <item xml:id="A1.I2.i2">
            <tags>
              <tag>2.</tag>
              <tag role="autoref">item 2</tag>
              <tag role="refnum">2</tag>
              <tag role="typerefnum">item 2</tag>
            </tags>
            <para xml:id="A1.I2.i2.p1">
              <p><text font="bold">Verbal Specification (“<text font="italic">how</text>”).</text></p>
            </para>
            <para xml:id="A1.I2.i2.p2">
              <p><text font="italic">“Can you say more about the change from the left to the right?"</text></p>
            </para>
          </item>
          <item xml:id="A1.I2.i3">
            <tags>
              <tag>3.</tag>
              <tag role="autoref">item 3</tag>
              <tag role="refnum">3</tag>
              <tag role="typerefnum">item 3</tag>
            </tags>
            <para xml:id="A1.I2.i3.p1">
              <p><text font="bold">Visual Extrapolation.</text></p>
            </para>
            <para xml:id="A1.I2.i3.p2">
              <p><text font="italic">“Here is another picture that goes through the same change from the left to right. Can you find the box that shows the same change?"</text></p>
            </para>
          </item>
        </enumerate>
      </para>
      <para xml:id="A1.SS3.p3">
        <p>Note that the prompt used for children did not improve model or human adult performance.</p>
      </para>
    </subsection>
    <subsection inlist="toc" labels="LABEL:sec:reflect" xml:id="A1.SS4">
      <tags>
        <tag>A.4</tag>
        <tag role="autoref">subsection A.4</tag>
        <tag role="refnum">A.4</tag>
        <tag role="typerefnum">§A.4</tag>
      </tags>
      <title><tag close=" ">A.4</tag>Prompting models through reflection and self-critique</title>
<!--  %****␣Appendix.tex␣Line␣100␣**** -->      <para xml:id="A1.SS4.p1">
        <enumerate xml:id="A1.I3">
          <item xml:id="A1.I3.i1">
            <tags>
              <tag>1.</tag>
              <tag role="autoref">item 1</tag>
              <tag role="refnum">1</tag>
              <tag role="typerefnum">item 1</tag>
            </tags>
            <para xml:id="A1.I3.i1.p1">
              <p><text font="bold">Verbal Classification (“<text font="italic">what</text>”).</text></p>
            </para>
            <para xml:id="A1.I3.i1.p2">
              <p><text font="italic">“Which one of the following rules  best describes the left-to-right transformation on top of the puzzle where the picture on the left transforms to the picture on the right? Answer with the correct rule number surrounded by parentheses, then provide a "step-by-step" reasoning for your choice. Please reflect on your answer and provide a revised response if necessary."</text></p>
            </para>
            <para xml:id="A1.I3.i1.p3">
              <p>(repeat three times following model output) <text font="italic">Start your response with your updated answer.</text></p>
            </para>
          </item>
          <item xml:id="A1.I3.i2">
            <tags>
              <tag>2.</tag>
              <tag role="autoref">item 2</tag>
              <tag role="refnum">2</tag>
              <tag role="typerefnum">item 2</tag>
            </tags>
            <para xml:id="A1.I3.i2.p1">
              <p><text font="bold">Verbal Specification (“<text font="italic">how</text>”).</text></p>
            </para>
            <para xml:id="A1.I3.i2.p2">
              <p><text font="italic">“Which one of the following rules  best describes the left-to-right transformation in the top of the puzzle where the picture on the left transforms to the picture on the right? Answer with the correct rule number surrounded by parentheses, then provide a "step-by-step" reasoning for your choice. Please reflect on your answer and provide a revised response if necessary."</text></p>
            </para>
            <para xml:id="A1.I3.i2.p3">
              <p>(repeat three times following model output) <text font="italic">Start your response with your updated answer.</text></p>
            </para>
          </item>
          <item xml:id="A1.I3.i3">
            <tags>
              <tag>3.</tag>
              <tag role="autoref">item 3</tag>
              <tag role="refnum">3</tag>
              <tag role="typerefnum">item 3</tag>
            </tags>
            <para xml:id="A1.I3.i3.p1">
              <p><text font="bold">Visual Extrapolation.</text></p>
            </para>
            <para xml:id="A1.I3.i3.p2">
              <p><text font="italic">“Which one of three left-to-right object transformations (marked by either (A), (B) or (C)) on the bottom of the puzzle is the same as the left-to-right transformation on the top of the puzzle? Answer with the correct letter surrounded by parentheses (or (D) if none of the options apply), then provide a "step-by-step" reasoning for your choice. Please reflect on your answer and provide a revised response if necessary."</text></p>
            </para>
            <para xml:id="A1.I3.i3.p3">
              <p>(repeat three times following model output) <text font="italic">Start your response with your updated answer.</text></p>
            </para>
          </item>
        </enumerate>
      </para>
    </subsection>
    <subsection inlist="toc" labels="LABEL:sec:instruct" xml:id="A1.SS5">
      <tags>
        <tag>A.5</tag>
        <tag role="autoref">subsection A.5</tag>
        <tag role="refnum">A.5</tag>
        <tag role="typerefnum">§A.5</tag>
      </tags>
      <title><tag close=" ">A.5</tag>Prompting models through instructions</title>
<!--  %****␣Appendix.tex␣Line␣125␣**** -->      <para xml:id="A1.SS5.p1">
        <enumerate xml:id="A1.I4">
          <item xml:id="A1.I4.i1">
            <tags>
              <tag>1.</tag>
              <tag role="autoref">item 1</tag>
              <tag role="refnum">1</tag>
              <tag role="typerefnum">item 1</tag>
            </tags>
            <para xml:id="A1.I4.i1.p1">
              <p><text font="bold">Verbal Classification (“<text font="italic">what</text>”).</text></p>
            </para>
            <para xml:id="A1.I4.i1.p2">
              <p><text font="italic">“Which one of the following rules  best describes the left-to-right transformation on top of the puzzle where the picture on the left transforms to the picture on the right? Answer with the correct rule number surrounded by parentheses, then provide a “step-by-step” reasoning for your choice."</text></p>
            </para>
          </item>
          <item xml:id="A1.I4.i2">
            <tags>
              <tag>2.</tag>
              <tag role="autoref">item 2</tag>
              <tag role="refnum">2</tag>
              <tag role="typerefnum">item 2</tag>
            </tags>
            <para xml:id="A1.I4.i2.p1">
              <p><text font="bold">Verbal Specification (“<text font="italic">how</text>”).</text></p>
            </para>
            <para xml:id="A1.I4.i2.p2">
              <p><text font="italic">“Provide brief instructions on how to establish if a transformation involves an object rotates 90 degrees or 180 degrees. Use the instructions from before to answer the following question: Which one of the following rules  best describes the transformation in the top of the puzzle where the picture on the left transforms to the picture on the right? Answer with the correct rule number surrounded by parentheses, then provide a “step-by-step” reasoning for your choice."</text></p>
            </para>
          </item>
          <item xml:id="A1.I4.i3">
            <tags>
              <tag>3.</tag>
              <tag role="autoref">item 3</tag>
              <tag role="refnum">3</tag>
              <tag role="typerefnum">item 3</tag>
            </tags>
            <para xml:id="A1.I4.i3.p1">
              <p><text font="bold">Visual Extrapolation.</text></p>
            </para>
            <para xml:id="A1.I4.i3.p2">
              <p><text font="italic">“Provide brief instructions on how to determine which one of three left-to-right object transformations (marked by either (A), (B) or (C) ) on the bottom of the puzzle is the same as the left-to-right transformation on the top of the puzzle? Use the instructions from before to determine which one of three left-to-right object transformations (marked by either (A), (B) or (C) ) on the bottom of the puzzle is the same as the left-to-right transformation on the top of the puzzle? Answer with the correct letter surrounded by parentheses (or (D) if none of the options apply), then provide a step-by-step reasoning for your choice."</text></p>
            </para>
          </item>
        </enumerate>
      </para>
    </subsection>
    <subsection inlist="toc" labels="LABEL:sec:code" xml:id="A1.SS6">
      <tags>
        <tag>A.6</tag>
        <tag role="autoref">subsection A.6</tag>
        <tag role="refnum">A.6</tag>
        <tag role="typerefnum">§A.6</tag>
      </tags>
      <title><tag close=" ">A.6</tag>Prompting models through code</title>
      <para xml:id="A1.SS6.p1">
        <enumerate xml:id="A1.I5">
          <item xml:id="A1.I5.i1">
            <tags>
              <tag>1.</tag>
              <tag role="autoref">item 1</tag>
              <tag role="refnum">1</tag>
              <tag role="typerefnum">item 1</tag>
            </tags>
            <para xml:id="A1.I5.i1.p1">
              <p><text font="bold">Verbal Classification (“<text font="italic">what</text>”).</text></p>
            </para>
            <para xml:id="A1.I5.i1.p2">
              <p><text font="italic">“Which one of the following rules  best describes the left-to-right transformation on top of the puzzle where the picture on the left transforms to the picture on the right? Answer with the correct rule number surrounded by parentheses, then provide a "step-by-step" reasoning for your choice."</text></p>
            </para>
          </item>
          <item xml:id="A1.I5.i2">
            <tags>
              <tag>2.</tag>
              <tag role="autoref">item 2</tag>
              <tag role="refnum">2</tag>
              <tag role="typerefnum">item 2</tag>
            </tags>
            <para xml:id="A1.I5.i2.p1">
              <p><text font="bold">Verbal Specification (“<text font="italic">how</text>”).</text></p>
            </para>
<!--  %****␣Appendix.tex␣Line␣150␣**** -->            <para xml:id="A1.I5.i2.p2">
              <p><text font="italic">“Generate python code using the package pillow that takes in the left image in the left-to-right transformation on top and outputs the right image. Denote this snippet as training snippet. Using the insights from the training code snippet, which one of the following rules  best describes the left-to-right transformation in the top of the puzzle where the picture on the left transforms to the picture on the right? Answer with the correct rule number surrounded by parentheses, then provide a "step-by-step" reasoning for your choice."</text></p>
            </para>
          </item>
          <item xml:id="A1.I5.i3">
            <tags>
              <tag>3.</tag>
              <tag role="autoref">item 3</tag>
              <tag role="refnum">3</tag>
              <tag role="typerefnum">item 3</tag>
            </tags>
            <para xml:id="A1.I5.i3.p1">
              <p><text font="bold">Visual Extrapolation.</text></p>
            </para>
            <para xml:id="A1.I5.i3.p2">
              <p><text font="italic">“Generate a brief code snippet using python and the pillow package for each left-to-right transformation in the bottom. Each snippet takes in the left picture of the transformation and outputs the right one. Now which one of three code snippets is the same as the training code snippet you have produced before? Answer with the correct snippet letter ((A) or (B) or (C)) surrounded by parentheses (or (D) if none of the options apply), then provide a "step-by-step" reasoning for your choice."</text></p>
            </para>
          </item>
        </enumerate>
      </para>
    </subsection>
  </appendix>
  <appendix inlist="toc" labels="LABEL:sec:extra-results" xml:id="A2">
    <tags>
      <tag>Appendix B</tag>
      <tag role="autoref">Appendix B</tag>
      <tag role="refnum">B</tag>
      <tag role="typerefnum">Appendix B</tag>
    </tags>
    <title><tag close=" ">Appendix B</tag>Additional model analyses</title>
    <toctitle><tag close=" ">B</tag>Additional model analyses</toctitle>
    <subsection inlist="toc" labels="LABEL:sec:multi-single" xml:id="A2.SS1">
      <tags>
        <tag>B.1</tag>
        <tag role="autoref">subsection B.1</tag>
        <tag role="refnum">B.1</tag>
        <tag role="typerefnum">§B.1</tag>
      </tags>
      <title><tag close=" ">B.1</tag>Effects of Multi-image versus Single-image presentation on GPT-o1’s Visual Extrapolation</title>
      <para xml:id="A2.SS1.p1">
        <p>We evaluate whether or not GPT-o1 does indeed benefit from multi-image presentation, in which the given transformation and the three test transformation options are provided to the model as four separate images, as opposed to combining everything into a single image, as described in <cite class="ltx_citemacro_citep">(<bibref bibrefs="campbell2025understanding" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>. GPT-o1 shows significantly better performance in visual extrapolation of color, size and number, but not for rotation and reflection (Figure <ref labelref="LABEL:fig:multisingle"/>), suggesting that the challenge in the latter two domains goes beyond a visual binding problem described in <cite class="ltx_citemacro_cite"><bibref bibrefs="campbell2025understanding" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
              <bibrefphrase>(</bibrefphrase>
              <bibrefphrase>)</bibrefphrase>
            </bibref></cite>.</p>
      </para>
      <figure inlist="lof" labels="LABEL:fig:multisingle" placement="b" xml:id="A2.F11">
        <tags>
          <tag><text fontsize="90%">Figure 11</text></tag>
          <tag role="autoref">Figure 11</tag>
          <tag role="refnum">11</tag>
          <tag role="typerefnum">Figure 11</tag>
        </tags>
        <graphics candidates="figures/MultivsSingle.pdf" class="ltx_centering" graphic="figures/MultivsSingle.pdf" options="width=303.534pt" xml:id="A2.F11.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">11</tag><text font="bold">Visual Extrapolation performance of GPT-o1 under Multi- versus Single-image presentations</text>.</toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 11</text></tag><text font="bold" fontsize="90%">Visual Extrapolation performance of GPT-o1 under Multi- versus Single-image presentations<text font="medium">.</text></text></caption>
      </figure>
    </subsection>
    <subsection inlist="toc" labels="LABEL:sec:conditional" xml:id="A2.SS2">
      <tags>
        <tag>B.2</tag>
        <tag role="autoref">subsection B.2</tag>
        <tag role="refnum">B.2</tag>
        <tag role="typerefnum">§B.2</tag>
      </tags>
      <title><tag close=" ">B.2</tag>Models’ extrapolation performance based on previous verbal reasoning</title>
<!--  %****␣Appendix.tex␣Line␣175␣**** -->      <para xml:id="A2.SS2.p1">
        <p>Furthermore, we report models’ extrapolation performance <text font="italic">conditional</text> on succeeding (green) or failing (red) at the previous steps of verbal reasoning in Figure <ref labelref="LABEL:fig:correct-incorrect-verbal"/>. GPT-o1 exhibits significantly higher visual extrapolation accuracy when its preceding verbal reasoning is correct across all transformation domains, whereas GPT-4V shows this benefit only in the color and size domains. In other words, successful visual extrapolation is contingent on solving verbal classification or specification correctly when models are solving KiVA above chance level. Meanwhile, there is no conditional dependence of subsequent visual extrapolation on prior verbal reasoning in LLaVA-1.5 and MANTIS, and they also perform no better than chance level on KiVA.</p>
      </para>
      <figure inlist="lof" labels="LABEL:fig:correct-incorrect-verbal" placement="t" xml:id="A2.F12">
        <tags>
          <tag><text fontsize="90%">Figure 12</text></tag>
          <tag role="autoref">Figure 12</tag>
          <tag role="refnum">12</tag>
          <tag role="typerefnum">Figure 12</tag>
        </tags>
        <graphics candidates="figures/CI.pdf" class="ltx_centering" graphic="figures/CI.pdf" options="width=433.62pt" xml:id="A2.F12.g1"/>
        <toccaption class="ltx_centering"><tag close=" ">12</tag><text font="bold">Visual Extrapolation performance of models following <text font="italic">Correct</text> and <text font="italic">Incorrect</text> verbal classification / specification, sorted by transformation domain.</text> Standard errors are in parentheses. (Note that verbal specification is only asked if verbal classification is correct.)</toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 12</text></tag><text font="bold" fontsize="90%">Visual Extrapolation performance of models following <text font="italic">Correct</text> and <text font="italic">Incorrect</text> verbal classification / specification, sorted by transformation domain.<text font="medium"> Standard errors are in parentheses. (Note that verbal specification is only asked if verbal classification is correct.)</text></text></caption>
      </figure>
    </subsection>
    <subsection inlist="toc" labels="LABEL:sec:given" xml:id="A2.SS3">
      <tags>
        <tag>B.3</tag>
        <tag role="autoref">subsection B.3</tag>
        <tag role="refnum">B.3</tag>
        <tag role="typerefnum">§B.3</tag>
      </tags>
      <title><tag close=" ">B.3</tag>Models’ performance when given correct previous verbal reasoning step</title>
      <para xml:id="A2.SS3.p1">
        <p>10% of transformation trials were randomly sampled to evaluate if model performance across the three weaker performing visual domains (number, rotation and reflection) would improve when given the correct answer to the previous reasoning step. In one experiment, we provided the correct verbal classification answer and evaluated models’ verbal specification (Figure <ref labelref="LABEL:fig:given-correct-cross"/>). In another experiment, we provided the correct verbal specification answer and evaluated models’ visual extrapolation (Figure <ref labelref="LABEL:fig:given-correct-within"/>). Overall, having the ground truth for the preceding verbal reasoning step did not guarantee much success in the subsequent verbal specification or visual extrapolation tasks.</p>
      </para>
      <figure inlist="lof" labels="LABEL:fig:given-correct" placement="b" xml:id="A2.F13">
        <tags>
          <tag><text fontsize="90%">Figure 13</text></tag>
          <tag role="autoref">Figure 13</tag>
          <tag role="refnum">13</tag>
          <tag role="typerefnum">Figure 13</tag>
        </tags>
        <figure align="center" inlist="lof" labels="LABEL:fig:given-correct-cross" placement="b" xml:id="A2.F12.sf1">
          <tags>
            <tag><text fontsize="90%">(a)</text></tag>
            <tag role="autoref">(a)</tag>
            <tag role="refnum">12(a)</tag>
          </tags>
          <graphics candidates="figures/GivenCorrectCross.pdf" class="ltx_centering" graphic="figures/GivenCorrectCross.pdf" options="width=433.62pt" xml:id="A2.F12.sf1.g1"/>
          <toccaption class="ltx_centering"><tag close=" ">(a)</tag>Subsequent Verbal Specification performance when given Correct Verbal Classification.</toccaption>
          <caption class="ltx_centering"><tag close=" "><text fontsize="90%">(a)</text></tag><text fontsize="90%">Subsequent Verbal Specification performance when given Correct Verbal Classification.</text></caption>
        </figure>
        <figure align="center" inlist="lof" labels="LABEL:fig:given-correct-within" placement="b" xml:id="A2.F12.sf2">
          <tags>
            <tag><text fontsize="90%">(b)</text></tag>
            <tag role="autoref">(b)</tag>
            <tag role="refnum">12(b)</tag>
          </tags>
<!--  %****␣Appendix.tex␣Line␣200␣**** -->          <graphics candidates="figures/GivenCorrectWithin.pdf" class="ltx_centering" graphic="figures/GivenCorrectWithin.pdf" options="width=433.62pt" xml:id="A2.F12.sf2.g1"/>
          <toccaption class="ltx_centering"><tag close=" ">(b)</tag>Subsequent Visual Extrapolation performance when given Correct Verbal Specification.</toccaption>
          <caption class="ltx_centering"><tag close=" "><text fontsize="90%">(b)</text></tag><text fontsize="90%">Subsequent Visual Extrapolation performance when given Correct Verbal Specification.</text></caption>
        </figure>
        <toccaption class="ltx_centering"><tag close=" ">13</tag><text font="bold">Subsequent performance of models when given correct verbal details</text> in KiVA, sorted by transformation domain.</toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 13</text></tag><text font="bold" fontsize="90%">Subsequent performance of models when given correct verbal details<text font="medium"> in KiVA, sorted by transformation domain.</text></text></caption>
      </figure>
    </subsection>
    <subsection inlist="toc" labels="LABEL:lastpage LABEL:sec:LVM" xml:id="A2.SS4">
      <tags>
        <tag>B.4</tag>
        <tag role="autoref">subsection B.4</tag>
        <tag role="refnum">B.4</tag>
        <tag role="typerefnum">§B.4</tag>
      </tags>
      <title><tag close=" ">B.4</tag>A Large Vision Model’s Visual Extrapolation Performance on KiVA</title>
      <para xml:id="A2.SS4.p1">
        <p>We further examined whether a large vision model <cite class="ltx_citemacro_citep">(<bibref bibrefs="bai2024sequential" separator=";" show="AuthorsPhrase1Year" yyseparator=",">
              <bibrefphrase>, </bibrefphrase>
            </bibref>)</cite>, trained in the absence of any linguistic data, can solve KiVA. Since the large vision model does not contain text descriptions, we stitch object transformations by adopting the framework described in Section 5.3 of <cite class="ltx_citemacro_cite"><bibref bibrefs="bai2024sequential" separator=";" show="Authors Phrase1YearPhrase2" yyseparator=",">
              <bibrefphrase>(</bibrefphrase>
              <bibrefphrase>)</bibrefphrase>
            </bibref></cite> and prompt the model to generate the missing part in the bottom right corner (see Figure <ref labelref="LABEL:fig:lvm_prompt"/> for an example of the image prompt). The prediction with the lowest perplexity is determined as the model’s answer. Even in the absence of any language to reason about what changed, how it changed, and how to extend the change to a new object, the large vision model can solve some visual analogies (Figure <ref labelref="LABEL:fig:lvm_VE"/>). Interestingly, resembling large multimodal models, the large vision model is more capable of reasoning analogically in terms of color and size than in number and space.</p>
      </para>
      <figure inlist="lof" xml:id="A2.F14">
        <tags>
          <tag><text fontsize="90%">Figure 14</text></tag>
          <tag role="autoref">Figure 14</tag>
          <tag role="refnum">14</tag>
          <tag role="typerefnum">Figure 14</tag>
        </tags>
        <figure align="center" inlist="lof" labels="LABEL:fig:lvm_prompt" placement="t" xml:id="A2.F13.sf1">
          <tags>
            <tag><text fontsize="90%">(a)</text></tag>
            <tag role="autoref">(a)</tag>
            <tag role="refnum">14(a)</tag>
          </tags>
          <graphics candidates="figures/lvm_prompt.pdf" graphic="figures/lvm_prompt.pdf" options="width=433.62pt" xml:id="A2.F13.sf1.g1"/>
          <toccaption><tag close=" ">(a)</tag>Example of a KiVA trial input for the Large Vision Model.</toccaption>
          <caption><tag close=" "><text fontsize="90%">(a)</text></tag><text fontsize="90%">Example of a KiVA trial input for the Large Vision Model.</text></caption>
        </figure>
        <figure align="center" inlist="lof" labels="LABEL:fig:lvm_VE" placement="t" xml:id="A2.F13.sf2">
          <tags>
            <tag><text fontsize="90%">(b)</text></tag>
            <tag role="autoref">(b)</tag>
            <tag role="refnum">14(b)</tag>
          </tags>
          <graphics candidates="figures/LVM.pdf" graphic="figures/LVM.pdf" options="width=433.62pt" xml:id="A2.F13.sf2.g1"/>
          <toccaption><tag close=" ">(b)</tag>Visual Extrapolation Performance of the Large Vision Model across Transformation Domains.</toccaption>
          <caption><tag close=" "><text fontsize="90%">(b)</text></tag><text fontsize="90%">Visual Extrapolation Performance of the Large Vision Model across Transformation Domains.</text></caption>
<!--  %****␣Appendix.tex␣Line␣225␣**** -->        </figure>
        <toccaption class="ltx_centering"><tag close=" ">14</tag>Testing a Large Vision Model on KiVA.</toccaption>
        <caption class="ltx_centering"><tag close=": "><text fontsize="90%">Figure 14</text></tag><text fontsize="90%">Testing a Large Vision Model on KiVA.</text></caption>
      </figure>
      <para xml:id="A2.SS4.p2">
          <p>Future work may examine whether longer visual prompts with more training examples (in-context learning) or further instruction tuning improve the performance of the large vision model.</p>
      </para>
    </subsection>
  </appendix>
</document>
