{
  "input": "https://arxiv.org/abs/2512.19526",
  "resolved_source_url": "https://arxiv.org/pdf/2512.19526.pdf",
  "analysis": {
    "benchmark_name": "QuantiPhy",
    "benchmark_aliases": [
      "QUANTI PHY"
    ],
    "source_type": "paper",
    "table_hint": "Table 1",
    "likely_metrics": [
      "Mean Relative Accuracy (MRA)"
    ],
    "search_terms": [
      "quantitative physical reasoning",
      "VLM",
      "vision-language models",
      "kinematic inference",
      "object size",
      "velocity",
      "acceleration",
      "2D-Static",
      "2D-Dynamic",
      "3D-Static",
      "3D-Dynamic"
    ],
    "notes": "The paper introduces QuantiPhy, a benchmark for quantitatively evaluating physical reasoning abilities of Vision-Language Models. It focuses on estimating an object's size, velocity, and acceleration from videos. The benchmark categorizes tasks into 2D/3D movement and Static/Dynamic priors. It evaluates 21 state-of-the-art VLMs and uses Mean Relative Accuracy (MRA) as the primary metric. The paper mentions a 'leaderboard over 21 state-of-the-art models' and 'Table 1' which likely contains the results."
  },
  "seed_work_openalex_id": "https://openalex.org/W7117138371",
  "seed_work_title": "QuantiPhy: A Quantitative Benchmark Evaluating Physical Reasoning Abilities of Vision-Language Models",
  "notes": [
    "Fetched source URL content for extraction planning."
  ],
  "plan_steps": [
    "Extract the seed benchmark leaderboard from the provided source.",
    "Find possible newer benchmark results from citing works.",
    "Screen citations for benchmark-relevant score tables.",
    "Merge, deduplicate, and store leaderboard rows with citations.",
    "Generate Gradio app files for Hugging Face deployment."
  ]
}