{ "input": "https://arxiv.org/abs/2512.19526", "resolved_source_url": "https://arxiv.org/pdf/2512.19526.pdf", "analysis": { "benchmark_name": "QuantiPhy", "benchmark_aliases": [ "QUANTI PHY" ], "source_type": "paper", "table_hint": "Table 1", "likely_metrics": [ "Mean Relative Accuracy (MRA)" ], "search_terms": [ "quantitative physical reasoning", "VLM", "vision-language models", "kinematic inference", "object size", "velocity", "acceleration", "2D-Static", "2D-Dynamic", "3D-Static", "3D-Dynamic" ], "notes": "The paper introduces QuantiPhy, a benchmark for quantitatively evaluating physical reasoning abilities of Vision-Language Models. It focuses on estimating an object's size, velocity, and acceleration from videos. The benchmark categorizes tasks into 2D/3D movement and Static/Dynamic priors. It evaluates 21 state-of-the-art VLMs and uses Mean Relative Accuracy (MRA) as the primary metric. The paper mentions a 'leaderboard over 21 state-of-the-art models' and 'Table 1' which likely contains the results." }, "seed_work_openalex_id": "https://openalex.org/W7117138371", "seed_work_title": "QuantiPhy: A Quantitative Benchmark Evaluating Physical Reasoning Abilities of Vision-Language Models", "notes": [ "Fetched source URL content for extraction planning." ], "plan_steps": [ "Extract the seed benchmark leaderboard from the provided source.", "Find possible newer benchmark results from citing works.", "Screen citations for benchmark-relevant score tables.", "Merge, deduplicate, and store leaderboard rows with citations.", "Generate Gradio app files for Hugging Face deployment." ] }