Hrant's picture
Update leaderboard via Leaderboarder
e5082d1 verified
{
"input": "https://arxiv.org/abs/2512.19526",
"resolved_source_url": "https://arxiv.org/pdf/2512.19526.pdf",
"analysis": {
"benchmark_name": "QuantiPhy",
"benchmark_aliases": [
"QUANTI PHY"
],
"source_type": "paper",
"table_hint": "Table 1",
"likely_metrics": [
"Mean Relative Accuracy (MRA)"
],
"search_terms": [
"quantitative physical reasoning",
"VLM",
"vision-language models",
"kinematic inference",
"object size",
"velocity",
"acceleration",
"2D-Static",
"2D-Dynamic",
"3D-Static",
"3D-Dynamic"
],
"notes": "The paper introduces QuantiPhy, a benchmark for quantitatively evaluating physical reasoning abilities of Vision-Language Models. It focuses on estimating an object's size, velocity, and acceleration from videos. The benchmark categorizes tasks into 2D/3D movement and Static/Dynamic priors. It evaluates 21 state-of-the-art VLMs and uses Mean Relative Accuracy (MRA) as the primary metric. The paper mentions a 'leaderboard over 21 state-of-the-art models' and 'Table 1' which likely contains the results."
},
"seed_work_openalex_id": "https://openalex.org/W7117138371",
"seed_work_title": "QuantiPhy: A Quantitative Benchmark Evaluating Physical Reasoning Abilities of Vision-Language Models",
"notes": [
"Fetched source URL content for extraction planning."
],
"plan_steps": [
"Extract the seed benchmark leaderboard from the provided source.",
"Find possible newer benchmark results from citing works.",
"Screen citations for benchmark-relevant score tables.",
"Merge, deduplicate, and store leaderboard rows with citations.",
"Generate Gradio app files for Hugging Face deployment."
]
}