Spaces:
Sleeping
Sleeping
| { | |
| "input": "https://arxiv.org/abs/2512.19526", | |
| "resolved_source_url": "https://arxiv.org/pdf/2512.19526.pdf", | |
| "analysis": { | |
| "benchmark_name": "QuantiPhy", | |
| "benchmark_aliases": [ | |
| "QUANTI PHY" | |
| ], | |
| "source_type": "paper", | |
| "table_hint": "Table 1", | |
| "likely_metrics": [ | |
| "Mean Relative Accuracy (MRA)" | |
| ], | |
| "search_terms": [ | |
| "quantitative physical reasoning", | |
| "VLM", | |
| "vision-language models", | |
| "kinematic inference", | |
| "object size", | |
| "velocity", | |
| "acceleration", | |
| "2D-Static", | |
| "2D-Dynamic", | |
| "3D-Static", | |
| "3D-Dynamic" | |
| ], | |
| "notes": "The paper introduces QuantiPhy, a benchmark for quantitatively evaluating physical reasoning abilities of Vision-Language Models. It focuses on estimating an object's size, velocity, and acceleration from videos. The benchmark categorizes tasks into 2D/3D movement and Static/Dynamic priors. It evaluates 21 state-of-the-art VLMs and uses Mean Relative Accuracy (MRA) as the primary metric. The paper mentions a 'leaderboard over 21 state-of-the-art models' and 'Table 1' which likely contains the results." | |
| }, | |
| "seed_work_openalex_id": "https://openalex.org/W7117138371", | |
| "seed_work_title": "QuantiPhy: A Quantitative Benchmark Evaluating Physical Reasoning Abilities of Vision-Language Models", | |
| "notes": [ | |
| "Fetched source URL content for extraction planning." | |
| ], | |
| "plan_steps": [ | |
| "Extract the seed benchmark leaderboard from the provided source.", | |
| "Find possible newer benchmark results from citing works.", | |
| "Screen citations for benchmark-relevant score tables.", | |
| "Merge, deduplicate, and store leaderboard rows with citations.", | |
| "Generate Gradio app files for Hugging Face deployment." | |
| ] | |
| } |