Spaces:

Hrant
/

leaderboarder-quantiphy

Sleeping

App Files Files Community

leaderboarder-quantiphy / plan.json

Hrant

Update leaderboard via Leaderboarder

e5082d1 verified 3 months ago

raw

history blame contribute delete

1.77 kB

	{
	"input": "https://arxiv.org/abs/2512.19526",
	"resolved_source_url": "https://arxiv.org/pdf/2512.19526.pdf",
	"analysis": {
	"benchmark_name": "QuantiPhy",
	"benchmark_aliases": [
	"QUANTI PHY"
	],
	"source_type": "paper",
	"table_hint": "Table 1",
	"likely_metrics": [
	"Mean Relative Accuracy (MRA)"
	],
	"search_terms": [
	"quantitative physical reasoning",
	"VLM",
	"vision-language models",
	"kinematic inference",
	"object size",
	"velocity",
	"acceleration",
	"2D-Static",
	"2D-Dynamic",
	"3D-Static",
	"3D-Dynamic"
	],
	"notes": "The paper introduces QuantiPhy, a benchmark for quantitatively evaluating physical reasoning abilities of Vision-Language Models. It focuses on estimating an object's size, velocity, and acceleration from videos. The benchmark categorizes tasks into 2D/3D movement and Static/Dynamic priors. It evaluates 21 state-of-the-art VLMs and uses Mean Relative Accuracy (MRA) as the primary metric. The paper mentions a 'leaderboard over 21 state-of-the-art models' and 'Table 1' which likely contains the results."
	},
	"seed_work_openalex_id": "https://openalex.org/W7117138371",
	"seed_work_title": "QuantiPhy: A Quantitative Benchmark Evaluating Physical Reasoning Abilities of Vision-Language Models",
	"notes": [
	"Fetched source URL content for extraction planning."
	],
	"plan_steps": [
	"Extract the seed benchmark leaderboard from the provided source.",
	"Find possible newer benchmark results from citing works.",
	"Screen citations for benchmark-relevant score tables.",
	"Merge, deduplicate, and store leaderboard rows with citations.",
	"Generate Gradio app files for Hugging Face deployment."
	]
	}