"""Generate dataset descriptions using an LLM with a single prompt."""

import json
import re

from datasets_server import DatasetsServerClient
from huggingface_hub import InferenceClient

DEFAULT_MODEL = "zai-org/GLM-4.6V:zai-org"


class ViewerNotReadyError(Exception):
    """Raised when the Datasets Viewer hasn't processed a dataset yet."""


def gather_dataset_info(dataset: str, hf_token: str | None = None) -> dict:
    """Gather all dataset information upfront from Datasets Viewer API.

    Raises:
        ViewerNotReadyError: If the dataset preview is not available yet.
    """
    client = DatasetsServerClient(token=hf_token)

    info = {"dataset": dataset}

    # Get validity flags and confirm the preview is ready
    try:
        validity = client.is_valid(dataset)
        info["validity"] = {
            "viewer": validity.viewer,
            "preview": validity.preview,
            "search": validity.search,
            "filter": validity.filter,
            "statistics": validity.statistics,
        }

        # Check if preview is ready - we need it to get sample rows
        if not validity.preview:
            raise ViewerNotReadyError(
                f"Dataset viewer not ready for '{dataset}'. "
                "The dataset may be new or still processing."
            )
    except ViewerNotReadyError:
        raise  # Re-raise our custom exception
    except Exception as e:
        info["validity_error"] = str(e)
        return info  # Can't continue without validity

    # Get splits and size; initialize splits so the checks further down don't
    # raise NameError if list_splits fails
    splits = []
    try:
        splits = client.list_splits(dataset)
        info["splits"] = [{"config": s.config, "split": s.split} for s in splits]

        size = client.get_size(dataset)
        info["size"] = size.size.get("dataset", {}) if size.size else {}
    except Exception as e:
        info["splits_error"] = str(e)

    # Get features and sample rows
    if splits:
        first_split = splits[0]
        try:
            preview = client.preview(dataset, first_split.config, first_split.split)
            info["features"] = preview.features[:10]  # Limit features
        except Exception as e:
            info["features_error"] = str(e)

        try:
            samples = client.sample_rows(
                dataset,
                first_split.config,
                first_split.split,
                n_samples=15,
                seed=42,
                max_requests=10,
            )
            # Truncate long values, tracking truncation
            rows = []
            truncation_occurred = False
            for row_data in samples.rows:
                row = row_data.get("row", {})
                processed = {}
                for k, v in row.items():
                    v_str = str(v)
                    if len(v_str) > 1200:
                        processed[k] = (
                            v_str[:1200]
                            + f"... [truncated, original {len(v_str)} chars]"
                        )
                        truncation_occurred = True
                    else:
                        processed[k] = v
                rows.append(processed)
            info["sample_rows"] = rows
            info["samples_truncated"] = truncation_occurred
            info["num_rows_total"] = samples.num_rows_total
        except Exception as e:
            info["samples_error"] = str(e)

    # Get statistics if available (skip when there are no splits to query)
    if splits and info.get("validity", {}).get("statistics"):
        try:
            first_split = splits[0]
            stats = client.get_statistics(
                dataset, first_split.config, first_split.split
            )
            info["statistics"] = stats.statistics  # Pass raw stats to model
        except Exception as e:
            info["statistics_error"] = str(e)
    else:
        info["statistics"] = "Not available for this dataset"

    return info

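# Shape of the dict gather_dataset_info returns (an illustrative sketch, not a
# full spec; when a step fails, a "<step>_error" key replaces that step's data):
#
# {
#     "dataset": "username/dataset",
#     "validity": {"viewer": ..., "preview": ..., "search": ...,
#                  "filter": ..., "statistics": ...},
#     "splits": [{"config": "default", "split": "train"}],
#     "size": {...},              # dataset-level size info
#     "features": [...],          # at most the first 10 feature definitions
#     "sample_rows": [...],       # up to 15 rows, long values truncated
#     "samples_truncated": False,
#     "num_rows_total": ...,
#     "statistics": ...,          # raw stats, or "Not available for this dataset"
# }
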

def build_prompt(dataset_info: dict) -> str:
    """Build the prompt with all gathered information."""
    dataset_id = dataset_info["dataset"]

    # Serialize the gathered info as indented JSON so the model sees its structure
    info_text = json.dumps(dataset_info, indent=2, default=str)

    return f"""Write a description for the HuggingFace dataset '{dataset_id}'.

Below is information from the Datasets Viewer API:
- Dataset metadata (splits, size, features)
- A random sample of rows (not the full dataset)
- Column statistics (if available)

DATASETS VIEWER INFO:
{info_text}

Requirements:
- 2-4 sentences, concise but complete, suitable for a dataset card
- Start with "This dataset..."
- Include: what the data contains, size, and structure
- For text data, mention the language(s) if evident from samples
- Mention the likely domain and ML task if reasonably confident
- Note any notable patterns in statistics (e.g., class imbalance)
- Use hedging ("appears suitable for", "likely") for inferred purposes

Important:
- Only state facts verifiable from the provided data
- Do not guess at licensing, collection methods, or details not shown
- The dataset ID may hint at the source or purpose

Respond with ONLY the description in <description> tags."""


def generate_description(
    dataset_id: str,
    hf_token: str,
    model: str = DEFAULT_MODEL,
) -> str:
    """Generate a description for a dataset using LLM.

    Args:
        dataset_id: HuggingFace dataset ID (e.g., 'username/dataset')
        hf_token: HuggingFace token for API access
        model: Model to use for generation

    Returns:
        Generated description string
    """
    # Gather dataset information
    dataset_info = gather_dataset_info(dataset_id, hf_token)

    # Build prompt
    prompt = build_prompt(dataset_info)

    # Call LLM using InferenceClient
    client = InferenceClient(token=hf_token)

    response = client.chat_completion(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=2000,
    )

    final_description = response.choices[0].message.content

    # Extract description from tags if present
    if final_description:
        match = re.search(
            r"<description>\s*(.*?)\s*</description>", final_description, re.DOTALL
        )
        if match:
            final_description = match.group(1).strip()

    return final_description or ""
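

if __name__ == "__main__":
    # Minimal usage sketch. Assumptions: HF_TOKEN is set in the environment,
    # "username/dataset" is a placeholder ID, and the dataset's viewer preview
    # is ready (otherwise ViewerNotReadyError is raised).
    import os

    token = os.environ["HF_TOKEN"]
    try:
        print(generate_description("username/dataset", hf_token=token))
    except ViewerNotReadyError as err:
        print(f"Viewer not ready yet: {err}")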