"""Generate dataset descriptions using an LLM with a single prompt."""

import json
import re

from datasets_server import DatasetsServerClient
from huggingface_hub import InferenceClient

DEFAULT_MODEL = "zai-org/GLM-4.6V:zai-org"


class ViewerNotReadyError(Exception):
    """Raised when the Datasets Viewer hasn't processed a dataset yet."""


def gather_dataset_info(dataset: str, hf_token: str | None = None) -> dict:
    """Gather all dataset information upfront from Datasets Viewer API.

    Raises:
        ViewerNotReadyError: If the dataset preview is not available yet.
    """
    client = DatasetsServerClient(token=hf_token)

    info = {"dataset": dataset}

    # Get validity flags and confirm the preview is ready
    try:
        validity = client.is_valid(dataset)
        info["validity"] = {
            "viewer": validity.viewer,
            "preview": validity.preview,
            "search": validity.search,
            "filter": validity.filter,
            "statistics": validity.statistics,
        }

        # Check if preview is ready - we need it to get sample rows
        if not validity.preview:
            raise ViewerNotReadyError(
                f"Dataset viewer not ready for '{dataset}'. "
                "The dataset may be new or still processing."
            )
    except ViewerNotReadyError:
        raise  # Re-raise our custom exception
    except Exception as e:
        info["validity_error"] = str(e)
        return info  # Can't continue without validity

    # Get splits and size; initialize splits so the checks further down don't
    # raise NameError if list_splits fails
    splits = []
    try:
        splits = client.list_splits(dataset)
        info["splits"] = [{"config": s.config, "split": s.split} for s in splits]

        size = client.get_size(dataset)
        info["size"] = size.size.get("dataset", {}) if size.size else {}
    except Exception as e:
        info["splits_error"] = str(e)

    # Get features and sample rows
    if splits:
        first_split = splits[0]
        try:
            preview = client.preview(dataset, first_split.config, first_split.split)
            info["features"] = preview.features[:10]  # Limit features
        except Exception as e:
            info["features_error"] = str(e)

        try:
            samples = client.sample_rows(
                dataset,
                first_split.config,
                first_split.split,
                n_samples=15,
                seed=42,
                max_requests=10,
            )
            # Truncate long values, tracking truncation
            rows = []
            truncation_occurred = False
            for row_data in samples.rows:
                row = row_data.get("row", {})
                processed = {}
                for k, v in row.items():
                    v_str = str(v)
                    if len(v_str) > 1200:
                        processed[k] = (
                            v_str[:1200]
                            + f"... [truncated, original {len(v_str)} chars]"
                        )
                        truncation_occurred = True
                    else:
                        processed[k] = v
                rows.append(processed)
            info["sample_rows"] = rows
            info["samples_truncated"] = truncation_occurred
            info["num_rows_total"] = samples.num_rows_total
        except Exception as e:
            info["samples_error"] = str(e)

    # Get statistics if available (skip when there are no splits to query)
    if splits and info.get("validity", {}).get("statistics"):
        try:
            first_split = splits[0]
            stats = client.get_statistics(
                dataset, first_split.config, first_split.split
            )
            info["statistics"] = stats.statistics  # Pass raw stats to model
        except Exception as e:
            info["statistics_error"] = str(e)
    else:
        info["statistics"] = "Not available for this dataset"

    return info

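# Shape of the dict gather_dataset_info returns (an illustrative sketch, not a
# full spec; when a step fails, a "<step>_error" key replaces that step's data):
#
# {
#     "dataset": "username/dataset",
#     "validity": {"viewer": ..., "preview": ..., "search": ...,
#                  "filter": ..., "statistics": ...},
#     "splits": [{"config": "default", "split": "train"}],
#     "size": {...},              # dataset-level size info
#     "features": [...],          # at most the first 10 feature definitions
#     "sample_rows": [...],       # up to 15 rows, long values truncated
#     "samples_truncated": False,
#     "num_rows_total": ...,
#     "statistics": ...,          # raw stats, or "Not available for this dataset"
# }
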

def build_prompt(dataset_info: dict) -> str:
    """Build the prompt with all gathered information."""
    dataset_id = dataset_info["dataset"]

    # Serialize the gathered info as indented JSON so the model sees its structure
    info_text = json.dumps(dataset_info, indent=2, default=str)

    return f"""Write a description for the HuggingFace dataset '{dataset_id}'.

Below is information from the Datasets Viewer API:
- Dataset metadata (splits, size, features)
- A random sample of rows (not the full dataset)
- Column statistics (if available)

DATASETS VIEWER INFO:
{info_text}

Requirements:
- 2-4 sentences, concise but complete, suitable for a dataset card
- Start with "This dataset..."
- Include: what the data contains, size, and structure
- For text data, mention the language(s) if evident from samples
- Mention the likely domain and ML task if reasonably confident
- Note any notable patterns in statistics (e.g., class imbalance)
- Use hedging ("appears suitable for", "likely") for inferred purposes

Important:
- Only state facts verifiable from the provided data
- Do not guess at licensing, collection methods, or details not shown
- The dataset ID may hint at the source or purpose

Respond with ONLY the description in <description> tags."""


def generate_description(
    dataset_id: str,
    hf_token: str,
    model: str = DEFAULT_MODEL,
) -> str:
    """Generate a description for a dataset using LLM.

    Args:
        dataset_id: HuggingFace dataset ID (e.g., 'username/dataset')
        hf_token: HuggingFace token for API access
        model: Model to use for generation

    Returns:
        Generated description string
    """
    # Gather dataset information
    dataset_info = gather_dataset_info(dataset_id, hf_token)

    # Build prompt
    prompt = build_prompt(dataset_info)

    # Call LLM using InferenceClient
    client = InferenceClient(token=hf_token)

    response = client.chat_completion(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=2000,
    )

    final_description = response.choices[0].message.content

    # Extract description from tags if present
    if final_description:
        match = re.search(
            r"<description>\s*(.*?)\s*</description>", final_description, re.DOTALL
        )
        if match:
            final_description = match.group(1).strip()

    return final_description or ""
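

if __name__ == "__main__":
    # Minimal usage sketch. Assumptions: HF_TOKEN is set in the environment,
    # "username/dataset" is a placeholder ID, and the dataset's viewer preview
    # is ready (otherwise ViewerNotReadyError is raised).
    import os

    token = os.environ["HF_TOKEN"]
    try:
        print(generate_description("username/dataset", hf_token=token))
    except ViewerNotReadyError as err:
        print(f"Viewer not ready yet: {err}")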