"""Generate dataset descriptions using an LLM with a single prompt."""
import json
import re
from datasets_server import DatasetsServerClient
from huggingface_hub import InferenceClient
DEFAULT_MODEL = "zai-org/GLM-4.6V:zai-org"
class ViewerNotReadyError(Exception):
"""Raised when the Datasets Viewer hasn't processed a dataset yet."""
pass
def gather_dataset_info(dataset: str, hf_token: str | None = None) -> dict:
"""Gather all dataset information upfront from Datasets Viewer API.
Raises:
ViewerNotReadyError: If the dataset preview is not available yet.
"""
client = DatasetsServerClient(token=hf_token)
info = {"dataset": dataset}
# Get validity and splits
try:
validity = client.is_valid(dataset)
info["validity"] = {
"viewer": validity.viewer,
"preview": validity.preview,
"search": validity.search,
"filter": validity.filter,
"statistics": validity.statistics,
}
# Check if preview is ready - we need it to get sample rows
if not validity.preview:
raise ViewerNotReadyError(
f"Dataset viewer not ready for '{dataset}'. "
"The dataset may be new or still processing."
)
except ViewerNotReadyError:
raise # Re-raise our custom exception
except Exception as e:
info["validity_error"] = str(e)
return info # Can't continue without validity
# Get splits
    splits = []  # Initialize so the `if splits:` check below is safe even if this lookup fails
    try:
        splits = client.list_splits(dataset)
info["splits"] = [{"config": s.config, "split": s.split} for s in splits]
size = client.get_size(dataset)
info["size"] = size.size.get("dataset", {}) if size.size else {}
except Exception as e:
info["splits_error"] = str(e)
# Get features and sample rows
if splits:
first_split = splits[0]
try:
preview = client.preview(dataset, first_split.config, first_split.split)
info["features"] = preview.features[:10] # Limit features
except Exception as e:
info["features_error"] = str(e)
try:
samples = client.sample_rows(
dataset,
first_split.config,
first_split.split,
n_samples=15,
seed=42,
max_requests=10,
)
# Truncate long values, tracking truncation
rows = []
truncation_occurred = False
for row_data in samples.rows:
row = row_data.get("row", {})
processed = {}
for k, v in row.items():
v_str = str(v)
if len(v_str) > 1200:
processed[k] = (
v_str[:1200]
+ f"... [truncated, original {len(v_str)} chars]"
)
truncation_occurred = True
else:
processed[k] = v
rows.append(processed)
info["sample_rows"] = rows
info["samples_truncated"] = truncation_occurred
info["num_rows_total"] = samples.num_rows_total
except Exception as e:
info["samples_error"] = str(e)
# Get statistics if available
if info.get("validity", {}).get("statistics"):
try:
stats = client.get_statistics(
dataset, first_split.config, first_split.split
)
info["statistics"] = stats.statistics # Pass raw stats to model
except Exception as e:
info["statistics_error"] = str(e)
else:
info["statistics"] = "Not available for this dataset"
return info
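
# Rough shape of the dict gather_dataset_info returns on the happy path.
# The values below are illustrative placeholders, not output from a real dataset:
#
#     {
#         "dataset": "username/dataset",
#         "validity": {"viewer": True, "preview": True, "search": True,
#                      "filter": True, "statistics": True},
#         "splits": [{"config": "default", "split": "train"}],
#         "size": {...},
#         "features": [...],
#         "sample_rows": [{...}, ...],
#         "samples_truncated": False,
#         "num_rows_total": 10000,
#         "statistics": [...],
#     }
#
# Any step that fails records its error under a matching "*_error" key instead.
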
def build_prompt(dataset_info: dict) -> str:
"""Build the prompt with all gathered information."""
dataset_id = dataset_info["dataset"]
# Format the info nicely
info_text = json.dumps(dataset_info, indent=2, default=str)
return f"""Write a description for the HuggingFace dataset '{dataset_id}'.
Below is information from the Datasets Viewer API:
- Dataset metadata (splits, size, features)
- A random sample of rows (not the full dataset)
- Column statistics (if available)
DATASETS VIEWER INFO:
{info_text}
Requirements:
- 2-4 sentences, concise but complete, suitable for a dataset card
- Start with "This dataset..."
- Include: what the data contains, size, and structure
- For text data, mention the language(s) if evident from samples
- Mention the likely domain and ML task if reasonably confident
- Note any notable patterns in statistics (e.g., class imbalance)
- Use hedging ("appears suitable for", "likely") for inferred purposes
Important:
- Only state facts verifiable from the provided data
- Do not guess at licensing, collection methods, or details not shown
- The dataset ID may hint at the source or purpose
Respond with ONLY the description in <description> tags."""
def generate_description(
dataset_id: str,
hf_token: str,
model: str = DEFAULT_MODEL,
) -> str:
"""Generate a description for a dataset using LLM.
Args:
dataset_id: HuggingFace dataset ID (e.g., 'username/dataset')
hf_token: HuggingFace token for API access
model: Model to use for generation
Returns:
Generated description string
"""
# Gather dataset information
dataset_info = gather_dataset_info(dataset_id, hf_token)
# Build prompt
prompt = build_prompt(dataset_info)
# Call LLM using InferenceClient
client = InferenceClient(token=hf_token)
response = client.chat_completion(
model=model,
messages=[{"role": "user", "content": prompt}],
max_tokens=2000,
)
final_description = response.choices[0].message.content
# Extract description from tags if present
if final_description:
match = re.search(
r"<description>\s*(.*?)\s*</description>", final_description, re.DOTALL
)
if match:
final_description = match.group(1).strip()
return final_description or ""
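

# Minimal CLI sketch, included for illustration (it assumes an HF token in the
# HF_TOKEN environment variable; "username/dataset" is a placeholder dataset ID):
if __name__ == "__main__":
    import os
    import sys

    try:
        print(generate_description("username/dataset", os.environ["HF_TOKEN"]))
    except ViewerNotReadyError as err:
        # The viewer hasn't processed the dataset yet; retry later instead of crashing.
        sys.exit(str(err))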