"""Generate dataset descriptions using an LLM with a single prompt.""" import json import re from datasets_server import DatasetsServerClient from huggingface_hub import InferenceClient DEFAULT_MODEL = "zai-org/GLM-4.6V:zai-org" class ViewerNotReadyError(Exception): """Raised when the Datasets Viewer hasn't processed a dataset yet.""" pass def gather_dataset_info(dataset: str, hf_token: str | None = None) -> dict: """Gather all dataset information upfront from Datasets Viewer API. Raises: ViewerNotReadyError: If the dataset preview is not available yet. """ client = DatasetsServerClient(token=hf_token) info = {"dataset": dataset} # Get validity and splits try: validity = client.is_valid(dataset) info["validity"] = { "viewer": validity.viewer, "preview": validity.preview, "search": validity.search, "filter": validity.filter, "statistics": validity.statistics, } # Check if preview is ready - we need it to get sample rows if not validity.preview: raise ViewerNotReadyError( f"Dataset viewer not ready for '{dataset}'. " "The dataset may be new or still processing." ) except ViewerNotReadyError: raise # Re-raise our custom exception except Exception as e: info["validity_error"] = str(e) return info # Can't continue without validity # Get splits try: splits = client.list_splits(dataset) info["splits"] = [{"config": s.config, "split": s.split} for s in splits] size = client.get_size(dataset) info["size"] = size.size.get("dataset", {}) if size.size else {} except Exception as e: info["splits_error"] = str(e) # Get features and sample rows if splits: first_split = splits[0] try: preview = client.preview(dataset, first_split.config, first_split.split) info["features"] = preview.features[:10] # Limit features except Exception as e: info["features_error"] = str(e) try: samples = client.sample_rows( dataset, first_split.config, first_split.split, n_samples=15, seed=42, max_requests=10, ) # Truncate long values, tracking truncation rows = [] truncation_occurred = False for row_data in samples.rows: row = row_data.get("row", {}) processed = {} for k, v in row.items(): v_str = str(v) if len(v_str) > 1200: processed[k] = ( v_str[:1200] + f"... [truncated, original {len(v_str)} chars]" ) truncation_occurred = True else: processed[k] = v rows.append(processed) info["sample_rows"] = rows info["samples_truncated"] = truncation_occurred info["num_rows_total"] = samples.num_rows_total except Exception as e: info["samples_error"] = str(e) # Get statistics if available if info.get("validity", {}).get("statistics"): try: first_split = splits[0] stats = client.get_statistics( dataset, first_split.config, first_split.split ) info["statistics"] = stats.statistics # Pass raw stats to model except Exception as e: info["statistics_error"] = str(e) else: info["statistics"] = "Not available for this dataset" return info def build_prompt(dataset_info: dict) -> str: """Build the prompt with all gathered information.""" dataset_id = dataset_info["dataset"] # Format the info nicely info_text = json.dumps(dataset_info, indent=2, default=str) return f"""Write a description for the HuggingFace dataset '{dataset_id}'. Below is information from the Datasets Viewer API: - Dataset metadata (splits, size, features) - A random sample of rows (not the full dataset) - Column statistics (if available) DATASETS VIEWER INFO: {info_text} Requirements: - 2-4 sentences, concise but complete, suitable for a dataset card - Start with "This dataset..." - Include: what the data contains, size, and structure - For text data, mention the language(s) if evident from samples - Mention the likely domain and ML task if reasonably confident - Note any notable patterns in statistics (e.g., class imbalance) - Use hedging ("appears suitable for", "likely") for inferred purposes Important: - Only state facts verifiable from the provided data - Do not guess at licensing, collection methods, or details not shown - The dataset ID may hint at the source or purpose Respond with ONLY the description in tags.""" def generate_description( dataset_id: str, hf_token: str, model: str = DEFAULT_MODEL, ) -> str: """Generate a description for a dataset using LLM. Args: dataset_id: HuggingFace dataset ID (e.g., 'username/dataset') hf_token: HuggingFace token for API access model: Model to use for generation Returns: Generated description string """ # Gather dataset information dataset_info = gather_dataset_info(dataset_id, hf_token) # Build prompt prompt = build_prompt(dataset_info) # Call LLM using InferenceClient client = InferenceClient(token=hf_token) response = client.chat_completion( model=model, messages=[{"role": "user", "content": prompt}], max_tokens=2000, ) final_description = response.choices[0].message.content # Extract description from tags if present if final_description: match = re.search( r"\s*(.*?)\s*", final_description, re.DOTALL ) if match: final_description = match.group(1).strip() return final_description or ""