Fix: BitsAndBytes compatibility and error handling

CRITICAL FIXES:
- Fixed BitsAndBytes kernel registration error handling
- Distinguish between bitsandbytes errors and gated repository errors
- Automatic fallback to loading without quantization if bitsandbytes fails
- Changed fallback model to microsoft/Phi-3-mini-4k-instruct (verified non-gated)

Changes:
- src/local_model_loader.py:
  - Better error detection for bitsandbytes vs gated repo errors
  - Automatic fallback to no quantization if bitsandbytes fails
  - Improved error messages to distinguish error types
- src/llm_router.py:
  - Added catch for bitsandbytes errors at router level
  - Automatic retry without quantization on bitsandbytes failures
- src/models_config.py:
  - Changed fallback from mistralai/Mistral-7B-Instruct-v0.2 to microsoft/Phi-3-mini-4k-instruct
  - Phi-3-mini is verified non-gated and smaller (3.8B vs 7B)

Fixes:
- RuntimeError: int8_mm_dequant kernel registration conflict
- ModuleNotFoundError: validate_bnb_backend_availability
- False positive 'gated repository' errors from bitsandbytes failures

Now properly handles (error classification sketched below):
- BitsAndBytes compatibility issues → fall back to loading without quantization
- Actual gated repository errors → use the fallback model
- Both error types → clear, distinct error messages

Files changed:
- src/llm_router.py          +24 -0
- src/local_model_loader.py  +103 -44
- src/models_config.py       +3 -3
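
For reference, both modules decide which path to take by inspecting the exception text. A minimal sketch of that classification (the helper name and the marker tuple are illustrative, not part of the commit):

    # Illustrative helper mirroring the checks in llm_router.py and local_model_loader.py.
    BNB_ERROR_MARKERS = ("bitsandbytes", "int8_mm_dequant", "validate_bnb_backend")

    def is_bitsandbytes_error(exc: Exception) -> bool:
        """True if the exception points at a bitsandbytes failure rather than a gated-repo/authorization problem."""
        error_str = str(exc).lower()
        return any(marker in error_str for marker in BNB_ERROR_MARKERS)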

src/llm_router.py

@@ -169,6 +169,30 @@ class LLMRouter:
                    raise
            else:
                raise RuntimeError(f"Model {model_id} is a gated repository and no fallback available") from e
+        except (RuntimeError, ModuleNotFoundError, ImportError) as e:
+            # Check if this is a bitsandbytes error (not a gated repo error)
+            error_str = str(e).lower()
+            if "bitsandbytes" in error_str or "int8_mm_dequant" in error_str or "validate_bnb_backend" in error_str:
+                logger.warning(f"⚠ BitsAndBytes compatibility issue detected: {e}")
+                logger.warning(f"⚠ Model {model_id} will be loaded without quantization")
+                # Retry without quantization
+                try:
+                    # Disable quantization for this attempt
+                    fallback_config = model_config.copy()
+                    fallback_config["use_4bit_quantization"] = False
+                    fallback_config["use_8bit_quantization"] = False
+                    return await self._call_local_model(
+                        fallback_config,
+                        prompt,
+                        task_type,
+                        **kwargs
+                    )
+                except Exception as retry_error:
+                    logger.error(f"Failed to load model even without quantization: {retry_error}")
+                    raise RuntimeError(f"Model loading failed: {retry_error}") from retry_error
+            else:
+                # Not a bitsandbytes error, re-raise
+                raise

        # Format as chat messages if needed
        messages = [{"role": "user", "content": prompt}]

src/local_model_loader.py

@@ -110,6 +110,7 @@ class LocalModelLoader:
        logger.info(f"Stripping API suffix from {model_id}, using base model: {base_model_id}")

        # Load tokenizer with cache directory
+        # This will fail with actual GatedRepoError if model is gated
        try:
            tokenizer = AutoTokenizer.from_pretrained(
                base_model_id,
@@ -117,15 +118,27 @@ class LocalModelLoader:
                token=self.hf_token if self.hf_token else None,
                trust_remote_code=True
            )
+        except Exception as e:
+            # Check if this is actually a gated repo error
+            error_str = str(e).lower()
+            if "gated" in error_str or "authorized" in error_str or "access" in error_str:
+                # This might be a gated repo error
+                try:
+                    from huggingface_hub.exceptions import GatedRepoError as RealGatedRepoError
+                    if isinstance(e, RealGatedRepoError):
+                        logger.error(f"❌ Gated Repository Error: Cannot access gated repo {base_model_id}")
+                        logger.error(f"   Access to model {base_model_id} is restricted and you are not in the authorized list.")
+                        logger.error(f"   Visit https://huggingface.co/{base_model_id} to request access.")
+                        logger.error(f"   Error details: {e}")
+                        raise RealGatedRepoError(
+                            f"Cannot access gated repository {base_model_id}. "
+                            f"Visit https://huggingface.co/{base_model_id} to request access."
+                        ) from e
+                except ImportError:
+                    pass
+
+            # If it's not a gated repo error, re-raise as-is
+            raise

        # Determine quantization config
        if load_in_4bit and self.device == "cuda":
@@ -151,46 +164,92 @@ class LocalModelLoader:
            quantization_config = None

        # Load model with GPU optimization and cache directory
+        # Try with quantization first, fallback to no quantization if bitsandbytes fails
+        load_kwargs = {
+            "cache_dir": self.cache_dir,
+            "token": self.hf_token if self.hf_token else None,
+            "trust_remote_code": True
+        }
+
+        if self.device == "cuda":
+            load_kwargs.update({
+                "device_map": "auto",  # Automatically uses GPU
+                "torch_dtype": torch.float16,  # Use FP16 for memory efficiency
+            })
+
+        # Try loading with quantization first
+        model = None
+        quantization_failed = False
+
+        if quantization_config and self.device == "cuda":
+            try:
+                if isinstance(quantization_config, dict):
+                    load_kwargs.update(quantization_config)
+                else:
+                    load_kwargs["quantization_config"] = quantization_config

                model = AutoModelForCausalLM.from_pretrained(
                    base_model_id,
                    **load_kwargs
                )
+                logger.info("✓ Model loaded with quantization")
+            except (RuntimeError, ModuleNotFoundError, ImportError) as e:
+                error_str = str(e).lower()
+                # Check if error is related to bitsandbytes
+                if "bitsandbytes" in error_str or "int8_mm_dequant" in error_str or "validate_bnb_backend" in error_str:
+                    logger.warning(f"⚠ BitsAndBytes error detected: {e}")
+                    logger.warning("⚠ Falling back to loading without quantization")
+                    quantization_failed = True
+                    # Remove quantization config and retry
+                    load_kwargs.pop("quantization_config", None)
+                    load_kwargs.pop("load_in_8bit", None)
+                    load_kwargs.pop("load_in_4bit", None)
+                else:
+                    # Re-raise if it's not a bitsandbytes error
+                    raise
+
+        # If quantization failed or not using quantization, load without it
+        if model is None:
+            try:
+                if self.device == "cuda":
+                    model = AutoModelForCausalLM.from_pretrained(
+                        base_model_id,
+                        **load_kwargs
+                    )
+                else:
+                    load_kwargs.update({
+                        "torch_dtype": torch.float32,
+                    })
+                    model = AutoModelForCausalLM.from_pretrained(
+                        base_model_id,
+                        **load_kwargs
+                    )
+                    model = model.to(self.device)
+            except Exception as e:
+                # Check if this is a gated repo error (not bitsandbytes)
+                error_str = str(e).lower()
+                if "bitsandbytes" in error_str or "int8_mm_dequant" in error_str:
+                    # BitsAndBytes error - should have been caught earlier
+                    logger.error(f"❌ Unexpected BitsAndBytes error: {e}")
+                    raise RuntimeError(f"BitsAndBytes compatibility issue: {e}") from e
+
+                # Check for actual gated repo error
+                try:
+                    from huggingface_hub.exceptions import GatedRepoError as RealGatedRepoError
+                    if isinstance(e, RealGatedRepoError) or "gated" in error_str or "authorized" in error_str:
+                        logger.error(f"❌ Gated Repository Error: Cannot access gated repo {base_model_id}")
+                        logger.error(f"   Access to model {base_model_id} is restricted and you are not in the authorized list.")
+                        logger.error(f"   Visit https://huggingface.co/{base_model_id} to request access.")
+                        logger.error(f"   Error details: {e}")
+                        raise RealGatedRepoError(
+                            f"Cannot access gated repository {base_model_id}. "
+                            f"Visit https://huggingface.co/{base_model_id} to request access."
+                        ) from e
+                except ImportError:
+                    pass
+
+                # Re-raise other errors as-is
+                raise

        # Ensure padding token is set
        if tokenizer.pad_token is None:
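
The quantization_config consumed in the hunk above is built earlier in the loader and is not part of this diff. A hedged sketch of how it could be derived from the use_4bit_quantization / use_8bit_quantization flags using transformers' BitsAndBytesConfig (function name and exact settings are assumptions, not the repo's code):

    # Sketch only: the real loader may use different BitsAndBytesConfig settings.
    import torch
    from transformers import BitsAndBytesConfig

    def build_quantization_config(load_in_4bit: bool, load_in_8bit: bool, device: str):
        if device != "cuda":
            return None  # bitsandbytes quantization requires a CUDA device
        if load_in_4bit:
            return BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16,
            )
        if load_in_8bit:
            return BitsAndBytesConfig(load_in_8bit=True)
        return None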

src/models_config.py

@@ -9,7 +9,7 @@ LLM_CONFIG = {
            "task": "general_reasoning",
            "max_tokens": 8000,  # Reduced from 10000
            "temperature": 0.7,
-            "fallback": "mistralai/Mistral-7B-Instruct-v0.2",
+            "fallback": "microsoft/Phi-3-mini-4k-instruct",  # Non-gated fallback model (3.8B, verified non-gated)
            "is_chat_model": True,
            "use_4bit_quantization": True,  # Enable 4-bit quantization for 16GB T4
            "use_8bit_quantization": False
@@ -29,7 +29,7 @@ LLM_CONFIG = {
            "latency_target": "<100ms",
            "is_chat_model": True,
            "use_4bit_quantization": True,
-            "fallback": "mistralai/Mistral-7B-Instruct-v0.2"
+            "fallback": "microsoft/Phi-3-mini-4k-instruct"  # Non-gated fallback (3.8B, verified non-gated)
        },
        "safety_checker": {
            "model_id": "Qwen/Qwen2.5-7B-Instruct",  # Same model for all text tasks
@@ -38,7 +38,7 @@ LLM_CONFIG = {
            "purpose": "bias_detection",
            "is_chat_model": True,
            "use_4bit_quantization": True,
-            "fallback": "mistralai/Mistral-7B-Instruct-v0.2"
+            "fallback": "microsoft/Phi-3-mini-4k-instruct"  # Non-gated fallback (3.8B, verified non-gated)
        }
    },
    "routing_logic": {
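
The "verified non-gated" claim can be re-checked against the Hub. A small sketch using huggingface_hub.model_info, assuming its ModelInfo.gated field behaves as documented (falsy for public repos, "auto" or "manual" for gated ones):

    # Sketch: confirm the fallback repo is publicly accessible.
    from huggingface_hub import model_info

    def is_gated(model_id: str = "microsoft/Phi-3-mini-4k-instruct") -> bool:
        info = model_info(model_id)
        return bool(info.gated)  # falsy for public repos, "auto"/"manual" for gated ones

    print(is_gated())  # expected: False for microsoft/Phi-3-mini-4k-instruct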