Instructions to use lucafirefox/Phi-3-small-8k-instruct_handler with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use lucafirefox/Phi-3-small-8k-instruct_handler with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="lucafirefox/Phi-3-small-8k-instruct_handler", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("lucafirefox/Phi-3-small-8k-instruct_handler", trust_remote_code=True, dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use lucafirefox/Phi-3-small-8k-instruct_handler with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "lucafirefox/Phi-3-small-8k-instruct_handler"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "lucafirefox/Phi-3-small-8k-instruct_handler",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
Use Docker
docker model run hf.co/lucafirefox/Phi-3-small-8k-instruct_handler
- SGLang
How to use lucafirefox/Phi-3-small-8k-instruct_handler with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "lucafirefox/Phi-3-small-8k-instruct_handler" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "lucafirefox/Phi-3-small-8k-instruct_handler",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
Use Docker images
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "lucafirefox/Phi-3-small-8k-instruct_handler" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "lucafirefox/Phi-3-small-8k-instruct_handler",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
- Docker Model Runner
How to use lucafirefox/Phi-3-small-8k-instruct_handler with Docker Model Runner:
docker model run hf.co/lucafirefox/Phi-3-small-8k-instruct_handler
| # Adapted from https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/tokenization_qwen.py | |
| import os | |
| from typing import Collection, List, Optional, Dict, Set, Tuple, Union | |
| from functools import cached_property | |
| import base64 | |
| from transformers import PreTrainedTokenizer, AddedToken, AutoConfig | |
| from transformers.models.auto.tokenization_auto import get_tokenizer_config | |
| import tiktoken | |
| """ | |
| This tokenizer is almost identical to tiktoken.get_encoding("cl100k_base") | |
| with a few additional special tokens to support the ChatML format. | |
| TODO(bapatra): Right now, I do not save the special tokens to the vocab file. | |
| Maybe in the future, that would be useful? Can add that support later. | |
| """ | |
def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
    """Read a tiktoken BPE vocabulary file into a ``token bytes -> rank`` map.

    Every non-empty line of the file holds two whitespace-separated fields:
    the base64-encoded token and its integer rank.

    Args:
        tiktoken_bpe_file: Path of the vocabulary file; it is read in binary mode.

    Returns:
        A dictionary keyed by the decoded token bytes, valued by the rank.
    """
    with open(tiktoken_bpe_file, "rb") as handle:
        raw_lines = handle.read().splitlines()

    ranks: Dict[bytes, int] = {}
    for raw_line in raw_lines:
        if not raw_line:
            # Empty lines carry no entry.
            continue
        encoded_token, rank = raw_line.split()
        ranks[base64.b64decode(encoded_token)] = int(rank)
    return ranks
# The Megatron codebase pads vocabularies so that the vocab matrix
# multiplication is fast, which leaves some indices without a token. Those
# empty indices are accounted for by adding dummy tokens to the tokenizer.
EFFECTIVE_PADDED_VOCAB_SIZE = 100352
ACTUAL_VOCAB_SIZE = 100276

# Padding entries "<|dummy_id_12|>" ... "<|dummy_id_86|>", one for every id
# from ACTUAL_VOCAB_SIZE + 1 up to EFFECTIVE_PADDED_VOCAB_SIZE - 1.
DUMMY_TOKENS = {
    f"<|dummy_id_{token_id - ACTUAL_VOCAB_SIZE + 11}|>": token_id
    for token_id in range(ACTUAL_VOCAB_SIZE + 1, EFFECTIVE_PADDED_VOCAB_SIZE)
}

SPECIAL_TOKENS = {
    # tiktoken.get_encoding("cl100k_base")._special_tokens
    "<|endoftext|>": 100257,
    "<|fim_prefix|>": 100258,
    "<|fim_middle|>": 100259,
    "<|fim_suffix|>": 100260,
    # Special tokens for post-training
    "<|system|>": 100261,
    "<|user|>": 100262,
    "<|assistant|>": 100263,
    # Dummy unused tokens
    "<|dummy_id_0|>": 100264,
    "<|dummy_id_1|>": 100265,
    # Special tokens for post-training, continued
    "<|end|>": 100266,
    # Dummy tokens keep the id space contiguous. Id 100256 of
    # tiktoken.get_encoding("cl100k_base") does not map to anything, so a
    # dummy token takes its place.
    "<|dummy_id_2|>": 100256,
    # Ids 100267 to 100275 are unused as well:
    # "<|dummy_id_3|>" -> 100267, ..., "<|dummy_id_11|>" -> 100275.
    **{f"<|dummy_id_{n}|>": 100264 + n for n in range(3, 12)},
    # The final end-of-prompt token (unused, but present as a part of
    # tiktoken.get_encoding("cl100k_base")._special_tokens)
    "<|endofprompt|>": 100276,
    # Dummy tokens that account for the padding of the vocabulary; the
    # padding ensures tensor cores are used for the vocab multiplication.
    **DUMMY_TOKENS,
}
class Phi3SmallTokenizer(PreTrainedTokenizer):
    """tiktoken-backed tokenizer for Phi-3-small.

    The encoding is built from the ``cl100k_base`` split pattern and merge
    ranks plus the entries of ``SPECIAL_TOKENS`` (chat markers and dummy
    padding tokens). Regular tokens are represented as ``bytes``; special
    tokens are represented as ``str``.
    """

    vocab_files_names = {
        "vocab_file": "cl100k_base.tiktoken"
    }

    model_input_names: List[str] = ["input_ids", "attention_mask"]
    padding_side = "left"

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        errors: str = "replace",
        **kwargs
    ) -> None:
        """Build the tokenizer.

        Args:
            vocab_file: Optional path to a tiktoken BPE file. When ``None``,
                the merge ranks of ``tiktoken.get_encoding("cl100k_base")``
                are used.
            errors: Error policy handed to UTF-8 decoding when tokens are
                turned back into text.
            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``.
        """
        # PreTrainedTokenizer's init calls _add_tokens, which in turn checks
        # if the token is present in `self.special_tokens`. Hence it must be
        # set before calling super().__init__.
        # The way Qwen gets around this is by checking against SPECIAL_TOKENS,
        # but checking against the object's own `special_tokens` is preferred
        # in case the tokenizer eventually allows custom special tokens.
        self.special_tokens = SPECIAL_TOKENS

        super().__init__(**kwargs)
        self.errors = errors

        base = tiktoken.get_encoding("cl100k_base")
        if vocab_file is None:
            self.mergeable_ranks: Dict[bytes, int] = base._mergeable_ranks
        else:
            self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)

        self.pat_str = base._pat_str

        self.tokenizer = self._build_encoding()

        # id -> token lookup covering both regular (bytes) and special (str) tokens.
        self.decoder: Dict[int, bytes] = {
            v: k for k, v in self.mergeable_ranks.items()
        }
        self.decoder.update({v: k for k, v in self.special_tokens.items()})

        self.eod_id = self.tokenizer.eot_token
        self._eos_token = self._convert_id_to_token(self.eod_id)

        # Setting the bos_token to be the same as the eos_token.
        # Note that this is **not** the correct thing to do, and is done
        # just so that some of the downstream libraries do not break.
        self._bos_token = self._eos_token

        # Expose the ids of the chat-format special tokens as attributes.
        self.system_id = self.special_tokens["<|system|>"]
        self.user_id = self.special_tokens["<|user|>"]
        self.assistant_id = self.special_tokens["<|assistant|>"]
        self.end_id = self.special_tokens["<|end|>"]

    def _build_encoding(self) -> "tiktoken.Encoding":
        """Create the tiktoken encoding from the current pattern, ranks and special tokens.

        Shared by ``__init__`` and ``__setstate__`` so that both construct the
        encoding in exactly the same way.
        """
        return tiktoken.Encoding(
            name="phi3small",
            pat_str=self.pat_str,
            mergeable_ranks=self.mergeable_ranks,
            special_tokens=self.special_tokens,
        )

    @cached_property
    def dummy_token_indices(self) -> List[int]:
        """Sorted ids of all tokens that are treated as dummy tokens.

        This covers every special token whose name contains ``"dummy_id"``
        plus the cl100k_base special tokens that are not used here
        (the three FIM tokens and ``<|endofprompt|>``).
        """
        # There are some additional special tokens in the cl100k_base tokenizer
        # that we do not use. Hence, we also consider them to be dummy tokens.
        additional_tokens = [
            "<|fim_prefix|>",
            "<|fim_middle|>",
            "<|fim_suffix|>",
            "<|endofprompt|>"
        ]
        dummy_token_indices = [
            index for token, index in self.special_tokens.items() if "dummy_id" in token
        ]
        dummy_token_indices.extend(self.special_tokens[token] for token in additional_tokens)
        return sorted(dummy_token_indices)

    def __getstate__(self):
        """Return a copy of the instance dict without the ``tokenizer`` entry.

        NOTE(review): presumably the tiktoken encoding is dropped because it
        cannot be pickled -- confirm. It is rebuilt in ``__setstate__``.
        """
        state = self.__dict__.copy()
        del state["tokenizer"]
        return state

    def __setstate__(self, state):
        """Restore the instance dict and rebuild the tiktoken encoding."""
        self.__dict__ = state
        self.tokenizer = self._build_encoding()

    def __len__(self):
        """Return the vocabulary size reported by the tiktoken encoding."""
        return self.tokenizer.n_vocab

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        *init_inputs,
        **kwargs,
    ):
        """Instantiate the tokenizer for a model name or path.

        ``model_max_length`` (and ``chat_template``) are taken from the
        tokenizer config when one is found; otherwise ``model_max_length`` is
        taken from the model config's ``max_position_embeddings``.

        Args:
            pretrained_model_name_or_path: Model identifier or directory.
            *init_inputs: Accepted for signature compatibility; not used.
            **kwargs: Forwarded to ``get_tokenizer_config`` and, together with
                the resolved settings, to the constructor.

        Returns:
            A new instance of ``cls``.
        """
        cls_kwargs = kwargs
        # First try to load from the tokenization config if it exists
        tokenization_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
        if tokenization_config:
            cls_kwargs.update(
                dict(
                    model_max_length=tokenization_config["model_max_length"],
                    chat_template=tokenization_config.get("chat_template", None)
                )
            )
        else:
            # NOTE: this loads the model config with trust_remote_code=True,
            # i.e. code shipped with the checkpoint is trusted here.
            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
            cls_kwargs["model_max_length"] = config.max_position_embeddings
        return cls(**cls_kwargs)

    def get_vocab(self) -> Dict[Union[str, bytes], int]:
        """Return the full token -> id map (regular tokens plus special tokens)."""
        return {**self.mergeable_ranks, **self.special_tokens}

    def convert_tokens_to_ids(
        self,
        tokens: Union[bytes, str, List[Union[bytes, str]]]
    ) -> Union[int, List[int]]:
        """Map a token, or a list of tokens, to id(s).

        A single token is looked up among the special tokens first and the
        merge ranks second; an unknown single token yields ``None``. A list
        is converted element by element.
        """
        if isinstance(tokens, (str, bytes)):
            if tokens in self.special_tokens:
                return self.special_tokens[tokens]
            return self.mergeable_ranks.get(tokens)
        return [self.convert_tokens_to_ids(token) for token in tokens]

    def _add_tokens(
        self,
        new_tokens: Union[List[str], List[AddedToken]],
        special_tokens: bool = False,
    ) -> int:
        """Validate token additions; nothing is ever actually added.

        Raises:
            ValueError: If non-special tokens are requested, or if a special
                token is not already part of ``self.special_tokens``.

        Returns:
            Always ``0``, since the vocabulary is never extended.
        """
        if not special_tokens and new_tokens:
            raise ValueError("Only special tokens can be added to this tokenizer")
        for token in new_tokens:
            surface_form = token.content if isinstance(token, AddedToken) else token
            if surface_form not in self.special_tokens:
                raise ValueError(
                    "For now, we do not support unknown special tokens\n"
                    "In the future, if there is a need for this, we can add special tokens to the tokenizer\n"
                    "starting from rank 100261 - 100263 and then 100266 - 100275.\n"
                    "And finally, we can re-construct the enc object back\n"
                )
        return 0

    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
        """Write the merge ranks to ``cl100k_base.tiktoken`` in ``save_directory``.

        Each line is ``<base64 token> <rank>``. Special tokens are not
        written. Extra keyword arguments are accepted and ignored.

        Returns:
            A one-element tuple holding the path of the written file.
        """
        file_path = os.path.join(save_directory, "cl100k_base.tiktoken")
        with open(file_path, "w") as f:
            for token, rank in self.mergeable_ranks.items():
                line = base64.b64encode(token).decode("utf-8") + " " + str(rank) + "\n"
                f.write(line)
        return (file_path,)

    def tokenize(
        self,
        text: str,
        allowed_special: Union[Set, str] = "all",
        disallowed_special: Union[Collection, str] = (),
        **kwargs
    ) -> List[Union[bytes, str]]:
        """Encode ``text`` with tiktoken and return the tokens behind the ids.

        Args:
            text: Text to tokenize.
            allowed_special: Passed through to ``tiktoken.Encoding.encode``.
            disallowed_special: Passed through to ``tiktoken.Encoding.encode``.
            **kwargs: Accepted and ignored.

        Returns:
            One token per produced id: ``bytes`` for regular tokens, ``str``
            for special tokens.
        """
        token_ids = self.tokenizer.encode(
            text, allowed_special=allowed_special, disallowed_special=disallowed_special
        )
        return [self.decoder[token_id] for token_id in token_ids]

    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
        """Convert a sequence of tokens into a single string.

        Consecutive ``bytes`` tokens are concatenated and decoded together as
        UTF-8 (using ``self.errors``), so multi-byte characters split across
        tokens are decoded correctly; ``str`` tokens are appended verbatim.

        Raises:
            TypeError: If a token is neither ``bytes`` nor ``str``.
        """
        pieces: List[str] = []
        pending = b""
        for t in tokens:
            if isinstance(t, str):
                if pending:
                    pieces.append(pending.decode("utf-8", errors=self.errors))
                    pending = b""
                pieces.append(t)
            elif isinstance(t, bytes):
                pending += t
            else:
                raise TypeError("token should only be of type bytes or str")
        if pending:
            pieces.append(pending.decode("utf-8", errors=self.errors))
        return "".join(pieces)

    @property
    def vocab_size(self):
        """Vocabulary size reported by the tiktoken encoding."""
        return self.tokenizer.n_vocab

    @property
    def eos_token_id(self) -> int:
        """Id of the end-of-text token of the underlying encoding."""
        return self.eod_id

    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
        """Converts an id to a token, special tokens included.

        Raises:
            ValueError: If ``index`` is not a known id.
        """
        if index in self.decoder:
            return self.decoder[index]
        raise ValueError("unknown ids")

    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
        """Converts a token to an id using the vocab, special tokens included.

        Raises:
            ValueError: If ``token`` is neither a special nor a regular token.
        """
        if token in self.special_tokens:
            return self.special_tokens[token]
        if token in self.mergeable_ranks:
            return self.mergeable_ranks[token]
        raise ValueError("unknown token")

    def _tokenize(self, text: str, **kwargs):
        """
        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
        Do NOT take care of added tokens.

        Not implemented for this tokenizer; ``tokenize`` is overridden directly.
        """
        raise NotImplementedError

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        errors: Optional[str] = None,
        **kwargs,
    ) -> str:
        """Decode id(s) back into text.

        Args:
            token_ids: A single id or a list of ids.
            skip_special_tokens: When true, ids at or above the lowest special
                token id are dropped before decoding.
            errors: Decoding error policy; falls back to ``self.errors``.
            **kwargs: Accepted and ignored.
        """
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        if skip_special_tokens:
            # The lowest special id is used as the cutoff rather than the
            # end-of-text id, because "<|dummy_id_2|>" sits just below it.
            first_special_id = min(self.special_tokens.values())
            token_ids = [i for i in token_ids if i < first_special_id]
        return self.tokenizer.decode(token_ids, errors=errors or self.errors)