Unverified commit 4cec66e4, authored by Baber Abbasi, committed by GitHub
Browse files

[API] tokenizer: add trust-remote-code (#2372)



* tokenizer: trust-remote-code

* pre-commit

---------
Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>
parent aa457edc
...@@ -73,9 +73,12 @@ class TemplateAPI(TemplateLM): ...@@ -73,9 +73,12 @@ class TemplateAPI(TemplateLM):
seed: int = 1234, seed: int = 1234,
max_length: Optional[int] = 2048, max_length: Optional[int] = 2048,
add_bos_token: bool = False, add_bos_token: bool = False,
custom_prefix_token_id=None, custom_prefix_token_id: int = None,
# send the requests as tokens or strings # send the requests as tokens or strings
tokenized_requests=True, tokenized_requests: bool = True,
trust_remote_code: bool = False,
revision: Optional[str] = "main",
use_fast_tokenizer: bool = True,
**kwargs, **kwargs,
) -> None: ) -> None:
super().__init__() super().__init__()
...@@ -128,7 +131,10 @@ class TemplateAPI(TemplateLM): ...@@ -128,7 +131,10 @@ class TemplateAPI(TemplateLM):
import transformers import transformers
self.tokenizer = transformers.AutoTokenizer.from_pretrained( self.tokenizer = transformers.AutoTokenizer.from_pretrained(
self.tokenizer if self.tokenizer else self.model self.tokenizer if self.tokenizer else self.model,
trust_remote_code=trust_remote_code,
revision=revision,
use_fast=use_fast_tokenizer,
) )
# Not used as the API will handle padding but to mirror the behavior of the HFLM # Not used as the API will handle padding but to mirror the behavior of the HFLM
self.tokenizer = configure_pad_token(self.tokenizer) self.tokenizer = configure_pad_token(self.tokenizer)
...@@ -153,6 +159,9 @@ class TemplateAPI(TemplateLM): ...@@ -153,6 +159,9 @@ class TemplateAPI(TemplateLM):
assert isinstance(tokenizer, str), "tokenizer must be a string" assert isinstance(tokenizer, str), "tokenizer must be a string"
self.tokenizer = transformers.AutoTokenizer.from_pretrained( self.tokenizer = transformers.AutoTokenizer.from_pretrained(
tokenizer, tokenizer,
trust_remote_code=trust_remote_code,
revision=revision,
use_fast=use_fast_tokenizer,
) )
@abc.abstractmethod @abc.abstractmethod
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment