Unverified commit 07bd7e23, authored by Baber Abbasi, committed by GitHub
Browse files

initialize tokenizer with bos_token (#2781)

parent ebb498e4
......@@ -184,6 +184,7 @@ class HFLM(TemplateLM):
trust_remote_code=trust_remote_code,
use_fast_tokenizer=use_fast_tokenizer,
gguf_file=gguf_file,
add_bos_token=add_bos_token,
)
# if we passed `pretrained` as a string, initialize our model now
......@@ -688,6 +689,7 @@ class HFLM(TemplateLM):
trust_remote_code: Optional[bool] = False,
use_fast_tokenizer: Optional[bool] = True,
gguf_file: Optional[str] = None,
add_bos_token: Optional[bool] = False,
) -> None:
"""
Helper method during initialization.
......@@ -706,6 +708,9 @@ class HFLM(TemplateLM):
else:
kwargs["use_fast"] = use_fast_tokenizer
if add_bos_token:
kwargs["add_bos_token"] = True
if tokenizer:
if isinstance(tokenizer, str):
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
......
......@@ -123,6 +123,7 @@ class VLLM(TemplateLM):
tokenizer_mode=tokenizer_mode,
trust_remote_code=trust_remote_code,
revision=tokenizer_revision,
add_bos_token=add_bos_token,
)
self.tokenizer = configure_pad_token(self.tokenizer)
self.add_bos_token = add_bos_token
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment