"vscode:/vscode.git/clone" did not exist on "f007cd060bc5e05eedd8be2e906dee207bd0979e"
Unverified Commit 1fe554ba authored by Maximilien de Bayser, committed by GitHub
Browse files

treat do_lower_case in the same way as the sentence-transformers library (#11815)


Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
parent 615e4a54
......@@ -35,6 +35,7 @@ class MockModelConfig:
logits_processor_pattern = None
diff_sampling_param: Optional[dict] = None
allowed_local_media_path: str = ""
encoder_config = None
def get_diff_sampling_param(self):
return self.diff_sampling_param or {}
......
......@@ -15,6 +15,7 @@ from ..utils import check_embeddings_close
# [Encoder-only]
pytest.param("BAAI/bge-base-en-v1.5",
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
pytest.param("intfloat/multilingual-e5-large"),
# [Encoder-decoder]
pytest.param("intfloat/e5-mistral-7b-instruct",
......
......@@ -160,6 +160,11 @@ class OpenAIServing:
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]],
add_special_tokens: bool,
) -> TextTokensPrompt:
if (self.model_config.encoder_config is not None
and self.model_config.encoder_config.get(
"do_lower_case", False)):
prompt = prompt.lower()
if truncate_prompt_tokens is None:
encoded = tokenizer(prompt, add_special_tokens=add_special_tokens)
else:
......
......@@ -190,6 +190,12 @@ class InputPreprocessor:
# on the task and language of their request. Also needed to avoid
# appending an EOS token to the prompt which disrupts generation.
add_special_tokens = False
if (self.model_config.encoder_config is not None
and self.model_config.encoder_config.get(
"do_lower_case", False)):
prompt = prompt.lower()
return tokenizer.encode(request_id=request_id,
prompt=prompt,
lora_request=lora_request,
......
......@@ -26,11 +26,6 @@ def init_tokenizer_from_configs(model_config: ModelConfig,
trust_remote_code=model_config.trust_remote_code,
revision=model_config.tokenizer_revision)
if (model_config.encoder_config is not None
and "do_lower_case" in model_config.encoder_config):
init_kwargs["do_lower_case"] = model_config.encoder_config[
"do_lower_case"]
return get_tokenizer_group(parallel_config.tokenizer_pool_config,
**init_kwargs)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment