"cacheflow/vscode:/vscode.git/clone" did not exist on "a283ec2eece57454ec9301e5542cffa1201e175f"
Unverified commit 78b8015a, authored by Bowen Bao and committed by GitHub
Browse files

[Bugfix] Relax tokenizer regex for mixtral to include 'tokenizer.model' (#25964)


Signed-off-by: Bowen Bao <bowenbao@amd.com>
parent 831b1241
...@@ -122,15 +122,21 @@ def list_local_repo_files(repo_id: str, revision: Optional[str]) -> list[str]: ...@@ -122,15 +122,21 @@ def list_local_repo_files(repo_id: str, revision: Optional[str]) -> list[str]:
def find_tokenizer_file(files: list[str]): def find_tokenizer_file(files: list[str]):
# Accept both versioned (tokenizer.model.v3) and unversioned
# (tokenizer.model) forms, plus tekken.json and tokenizer.mm.model
# variants. Previous pattern only matched the versioned variants.
file_pattern = re.compile( file_pattern = re.compile(
r"^tokenizer\.model\.v.*$|^tekken\.json$|^tokenizer\.mm\.model\.v.*$") r"^tokenizer\.model(\.v.*)?|tekken\.json|tokenizer\.mm\.model(\.v.*)?$"
)
matched_files = [file for file in files if file_pattern.match(file)] matched_files = [file for file in files if file_pattern.match(file)]
if len(matched_files) > 1: if len(matched_files) > 1:
raise OSError( logger.warning(
f"Found {len(matched_files)} files matching the " "Multiple files matched pattern `%s`: %s. Using %s.",
f"pattern: `{file_pattern.pattern}`. Make sure only one Mistral " file_pattern.pattern,
f"tokenizer is present in {files}.") matched_files,
matched_files[0],
)
elif len(matched_files) == 0: elif len(matched_files) == 0:
raise OSError( raise OSError(
f"Found {len(matched_files)} files matching the " f"Found {len(matched_files)} files matching the "
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment