#!/usr/bin/env python

# Produce a tiny tokenizer that we can use in testing (so that it won't take much space in the repo).

import json

from transformers import AutoTokenizer
from tokenizers import Tokenizer

mname = "gpt2"
vocab_keep_items = 5000

tokenizer = AutoTokenizer.from_pretrained(mname, use_fast=True)
assert tokenizer.is_fast, "This only works for fast tokenizers."
tokenizer_json = json.loads(tokenizer._tokenizer.to_str())
vocab = tokenizer_json["model"]["vocab"]

if tokenizer_json["model"]["type"] == "BPE":
    if "gpt2" in mname:
        # gpt2 keeps <|endoftext|> at the very end of its vocab, so reserve the last
        # kept slot for it and re-add it explicitly.
        new_vocab = {token: i for token, i in vocab.items() if i < vocab_keep_items - 1}
        new_vocab["<|endoftext|>"] = vocab_keep_items - 1
    else:
        new_vocab = {token: i for token, i in vocab.items() if i < vocab_keep_items}
    # Keep only the merge rules whose parts and merged result all survived the vocab truncation.
    merges = tokenizer_json["model"]["merges"]
    new_merges = []
    for merge in merges:
        a, b = merge.split()
        new_token = "".join((a, b))
        if a in new_vocab and b in new_vocab and new_token in new_vocab:
            new_merges.append(merge)
    tokenizer_json["model"]["merges"] = new_merges
elif tokenizer_json["model"]["type"] == "Unigram":
    # Unigram stores its vocab as a list of [token, score] pairs, so slicing is enough.
    new_vocab = vocab[:vocab_keep_items]
elif tokenizer_json["model"]["type"] in ("WordPiece", "WordLevel"):
    new_vocab = {token: i for token, i in vocab.items() if i < vocab_keep_items}
else:
    raise ValueError(f"don't know how to handle {tokenizer_json['model']['type']}")

tokenizer_json["model"]["vocab"] = new_vocab
tokenizer._tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))
tokenizer.save_pretrained("tiny")
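
# A minimal sanity check (a sketch, not part of the original recipe): reload the shrunken
# tokenizer from the "tiny" directory saved above and confirm it still encodes and decodes.
# The sample sentence is arbitrary; with gpt2's byte-level BPE the low-id byte tokens are
# kept, so the text should survive the round trip even though most merges were dropped.
tiny_tokenizer = AutoTokenizer.from_pretrained("tiny")
sample = "The quick brown fox jumps over the lazy dog"
ids = tiny_tokenizer(sample).input_ids
print(f"vocab size: {len(tiny_tokenizer)}")
print(f"ids: {ids}")
print(f"decoded: {tiny_tokenizer.decode(ids)}")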