fix auto loading gpt2 tokenizer (#5279)

32cb7449 · Michelle · GitHub · 5d9a0ae7 · 32cb7449
Unverified Commit 32cb7449 authored Jan 18, 2024 by Michelle Committed by GitHub Jan 18, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 13 additions and 0 deletions

applications/ColossalQA/colossalqa/local/llm.py applications/ColossalQA/colossalqa/local/llm.py +13 -0

No files found.
--- a/applications/ColossalQA/colossalqa/local/llm.py
+++ b/applications/ColossalQA/colossalqa/local/llm.py
@@ -136,6 +136,19 @@ class ColossalLLM(LLM):
        """Get the identifying parameters."""
        return {"n": self.n}

+    def get_token_ids(self, text: str) -> List[int]:
+        """Return the ordered ids of the tokens in a text.
+
+        Args:
+            text: The string input to tokenize.
+
+        Returns:
+            A list of ids corresponding to the tokens in the text, in the order
+                they occur in the text.
+        """
+        # Encode with the colossal llm's own tokenizer (self.api.tokenizer) instead of langchain's cached GPT2 tokenizer.
+        return self.api.tokenizer.encode(text)
+

 class VllmLLM(LLM):
    """