"docs/en/notes/compatibility.md" did not exist on "19f7b65a7d2102c44d686f8d7c0d97927cc56967"
Commit c2a07d9a authored by Baber

add docs

parent 5c6e9b50
@@ -327,7 +327,7 @@ class TemplateLM(LM):
@property
@abc.abstractmethod
-def eot_token_id(self):
+def eot_token_id(self) -> int:
pass
@property
@@ -351,6 +351,32 @@ class TemplateLM(LM):
def _encode_pair(
self, context: str, continuation: str
) -> tuple[list[int], list[int]]:
"""
Encode a context-continuation pair into separate token ID lists.
This method tokenizes the context and continuation strings with careful
handling of the word boundary: trailing spaces in the context are moved
to the beginning of the continuation to ensure correct tokenization at the
boundary.
For Seq2Seq models (encoder-decoder), context and continuation are encoded
separately. For other model types (decoder-only), the full sequence is encoded
together to ensure proper tokenization, then split at the context boundary.
:param context: str
The context string. Can be empty (will be handled by the caller).
:param continuation: str
The continuation string to be scored.
:return: tuple[list[int], list[int]]
A tuple of (context_enc, continuation_enc) where:
- context_enc: Token IDs for the context
- continuation_enc: Token IDs for the continuation
Note:
This method does NOT handle empty context. The caller should
handle empty context (see loglikelihood method).
"""
import transformers
n_spaces = len(context) - len(context.rstrip())
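
The split-at-the-boundary behavior described in the docstring above can be illustrated with a small sketch. This is not the library's code: `encode_pair_sketch` is a hypothetical helper and `tok_encode` stands in for the model's tokenizer call, assumed to map a string to a list of token IDs.

```python
def encode_pair_sketch(tok_encode, context: str, continuation: str):
    """Sketch of decoder-only context/continuation encoding (illustrative only)."""
    # Move trailing spaces from the context onto the continuation so the
    # word boundary tokenizes the same way it does in running text.
    n_spaces = len(context) - len(context.rstrip())
    if n_spaces > 0:
        continuation = context[-n_spaces:] + continuation
        context = context[:-n_spaces]

    # Decoder-only case: encode the full sequence once, then split the
    # token list at the length of the separately encoded context.
    whole_enc = tok_encode(context + continuation)
    context_enc = tok_encode(context)
    continuation_enc = whole_enc[len(context_enc):]
    return context_enc, continuation_enc
```

For example, a context of `"The answer is "` with continuation `"Paris"` becomes context `"The answer is"` and continuation `" Paris"` before encoding, so the space is tokenized as part of the continuation.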
@@ -375,6 +401,31 @@ class TemplateLM(LM):
def loglikelihood(
self, requests: list["Instance"], disable_tqdm: bool = False
) -> list[tuple[float, bool]]:
"""
Compute log-likelihood of generating continuations from contexts.
This is the concrete implementation for TemplateLM and its subclasses.
It tokenizes context-continuation pairs and delegates scoring to
_loglikelihood_tokens.
**IMPORTANT**: This method is expected to handle empty context strings.
When context is empty (""), it uses the model's prefix_token_id (typically
BOS or EOS token) as context. If the continuation already starts with the
prefix token, it reuses that token as context instead of duplicating it.
:param requests: list[Instance]
List of Instance objects with property `args` returning (context, continuation) tuples.
:param disable_tqdm: bool
Whether to disable the progress bar in _loglikelihood_tokens.
:return: list[tuple[float, bool]]
List of (log_prob, is_greedy) tuples for each request.
Implementation details:
- Empty context: Uses prefix_token_id (BOS/EOS) as context
- Non-empty context: Uses _encode_pair for proper tokenization
- Avoids token duplication when continuation starts with prefix_token_id
"""
new_reqs = []
for context, continuation in [req.args for req in requests]:
if context == "":
......
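
The empty-context rule described in the docstring above can be sketched as follows. This is an illustrative sketch, not the library's actual code; `build_pair_sketch` is a hypothetical helper, and `lm.tok_encode`, `lm.prefix_token_id`, and `lm._encode_pair` are assumed to behave as the docstrings describe.

```python
def build_pair_sketch(lm, context: str, continuation: str):
    """Sketch of how one request's (context, continuation) pair is tokenized."""
    if context == "":
        continuation_enc = lm.tok_encode(continuation)
        if continuation_enc and continuation_enc[0] == lm.prefix_token_id:
            # The continuation already starts with the prefix token:
            # reuse that token as the context instead of duplicating it.
            context_enc, continuation_enc = continuation_enc[:1], continuation_enc[1:]
        else:
            # Otherwise fall back to the prefix token (typically BOS/EOS).
            context_enc = [lm.prefix_token_id]
    else:
        # Non-empty context: defer to _encode_pair for boundary-safe encoding.
        context_enc, continuation_enc = lm._encode_pair(context, continuation)
    return context_enc, continuation_enc
```

The resulting `(context_enc, continuation_enc)` pairs are what get handed to `_loglikelihood_tokens` for scoring.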