Unverified Commit 2d11f2e5 authored by Baber Abbasi, committed by GitHub

[API] left truncate for generate_until (#2554)

* left truncate for generate_until

* pre-commit
parent bcb4cbf4
@@ -209,7 +209,7 @@ Not supported yet: multi-node evaluation and combinations of data replication wi
Pipeline parallelism during evaluation is supported with OpenVINO models.
To enable pipeline parallelism, set `pipeline_parallel` in `model_args`. In addition, set `device` to `HETERO:<GPU index1>,<GPU index2>`, for example `HETERO:GPU.1,GPU.0`. The command to use pipeline paralelism of 2 is, for example:
To enable pipeline parallelism, set `pipeline_parallel` in `model_args`. In addition, set `device` to `HETERO:<GPU index1>,<GPU index2>`, for example `HETERO:GPU.1,GPU.0`. The command to use pipeline parallelism of 2 is, for example:
```
lm_eval --model openvino \
    ...
```
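A plausible complete invocation, assuming a local OpenVINO model directory (`<MODEL_DIR>` and the task name are placeholders, not part of this commit):

```
lm_eval --model openvino \
    --model_args pretrained=<MODEL_DIR>,pipeline_parallel=True \
    --device HETERO:GPU.1,GPU.0 \
    --tasks <TASK>
```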
@@ -448,9 +448,13 @@ class TemplateAPI(TemplateLM):
for chunk in chunks:
for cache_key, context_enc, continuation_enc in chunk:
# max_length - 1 as we always have 1 token for generation
inp = (context_enc + continuation_enc)[-(self.max_length) :]
inp = (context_enc + continuation_enc)[-self.max_length :]
if len(inp) < len(context_enc + continuation_enc):
eval_logger.warning(
f"Context length ({len(context_enc)}) + continuation length ({len(continuation_enc)}) > max_length ({self.max_length}). Left truncating context."
)
ctxlen = len(context_enc) - max(
0, len(context_enc) + len(continuation_enc) - (self.max_length)
0, len(context_enc) + len(continuation_enc) - self.max_length
)
inputs.append(inp)
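The truncation arithmetic above is easier to see in isolation. A minimal standalone sketch (the function name and toy token lists are illustrative, not from this commit):

```python
def left_truncate(context_enc, continuation_enc, max_length):
    # Keep only the rightmost max_length tokens of context + continuation.
    inp = (context_enc + continuation_enc)[-max_length:]
    # ctxlen counts the surviving context tokens: any overflow beyond
    # max_length is trimmed from the left of the context, never from
    # the continuation.
    ctxlen = len(context_enc) - max(
        0, len(context_enc) + len(continuation_enc) - max_length
    )
    return inp, ctxlen


# 6 context tokens + 3 continuation tokens with max_length=7:
# two context tokens are dropped from the left.
inp, ctxlen = left_truncate([1, 2, 3, 4, 5, 6], [7, 8, 9], 7)
assert inp == [3, 4, 5, 6, 7, 8, 9] and ctxlen == 4
```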
@@ -594,6 +598,24 @@ class TemplateAPI(TemplateLM):
pbar = tqdm(desc="Requesting API", total=len(requests))
for chunk in chunked:
contexts, all_gen_kwargs, encodings_list = zip(*chunk)
if self.tokenized_requests:
max_gen_toks = all_gen_kwargs[0].get(
"max_gen_toks", self._max_gen_toks
)
max_context_len = self.max_length - max_gen_toks
if any(
    len(x) + max_gen_toks > self.max_length for x in encodings_list
):
    eval_logger.warning(
        f"Some contexts exceeded max_length ({self.max_length}) - max_gen_toks ({max_gen_toks}); they were left truncated."
    )
encodings_list = [x[-max_context_len:] for x in encodings_list]
else:
eval_logger.info(
"Tokenized requests are disabled. Context + generation length is not checked."
)
req = encodings_list if self.tokenized_requests else contexts
outputs = retry(
stop=stop_after_attempt(self.max_retries),
@@ -625,6 +647,24 @@ class TemplateAPI(TemplateLM):
else:
for chunk in chunked:
contexts, all_gen_kwargs, encodings_list = zip(*chunk)
if self.tokenized_requests:
max_gen_toks = all_gen_kwargs[0].get(
"max_gen_toks", self._max_gen_toks
)
max_context_len = self.max_length - max_gen_toks
if any(
    len(x) + max_gen_toks > self.max_length for x in encodings_list
):
    eval_logger.warning(
        f"Some contexts exceeded max_length ({self.max_length}) - max_gen_toks ({max_gen_toks}); they were left truncated."
    )
encodings_list = [x[-max_context_len:] for x in encodings_list]
else:
eval_logger.info(
"Tokenized requests are disabled. Context + generation length is not checked."
)
req = encodings_list if self.tokenized_requests else contexts
results = itertools.chain.from_iterable(
asyncio.run(
...
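Both the synchronous and asynchronous branches above apply the same budget rule: reserve `max_gen_toks` tokens of `max_length` for generation and left-truncate each tokenized context to what remains. A standalone sketch (function name and toy inputs are hypothetical):

```python
def truncate_contexts(encodings_list, max_gen_toks, max_length):
    # Reserve max_gen_toks of the max_length budget for generation,
    # then left-truncate each tokenized context to the remaining budget.
    max_context_len = max_length - max_gen_toks
    truncated = [enc[-max_context_len:] for enc in encodings_list]
    # Flag whether any context actually lost tokens (checked against the
    # original lengths, since post-truncation lengths always fit).
    any_truncated = any(
        len(old) > len(new) for old, new in zip(encodings_list, truncated)
    )
    return truncated, any_truncated


# max_length=10, max_gen_toks=4 -> contexts are capped at 6 tokens.
encs, flag = truncate_contexts([list(range(9)), [0, 1, 2]], 4, 10)
assert encs[0] == [3, 4, 5, 6, 7, 8] and encs[1] == [0, 1, 2] and flag
```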
@@ -71,9 +71,11 @@ class OptimumLM(HFLM):
else:
model_kwargs["ov_config"] = {}
model_kwargs["ov_config"].setdefault("CACHE_DIR", "")
if 'pipeline_parallel' in model_kwargs:
if model_kwargs['pipeline_parallel']:
model_kwargs["ov_config"]["MODEL_DISTRIBUTION_POLICY"] = "PIPELINE_PARALLEL"
if "pipeline_parallel" in model_kwargs:
if model_kwargs["pipeline_parallel"]:
model_kwargs["ov_config"]["MODEL_DISTRIBUTION_POLICY"] = (
"PIPELINE_PARALLEL"
)
model_file = Path(pretrained) / "openvino_model.xml"
if model_file.exists():
export = False
...
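For clarity, the net effect of the `ov_config` handling above, as a standalone sketch (the literal `model_kwargs` dict stands in for what `--model_args` would produce):

```python
model_kwargs = {"pipeline_parallel": True}  # hypothetical --model_args input
model_kwargs.setdefault("ov_config", {})
model_kwargs["ov_config"].setdefault("CACHE_DIR", "")
if model_kwargs.get("pipeline_parallel"):
    # Spread the model across the devices listed in device=HETERO:<...>.
    model_kwargs["ov_config"]["MODEL_DISTRIBUTION_POLICY"] = "PIPELINE_PARALLEL"

assert model_kwargs["ov_config"] == {
    "CACHE_DIR": "",
    "MODEL_DISTRIBUTION_POLICY": "PIPELINE_PARALLEL",
}
```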