Unverified Commit a8ae6403 authored by Lianmin Zheng, committed by GitHub

Improve docs and warnings (#1164)

parent d8476818
-<!-- Thank you for your contribution, we really appreciate it. The following instructions will help improve your pull request and make it easier to receive feedback. If there are any items you don't understand, don't worry. Just submit the pull request and ask the maintainers for help. -->
+<!-- Thank you for your contribution! We appreciate it. The following guidelines will help improve your pull request and facilitate feedback. If anything is unclear, don't hesitate to submit your pull request and ask the maintainers for assistance. -->
## Motivation
-<!-- Please explain the motivation behind this PR and the goal you aim to achieve with it. -->
+<!-- Explain the purpose of this PR and the goals it aims to achieve. -->
-## Modification
+## Modifications
-<!-- Briefly describe the changes made in this PR. -->
+<!-- Describe the changes made in this PR. -->
## Checklist
-- [ ] Before submitting a PR for review, make sure it has passed verification in your local development environment **at least**.
-- [ ] Ensure pre-commit `pre-commit run --all-files` or other linting tools are used to fix potential lint issues.
-- [ ] Confirm that modifications are covered by complete unit tests. If not, please add more unit tests for correctness.
-- [ ] Modify documentation as needed, such as docstrings or example tutorials.
+- [ ] Format your code according to the [Contributor Guide](https://github.com/sgl-project/sglang/blob/main/docs/en/contributor_guide.md).
+- [ ] Add unit tests as outlined in the [Contributor Guide](https://github.com/sgl-project/sglang/blob/main/docs/en/contributor_guide.md).
+- [ ] Update documentation as needed, including docstrings or example tutorials.
\ No newline at end of file
@@ -81,14 +81,17 @@ docker run --gpus all \
### Method 4: Using docker compose
+<details>
> This method is recommended if you plan to serve it as a service.
> A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
2. Execute the command `docker compose up -d` in your terminal.
+</details>
### Method 5: Run on Kubernetes or Clouds with SkyPilot
+<details>
To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
@@ -114,8 +117,6 @@ run: |
--port 30000
```
-</details>
```bash
# Deploy on any cloud or Kubernetes cluster. Use --cloud <cloud> to select a specific cloud provider.
HF_TOKEN=<secret> sky launch -c sglang --env HF_TOKEN sglang.yaml
@@ -124,7 +125,7 @@ HF_TOKEN=<secret> sky launch -c sglang --env HF_TOKEN sglang.yaml
sky status --endpoint 30000 sglang
```
3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve).
+</details>
### Common Notes
......
@@ -147,13 +147,12 @@ def get_tokenizer(
        and kwargs.get("use_fast", True)
        and tokenizer_name != _FAST_LLAMA_TOKENIZER
    ):
-        pass
-        # warnings.warn(
-        #     "For some LLaMA V1 models, initializing the fast tokenizer may "
-        #     "take a long time. To reduce the initialization time, consider "
-        #     f"using '{_FAST_LLAMA_TOKENIZER}' instead of the original "
-        #     "tokenizer."
-        # )
+        warnings.warn(
+            "For some LLaMA V1 models, initializing the fast tokenizer may "
+            "take a long time. To reduce the initialization time, consider "
+            f"using '{_FAST_LLAMA_TOKENIZER}' instead of the original "
+            "tokenizer."
+        )
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_name,
......
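For context on the warning this hunk re-enables: transformers converts a LLaMA V1 SentencePiece tokenizer to its fast (Rust) counterpart on the fly, which can take a long time, while a pre-converted fast tokenizer loads quickly. A minimal sketch of acting on the suggestion; the constant's value, `hf-internal-testing/llama-tokenizer`, is an assumption taken from the analogous vLLM helper and is not defined in this diff:

```python
from transformers import AutoTokenizer

# Assumption: value of _FAST_LLAMA_TOKENIZER, borrowed from the
# analogous vLLM utility; its definition is outside this diff.
_FAST_LLAMA_TOKENIZER = "hf-internal-testing/llama-tokenizer"

# Slow path (hypothetical model repo): the SentencePiece model is
# converted to a fast tokenizer at load time.
# tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")

# Fast path suggested by the warning: load the pre-converted
# fast tokenizer directly.
tokenizer = AutoTokenizer.from_pretrained(_FAST_LLAMA_TOKENIZER)
```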
@@ -270,7 +270,7 @@ class Req:
        if all_ids[prompt_tokens - 1] != self.origin_input_ids_unpadded[-1]:
            # TODO(lsyin): fix token fusion
-            warnings.warn(
+            logging.warning(
                "Token fusion between input and output, try to avoid this by removing the space at the end of the input."
            )
            return False
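The recurring change in this commit replaces `warnings.warn` with the `logging` module for runtime conditions like this one. A minimal standalone sketch of the behavioral difference (not from the diff): the default warnings filter deduplicates repeats from the same call site, while logging emits a record on every call and routes it through the configured handlers.

```python
import logging
import warnings

logging.basicConfig(level=logging.WARNING)

for step in range(3):
    # Printed once: the default warnings filter suppresses repeated
    # warnings from the same (message, category, location).
    warnings.warn("Token fusion between input and output")
    # Printed on every iteration, with level and logger name attached.
    logging.warning("Token fusion between input and output (step %d)", step)
```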
@@ -791,7 +791,7 @@ class ScheduleBatch:
            )
            if not torch.all(success):
-                warnings.warn("Sampling failed, fallback to top_k=1 strategy")
+                logging.warning("Sampling failed, fallback to top_k=1 strategy")
                probs = probs.masked_fill(torch.isnan(probs), 0.0)
                argmax_ids = torch.argmax(probs, dim=-1)
                batch_next_token_ids = torch.where(
......
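The hunk above is truncated at the `torch.where(` call. A hedged sketch of the full fallback pattern it implements, with the function name and tensor arguments assumed from the visible lines: where sampling failed, substitute the greedy (top_k=1) choice.

```python
import torch

def fallback_to_greedy(probs: torch.Tensor,
                       sampled_ids: torch.Tensor,
                       success: torch.Tensor) -> torch.Tensor:
    """Sketch: keep sampled tokens where sampling succeeded, else argmax."""
    # NaN probabilities are what typically makes sampling fail; zero
    # them out so argmax is well defined.
    probs = probs.masked_fill(torch.isnan(probs), 0.0)
    argmax_ids = torch.argmax(probs, dim=-1)
    # Per-sequence select: sampled token if success, greedy otherwise.
    return torch.where(success, sampled_ids, argmax_ids)
```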
@@ -774,7 +774,7 @@ class ModelTpServer:
            torch.cuda.empty_cache()
            logger.info("Cache flushed successfully!")
        else:
-            warnings.warn(
+            logging.warning(
                f"Cache not flushed because there are pending requests. "
                f"#queue-req: {len(self.waiting_queue)}, "
                f"#running-req: {0 if self.running_batch is None else len(self.running_batch.reqs)}"
......
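One design note on this hunk (an observation, not part of the diff): the message is an eagerly evaluated f-string, so the queue lengths are formatted even if WARNING-level output were disabled. logging also accepts lazy %-style arguments, which defer formatting until a handler actually emits the record:

```python
import logging

# Eager: the f-string is built before logging.warning is called.
logging.warning(f"#queue-req: {17}")

# Lazy: interpolation happens only if a handler emits the record.
logging.warning("#queue-req: %d", 17)
```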
@@ -237,7 +237,7 @@ class ModelRunner:
        self.max_total_num_tokens = self.profile_max_num_token(total_gpu_memory)
        if max_total_tokens is not None:
            if max_total_tokens > self.max_total_num_tokens:
-                warnings.warn(
+                logging.warning(
                    f"max_total_tokens={max_total_tokens} is larger than the profiled value "
                    f"{self.max_total_num_tokens}. "
                    f"Use the profiled value instead."
......
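A hedged sketch of the clamping behavior the warning describes (the function name and the use of `min` are assumptions; only the comparison and the message are visible in the hunk): a user-supplied `max_total_tokens` above the profiled KV-cache capacity is ignored in favor of the profiled value.

```python
import logging

def resolve_max_total_tokens(user_value, profiled_value):
    """Sketch: never exceed the profiled token capacity."""
    if user_value is None:
        return profiled_value
    if user_value > profiled_value:
        logging.warning(
            f"max_total_tokens={user_value} is larger than the profiled "
            f"value {profiled_value}. Use the profiled value instead."
        )
    return min(user_value, profiled_value)
```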
@@ -17,10 +17,10 @@ limitations under the License.
import asyncio
import json
+import logging
import os
import time
import uuid
-import warnings
from http import HTTPStatus
from typing import Dict, List, Optional
@@ -65,6 +65,8 @@ from sglang.srt.openai_api.protocol import (
    UsageInfo,
)

+logger = logging.getLogger(__name__)
+
chat_template_name = None
@@ -408,7 +410,7 @@ def v1_generate_request(all_requests: List[CompletionRequest]):
                "Parallel sampling is not supported for completions from files"
            )

    if request.echo and request.logprobs:
-        warnings.warn(
+        logger.warning(
            "Echo is not compatible with logprobs. "
            "To compute logprobs of input prompt, please use SGLang /request API."
        )
......
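Note the two spellings used across this commit: the scheduler and model-runner hunks call `logging.warning(...)`, which goes through the root logger, while this file creates a module-level `logger = logging.getLogger(__name__)` and calls `logger.warning(...)`, which tags each record with the module's name. A minimal illustration:

```python
import logging

logging.basicConfig(format="%(name)s: %(levelname)s: %(message)s")

# Emitted with logger name "root".
logging.warning("via the root logger")

# Emitted with the named logger, so the module shows up in the record.
logging.getLogger("sglang.srt.openai_api").warning("via a module-level logger")
```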