Unverified Commit 2ce87935 authored by applesaucethebun, committed by GitHub

Add typo checker in pre-commit (#6179)


Co-authored-by: Brayden Zhong <b8zhong@uwaterloo.ca>
parent de167cf5
......@@ -33,6 +33,12 @@ repos:
rev: 24.10.0
hooks:
- id: black-jupyter
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
hooks:
- id: codespell
additional_dependencies: ['tomli']
args: ['--toml', 'python/pyproject.toml']
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v18.1.8
hooks:
......
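With this hook in place, a minimal sketch of how to exercise it locally, assuming pre-commit is installed from PyPI and the commands are run from the repository root:

```bash
# One-time setup: install pre-commit and register the git hooks.
pip install pre-commit
pre-commit install

# Run only the new codespell hook across the entire repository.
pre-commit run codespell --all-files
```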
......@@ -104,7 +104,7 @@ To maximize moe kernel efficiency, need to use below scripts to find out the bes
```bash
#Tuning
#for example, we have one case like this "python3 -m sglang.bench_latency --model dummy_grok1/ --load-format dummy --tokenizer-path Xenova/grok-1-tokenizer --tp 8 --batch-size 32 --input 1024 --output 8 --attention-backend triton --sampling-backend pytorch --quantization fp8" to run, it defined batch-size 32 input lenth 1024 and output length 8, from "--batch" in moe view point, the prefill batch is 32*1024 = 32768, the decode batch is 32*1(only one output token generated in each run).
#for example, we have one case like this "python3 -m sglang.bench_latency --model dummy_grok1/ --load-format dummy --tokenizer-path Xenova/grok-1-tokenizer --tp 8 --batch-size 32 --input 1024 --output 8 --attention-backend triton --sampling-backend pytorch --quantization fp8" to run, it defined batch-size 32 input length 1024 and output length 8, from "--batch" in moe view point, the prefill batch is 32*1024 = 32768, the decode batch is 32*1(only one output token generated in each run).
#so we can tune decode moe use below command
python benchmark_moe_rocm.py --model grok1 --tp-size 8 --dtype float8 --batch "32"
# and use this command to tune prefill moe
......
......@@ -267,7 +267,7 @@ async def get_requests(
try:
request = await asyncio.wait_for(
input_requests_queue.get(), timeout=300
) # Wait for 5 minites then abort
) # Wait for 5 minutes then abort
except Exception as e:
print(f"exception: {e}")
break
......@@ -514,7 +514,7 @@ async def benchmark(
print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
print(
"{:<40} {:<10}".format(
"Max reqeuest concurrency:",
"Max request concurrency:",
max_concurrency if max_concurrency else "not set",
)
)
......
......@@ -95,7 +95,7 @@ def bench_schema(args):
latency = time.time() - tic
# Check if the outputs are valid
indexs = []
indexes = []
for i, state in enumerate(states):
try:
schema = json.loads(arguments[i]["json_schema"])
......@@ -103,7 +103,7 @@ def bench_schema(args):
assert jsonschema.validate(obj, schema) is None
except Exception as e:
print(e)
indexs.append(i)
indexes.append(i)
return states, latency
......
......@@ -15,7 +15,7 @@ from tqdm import tqdm
def generate_lines(random_words, num_lines, redirect_ratio):
prefix = "Here is a list of lines, each with its corresponding REGISTER_CONTENT value. Please memorize them. Be prepared to provide the REGISTER_CONTENT value for a specific line index when I ask."
suffix = "The list has ended. Please give the final REGISTER_CONTENT value for a specific line after resovling the redirections and references. For example, the REGISTER_CONTENT of Line __idx0__ is __val0__. The REGISTER_CONTENT of Line __idx1__ is __val1__. The REGISTER_CONTENT of Line __idx2__ is __val2__. The REGISTER_CONTENT of Line ??? is"
suffix = "The list has ended. Please give the final REGISTER_CONTENT value for a specific line after resolving the redirections and references. For example, the REGISTER_CONTENT of Line __idx0__ is __val0__. The REGISTER_CONTENT of Line __idx1__ is __val1__. The REGISTER_CONTENT of Line __idx2__ is __val2__. The REGISTER_CONTENT of Line ??? is"
# Raw lines
visited_indices = set([None])
......
......@@ -17,7 +17,7 @@ ASSISTANT_SUFFIX = " </s><s>"
def multi_document_qa(docs, question, generate):
s = USER_PREFIX
s += "Pleaes answer a question according to given documents.\n"
s += "Please answer a question according to given documents.\n"
s += "Question:" + question + "Documents begin.\n"
s += "".join(docs)
......
......@@ -13,7 +13,7 @@ from sglang.utils import dump_state_text, read_jsonl
@sgl.function
def multi_document_qa(s, docs, question):
s += sgl.user_begin()
s += "Pleaes answer a question according to given documents.\n"
s += "Please answer a question according to given documents.\n"
s += "Question:" + question + "Documents begin.\n"
forks = s.fork(len(docs))
......
......@@ -6,7 +6,7 @@
"source": [
"# Tool and Function Calling\n",
"\n",
"This guide demonstrates how to use SGLang’s [Funcion calling](https://platform.openai.com/docs/guides/function-calling) functionality."
"This guide demonstrates how to use SGLang’s [Function calling](https://platform.openai.com/docs/guides/function-calling) functionality."
]
},
{
......@@ -399,7 +399,7 @@
" },\n",
"}\n",
"gen_response = requests.post(gen_url, json=gen_data).json()[\"text\"]\n",
"print_highlight(\"==== Reponse ====\")\n",
"print_highlight(\"==== Response ====\")\n",
"print(gen_response)\n",
"\n",
"# parse the response\n",
......
......@@ -275,7 +275,7 @@
"source": [
"## Structured Outputs (JSON, Regex, EBNF)\n",
"\n",
"For OpenAI compatible structed outputs API, refer to [Structured Outputs](https://docs.sglang.ai/backend/structured_outputs.html#OpenAI-Compatible-API) for more details.\n"
"For OpenAI compatible structured outputs API, refer to [Structured Outputs](https://docs.sglang.ai/backend/structured_outputs.html#OpenAI-Compatible-API) for more details.\n"
]
},
{
......
......@@ -40,7 +40,7 @@ The `/generate` endpoint accepts the following parameters in JSON format. For de
| Argument | Type/Default | Description |
|--------------------|------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
| frequency_penalty | `float = 0.0` | Penalizes tokens based on their frequency in generation so far. Must be between `-2` and `2` where negative numbers encourage repeatment of tokens and positive number encourages sampling of new tokens. The scaling of penalization grows linearly with each appearance of a token. |
| presence_penalty | `float = 0.0` | Penalizes tokens if they appeared in the generation so far. Must be between `-2` and `2` where negative numbers encourage repeatment of tokens and positive number encourages sampling of new tokens. The scaling of the penalization is constant if a token occured. |
| presence_penalty | `float = 0.0` | Penalizes tokens if they appeared in the generation so far. Must be between `-2` and `2` where negative numbers encourage repeatment of tokens and positive number encourages sampling of new tokens. The scaling of the penalization is constant if a token occurred. |
| min_new_tokens | `int = 0` | Forces the model to generate at least `min_new_tokens` until a stop word or EOS token is sampled. Note that this might lead to unintended behavior, for example, if the distribution is highly skewed towards these tokens. |
### Constrained decoding
......
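To illustrate the penalty parameters from the table above, here is a hedged sketch of a `/generate` request; the host and port (`localhost:30000`), the prompt, and the concrete values are assumptions, while the `text` + `sampling_params` payload shape follows the native API described in these docs:

```bash
# Hypothetical request; server address and parameter values are assumptions.
curl -s http://localhost:30000/generate \
  -H "Content-Type: application/json" \
  -d '{
        "text": "List three creative uses for a paperclip.",
        "sampling_params": {
          "max_new_tokens": 64,
          "min_new_tokens": 8,
          "frequency_penalty": 0.5,
          "presence_penalty": 0.3
        }
      }'
```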
......@@ -166,7 +166,7 @@
"source": [
"## Using Native Generation APIs\n",
"\n",
"You can also use the native `/generate` endpoint with requests, which provides more flexiblity. An API reference is available at [Sampling Parameters](sampling_params.md)."
"You can also use the native `/generate` endpoint with requests, which provides more flexibility. An API reference is available at [Sampling Parameters](sampling_params.md)."
]
},
{
......
......@@ -378,7 +378,7 @@
"\n",
" Args:\n",
" model_type (str): Type of model to parse reasoning from\n",
" stream_reasoning (bool): If Flase, accumulates reasoning content until complete.\n",
" stream_reasoning (bool): If False, accumulates reasoning content until complete.\n",
" If True, streams reasoning content as it arrives.\n",
" \"\"\"\n",
"\n",
......
......@@ -11,7 +11,7 @@
"\n",
"### Performance Highlights\n",
"\n",
"Please see below for the huge improvements on throughput for LLaMA-Instruct 3.1 8B tested on MT bench that can be archieved via EAGLE3 decoding.\n",
"Please see below for the huge improvements on throughput for LLaMA-Instruct 3.1 8B tested on MT bench that can be achieved via EAGLE3 decoding.\n",
"For further details please see the [EAGLE3 paper](https://arxiv.org/pdf/2503.01840).\n",
"\n",
"| Method | Throughput (tokens/s) |\n",
......@@ -296,7 +296,7 @@
"- EAGLE-2 additionally uses the draft model to evaluate how probable certain branches in the draft tree are, dynamically stopping the expansion of unlikely branches. After the expansion phase, reranking is employed to select only the top `speculative_num_draft_tokens` final nodes as draft tokens.\n",
"- EAGLE-3 removes the feature prediction objective, incorporates low and mid-layer features, and is trained in an on-policy manner.\n",
"\n",
"This enhances drafting accuracy by operating on the features instead of tokens for more regular inputs and passing the tokens from the next timestep additionaly to minimize randomness effects from sampling. Furthermore the dynamic adjustment of the draft tree and selection of reranked final nodes increases acceptance rate of draft tokens further. For more details see [EAGLE-2](https://arxiv.org/abs/2406.16858) and [EAGLE-3](https://arxiv.org/abs/2503.01840) paper.\n",
"This enhances drafting accuracy by operating on the features instead of tokens for more regular inputs and passing the tokens from the next timestep additionally to minimize randomness effects from sampling. Furthermore the dynamic adjustment of the draft tree and selection of reranked final nodes increases acceptance rate of draft tokens further. For more details see [EAGLE-2](https://arxiv.org/abs/2406.16858) and [EAGLE-3](https://arxiv.org/abs/2503.01840) paper.\n",
"\n",
"\n",
"For guidance how to train your own EAGLE model please see the [EAGLE repo](https://github.com/SafeAILab/EAGLE/tree/main?tab=readme-ov-file#train)."
......
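As a rough illustration of the EAGLE-3 settings discussed above, a hedged launch sketch follows; the flag spellings, draft model path, and numeric values are assumptions patterned on the `speculative_num_draft_tokens` parameter named in the text, so the server documentation should be consulted for the exact names:

```bash
# Hypothetical EAGLE-3 launch; flag names, paths, and values are assumptions.
python3 -m sglang.launch_server \
  --model-path meta-llama/Llama-3.1-8B-Instruct \
  --speculative-algorithm EAGLE3 \
  --speculative-draft-model-path <path-to-eagle3-draft-model> \
  --speculative-num-steps 5 \
  --speculative-eagle-topk 8 \
  --speculative-num-draft-tokens 32
```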
......@@ -52,7 +52,7 @@ docker run -itd --shm-size 32g --gpus all -v <volumes-to-mount> --ipc=host --net
docker exec -it sglang_dev /bin/zsh
```
Some useful volumes to mount are:
1. **Huggingface model cache**: mounting model cache can avoid re-download everytime docker restarts. Default location on Linux is `~/.cache/huggingface/`.
1. **Huggingface model cache**: mounting model cache can avoid re-download every time docker restarts. Default location on Linux is `~/.cache/huggingface/`.
2. **SGLang repository**: code changes in the SGLang local repository will be automatically synced to the .devcontainer.
Example 1: Monting local cache folder `/opt/dlami/nvme/.cache` but not the SGLang repo. Use this when you prefer to manually transfer local code changes to the devcontainer.
......
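A hypothetical sketch of the cache-only setup described in item 1; the host cache path, image name, and network flags are placeholders rather than the project's actual devcontainer command:

```bash
# Hypothetical: mount only the Hugging Face cache so models survive container restarts.
docker run -itd --shm-size 32g --gpus all --ipc=host --network=host \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --name sglang_dev <sglang-dev-image> /bin/zsh
docker exec -it sglang_dev /bin/zsh
```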
......@@ -29,7 +29,7 @@ Then follow https://github.com/sgl-project/sglang/settings/actions/runners/new?a
**Notes**
- Do not need to specify the runner group
- Give it a name (e.g., `test-sgl-gpu-0`) and some labels (e.g., `1-gpu-runner`). The labels can be editted later in Github Settings.
- Give it a name (e.g., `test-sgl-gpu-0`) and some labels (e.g., `1-gpu-runner`). The labels can be edited later in Github Settings.
- Do not need to change the work folder.
### Step 3: Run the runner by `run.sh`
......
......@@ -32,7 +32,7 @@ python -m sglang_router.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-
After the server is ready, you can directly send requests to the router as the same way as sending requests to each single worker.
Please adjust the batchsize accordingly to archieve maximum throughput.
Please adjust the batchsize accordingly to achieve maximum throughput.
```python
import requests
......
......@@ -375,7 +375,7 @@
"\n",
"When opening above experiment, we will see an overview of the experiment as shown below. The upper half shows a summary of the statistics on the left and charts to investigate the distribution and relationships of scores on the right. The lower half is a table with the individual traces which we can use to debug individual samples.\n",
"\n",
"When looking at the statistics, we can see that the accuracy of our RAG pipeline is 22% as measured by `answer_matches_target_llm_grader`. Though when checking the quality of our retrieval step (`context_query_relevancy`), we can see that our retrival step is fetching relevant information in only 27% of all samples. As shown in the GIF, we investigate the relationship between the two and see the two scores have 95% agreement. This confirms that the retrieval step is a major bottleneck for our RAG pipeline. So, now it's your turn to improve the retrieval step!\n",
"When looking at the statistics, we can see that the accuracy of our RAG pipeline is 22% as measured by `answer_matches_target_llm_grader`. Though when checking the quality of our retrieval step (`context_query_relevancy`), we can see that our retrieval step is fetching relevant information in only 27% of all samples. As shown in the GIF, we investigate the relationship between the two and see the two scores have 95% agreement. This confirms that the retrieval step is a major bottleneck for our RAG pipeline. So, now it's your turn to improve the retrieval step!\n",
"\n",
"Note, above link isn't publicly accessible but the experiment can be accessed through [here](https://app.parea.ai/public-experiments/parea/rag_sglang/30f0244a-d56c-44ff-bdfb-8f47626304b6).\n",
"\n",
......
......@@ -147,3 +147,7 @@ exclude = [
"scripts*",
"tests*",
]
[tool.codespell]
ignore-words-list = "ans, als, hel, boostrap, childs, te, vas, hsa, ment"
skip = "*.json,*.jsonl,*.patch,*.txt"
......@@ -315,7 +315,7 @@ def throughput_test(
tokenizer_id = server_args.tokenizer_path or server_args.model_path
tokenizer = get_tokenizer(tokenizer_id)
# Set global environmnets
# Set global environments
set_ulimit()
random.seed(bench_args.seed)
np.random.seed(bench_args.seed)
......
......@@ -1263,7 +1263,7 @@ async def benchmark(
print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
print(
"{:<40} {:<10}".format(
"Max reqeuest concurrency:",
"Max request concurrency:",
max_concurrency if max_concurrency else "not set",
)
)
......