Unverified commit a5e0defb, authored by Xuehai Pan and committed by GitHub

minor: Add basic editorconfig and pre-commit hooks to enforce style for whitespaces (#1926)

parent 96766101
@@ -3,9 +3,9 @@
This page lists some common errors and tips for fixing them.
## CUDA out of memory
-If you see out of memory (OOM) errors, you can try to tune the following parameters.
-If OOM happens during prefill, try to decrease `--chunked-prefill-size` to `4096` or `2048`.
-If OOM happens during decoding, try to decrease `--max-running-requests`.
+If you see out of memory (OOM) errors, you can try to tune the following parameters.
+If OOM happens during prefill, try to decrease `--chunked-prefill-size` to `4096` or `2048`.
+If OOM happens during decoding, try to decrease `--max-running-requests`.
You can also try to decrease `--mem-fraction-static`, which reduces the memory usage of the KV cache memory pool and helps both prefill and decoding.
## CUDA error: an illegal memory access was encountered
......
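For quick reference, the flags mentioned in the troubleshooting hunk above are passed on the server launch command line. A minimal sketch, assuming a locally available model (the model path and the numeric values are illustrative placeholders, not recommendations):

```bash
# Sketch only: tune the values for your GPU and workload; the model path is a placeholder.
python -m sglang.launch_server \
  --model-path <your-model-path> \
  --chunked-prefill-size 4096 \
  --max-running-requests 32 \
  --mem-fraction-static 0.8
```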
@@ -14,4 +14,4 @@ sphinx-book-theme
sphinx-copybutton
sphinx-tabs
sphinxcontrib-mermaid
-urllib3<2.0.0
\ No newline at end of file
+urllib3<2.0.0
@@ -33,7 +33,7 @@ CUR_NODES_IDX=$2
VIDEO_DIR=$3
-MODEL_PATH=$4
+MODEL_PATH=$4
NUM_FRAMES=$5
@@ -73,16 +73,16 @@ for IDX in $(seq 1 $LOCAL_CHUNKS); do
(
START=$(((IDX-1) * GPUS_PER_CHUNK))
LENGTH=$GPUS_PER_CHUNK # Length for slicing, not the end index
CHUNK_GPUS=(${GPULIST[@]:$START:$LENGTH})
# Convert the chunk GPUs array to a comma-separated string
CHUNK_GPUS_STR=$(IFS=,; echo "${CHUNK_GPUS[*]}")
LOCAL_IDX=$((CUR_NODES_IDX * LOCAL_CHUNKS + IDX))
echo "Chunk $(($LOCAL_IDX - 1)) will run on GPUs $CHUNK_GPUS_STR"
# Pick the port for this chunk (randomized to reduce the chance of collisions between chunks).
PORT=$((10000 + RANDOM % 55536))
@@ -92,7 +92,7 @@ for IDX in $(seq 1 $LOCAL_CHUNKS); do
while [ $RETRY_COUNT -lt $MAX_RETRIES ] && [ $COMMAND_STATUS -ne 0 ]; do
echo "Running chunk $(($LOCAL_IDX - 1)) on GPUs $CHUNK_GPUS_STR with port $PORT. Attempt $(($RETRY_COUNT + 1))"
#!/bin/bash
CUDA_VISIBLE_DEVICES=$CHUNK_GPUS_STR python3 srt_example_llava_v.py \
--port $PORT \
@@ -102,10 +102,10 @@ for IDX in $(seq 1 $LOCAL_CHUNKS); do
--video-dir $VIDEO_DIR \
--model-path $MODEL_PATH \
--num-frames $NUM_FRAMES #&
wait $! # Wait for the process to finish and capture its exit status
COMMAND_STATUS=$?
if [ $COMMAND_STATUS -ne 0 ]; then
echo "Execution failed for chunk $(($LOCAL_IDX - 1)), attempt $(($RETRY_COUNT + 1)). Retrying..."
RETRY_COUNT=$(($RETRY_COUNT + 1))
@@ -124,8 +124,8 @@ done
wait
-cat work_dirs/llava_next_video_inference_results/final_results_chunk_*.csv > work_dirs/llava_next_video_inference_results/final_results_node_${CUR_NODES_IDX}.csv
+cat work_dirs/llava_next_video_inference_results/final_results_chunk_*.csv > work_dirs/llava_next_video_inference_results/final_results_node_${CUR_NODES_IDX}.csv
END_TIME=$(date +%s) # Capture end time
ELAPSED_TIME=$(($END_TIME - $START_TIME))
echo "Total execution time: $ELAPSED_TIME seconds."
\ No newline at end of file
echo "Total execution time: $ELAPSED_TIME seconds."
@@ -4,8 +4,8 @@ Usage:
Show in "assistant" the desired answer format. Each "gen" term should have a stop token.
The stream mode is not supported in speculative execution.
-E.g.
-correct:
+E.g.
+correct:
sgl.assistant("\nName:" + sgl.gen("name", stop="\n") + "\nBirthday:" + sgl.gen("birthday", stop="\n") + "\nJob:" + sgl.gen("job", stop="\n"))
incorrect:
s += sgl.assistant("\nName:" + sgl.gen("name", stop="\n"))
......
@@ -7,4 +7,4 @@ RUN git clone https://github.com/sgl-project/sglang.git
WORKDIR /opt/sglang
RUN pip install --upgrade pip && \
pip install -e "python[all]" && \
-pip install datasets
\ No newline at end of file
+pip install datasets
@@ -32,4 +32,4 @@ curl -X POST http://localhost:8000/v2/models/character_generation/generate \
"INPUT_TEXT": ["harry"]
}'
-```
\ No newline at end of file
+```
@@ -21,7 +21,7 @@ def main():
# Tokenize inputs
tokenizer = get_tokenizer(MODEL_PATH)
token_ids_list = [tokenizer.encode(prompt) for prompt in prompts]
# Create an LLM.
# You can also specify `skip_tokenizer_init=True`, but it requires explicit detokenization at the end
llm = sgl.Engine(model_path=MODEL_PATH)
@@ -36,4 +36,4 @@ def main():
# The __main__ guard is necessary here because we use "spawn" to create subprocesses.
# Spawn starts a fresh program each time; without the guard, it would loop forever, spawning new processes from sgl.Engine.
if __name__ == "__main__":
-main()
\ No newline at end of file
+main()
@@ -37,4 +37,4 @@ curl -X POST http://localhost:8000/generate -H "Content-Type: application/json"
curl -X POST http://localhost:8000/generate_stream -H "Content-Type: application/json" -d '{"prompt": "The Transformer architecture is..."}' --no-buffer
```
-This will send both non-streaming and streaming requests to the server.
\ No newline at end of file
+This will send both non-streaming and streaming requests to the server.
@@ -3,7 +3,7 @@ Usage:
# Installing latest llava-next: pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
# Installing latest sglang.
-# Endpoint Service CLI:
+# Endpoint Service CLI:
python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000
python3 http_llama3_llava_test.py
......
@@ -3,7 +3,7 @@ Usage:
# Installing latest llava-next: pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
# Installing latest sglang.
-# Endpoint Service CLI:
+# Endpoint Service CLI:
python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8
python3 http_qwen_llava_test.py
......
@@ -134,4 +134,4 @@ def method_has_implemented_embedding(
class_embedding = inspect.getattr_static(method_class, "embedding", None)
return (class_embedding is not None
-and class_embedding is not base_embedding)
\ No newline at end of file
+and class_embedding is not base_embedding)
@@ -311,7 +311,7 @@ class VocabParallelEmbedding(torch.nn.Module):
def get_sharded_to_full_mapping(self) -> Optional[List[int]]:
"""Get a mapping that can be used to reindex the gathered
logits for sampling.
During sampling, we gather logits from all ranks. The relationship
of index->token_id will follow the same format as outlined in the class
docstring. However, after the gather, we want to reindex the final
@@ -483,4 +483,4 @@ class ParallelLMHead(VocabParallelEmbedding):
def forward(self, input_):
del input_
-raise RuntimeError("LMHead's weights should be used in the sampler.")
\ No newline at end of file
+raise RuntimeError("LMHead's weights should be used in the sampler.")
@@ -838,7 +838,7 @@ class Scheduler:
time_per_output_tokens_iter: List[float] = []
# Request stats
-# Decode
+# Decode
gen_throughput: float = 0.0
# Latency
time_e2e_requests: List[float] = []
@@ -866,11 +866,11 @@
time_waiting_requests.append(req.queued_time - req.created_time)
num_prompt_tokens_requests.append(len(req.origin_input_ids))
num_generation_tokens_requests.append(len(req.output_ids))
-finished_reason_requests.append(
+finished_reason_requests.append(
req.finished_reason.to_json()
if req.finished_reason is not None
else None)
return Stats(
new_seq=new_seq,
num_running_req=num_running_req,
......
@@ -384,7 +384,7 @@ class TokenizerManager:
obj.load_format = self.server_args.load_format
if not self.model_update_lock.locked():
async with self.model_update_lock:
# wait for the previous generation requests to finish
while len(self.rid_to_state) > 0:
......
@@ -151,7 +151,7 @@ class Metrics:
0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.04, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
1.0, 2.5
])
# Request Stats
# Metadata
self.num_prompt_tokens_requests = Histogram(
@@ -253,7 +253,7 @@ class PrometheusMetricsCollector(MetricsCollector):
stats.time_to_first_tokens_iter)
self._log_histogram(self.metrics.histogram_time_per_output_token,
stats.time_per_output_tokens_iter)
# self._log_gauge(self.metrics.gpu_cache_usage_sys, stats.gpu_cache_usage_sys)
self._log_gauge(self.metrics.num_running_sys, stats.num_running_req)
self._log_gauge(self.metrics.num_waiting_sys, stats.num_waiting_req)
@@ -294,4 +294,4 @@ def build_1_2_5_buckets(max_value: int) -> List[int]:
buckets.append(value)
else:
return buckets
-exponent += 1
\ No newline at end of file
+exponent += 1
@@ -54,4 +54,4 @@ class Stats:
num_prompt_tokens_iter: int = 0
num_generation_tokens_iter: int = 0
time_to_first_tokens_iter: List[float] = field(default_factory=list)
-time_per_output_tokens_iter: List[float] = field(default_factory=list)
\ No newline at end of file
+time_per_output_tokens_iter: List[float] = field(default_factory=list)
@@ -17,7 +17,7 @@ limitations under the License.
"""
Utilities for multi-modal models.
-This python file mainly contains utilities that were used in the
+This python file mainly contains utilities that were used in the
image processing logic of llava-next including operations such as
anyres and anyres_max
......
@@ -136,7 +136,7 @@ class GPT2Block(nn.Module):
layer_id: int,
config: GPT2Config,
cache_config = None,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
):
@@ -284,4 +284,4 @@ class GPT2LMHeadModel(nn.Module):
default_weight_loader)
weight_loader(param, loaded_weight)
-EntryClass = GPT2LMHeadModel
\ No newline at end of file
+EntryClass = GPT2LMHeadModel
File mode changed from 100755 to 100644
@@ -57,27 +57,27 @@ logger = init_logger(__name__)
class Qwen2VLImageInputs(TypedDict):
pixel_values: torch.Tensor
"""Shape:
"""Shape:
`(num_patches, num_channels * patch_size * patch_size)`
"""
image_grid_thw: torch.Tensor
"""Shape: `(num_images, 3)`
This should be in `(grid_t, grid_h, grid_w)` format.
"""
class Qwen2VLVideoInputs(TypedDict):
pixel_values_videos: torch.Tensor
"""Shape:
`(num_patches,
"""Shape:
`(num_patches,
num_channels * temporal_patch_size * patch_size * patch_size)`
"""
video_grid_thw: torch.Tensor
"""Shape: `(num_videos, 3)`
This should be in `(grid_t, grid_h, grid_w)` format.
"""
......