Unverified commit a5e0defb, authored by Xuehai Pan and committed by GitHub

minor: Add basic editorconfig and pre-commit hooks to enforce style for whitespaces (#1926)

parent 96766101
...@@ -3,9 +3,9 @@
This page lists some common errors and tips for fixing them.
## CUDA out of memory
If you see out of memory (OOM) errors, you can try to tune the following parameters.
If OOM happens during prefill, try to decrease `--chunked-prefill-size` to `4096` or `2048`.
If OOM happens during decoding, try to decrease `--max-running-requests`.
You can also try to decrease `--mem-fraction-static`, which reduces the memory usage of the KV cache memory pool and helps both prefill and decoding.
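These are all launch flags of `sglang.launch_server`. As a hedged illustration (the model path and the exact values below are placeholders to tune for your hardware, not values prescribed by this page), a memory-constrained launch might combine them like this:

```bash
# Illustrative only: model path and numbers are placeholders to adjust per GPU.
# --chunked-prefill-size : lower it if OOM happens during prefill
# --max-running-requests : lower it if OOM happens during decoding
# --mem-fraction-static  : lower it to shrink the KV cache memory pool
python -m sglang.launch_server \
  --model-path meta-llama/Llama-3.1-8B-Instruct \
  --chunked-prefill-size 2048 \
  --max-running-requests 32 \
  --mem-fraction-static 0.8
```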
## CUDA error: an illegal memory access was encountered
......
...@@ -14,4 +14,4 @@ sphinx-book-theme
sphinx-copybutton
sphinx-tabs
sphinxcontrib-mermaid
urllib3<2.0.0
\ No newline at end of file
...@@ -33,7 +33,7 @@ CUR_NODES_IDX=$2
VIDEO_DIR=$3
MODEL_PATH=$4
NUM_FRAMES=$5
...@@ -73,16 +73,16 @@ for IDX in $(seq 1 $LOCAL_CHUNKS); do
(
START=$(((IDX-1) * GPUS_PER_CHUNK))
LENGTH=$GPUS_PER_CHUNK # Length for slicing, not the end index
CHUNK_GPUS=(${GPULIST[@]:$START:$LENGTH})
# Convert the chunk GPUs array to a comma-separated string
CHUNK_GPUS_STR=$(IFS=,; echo "${CHUNK_GPUS[*]}")
LOCAL_IDX=$((CUR_NODES_IDX * LOCAL_CHUNKS + IDX))
echo "Chunk $(($LOCAL_IDX - 1)) will run on GPUs $CHUNK_GPUS_STR"
# Calculate the port for this chunk. Ensure it's incremented by 5 for each chunk.
PORT=$((10000 + RANDOM % 55536))
...@@ -92,7 +92,7 @@ for IDX in $(seq 1 $LOCAL_CHUNKS); do
while [ $RETRY_COUNT -lt $MAX_RETRIES ] && [ $COMMAND_STATUS -ne 0 ]; do
echo "Running chunk $(($LOCAL_IDX - 1)) on GPUs $CHUNK_GPUS_STR with port $PORT. Attempt $(($RETRY_COUNT + 1))"
#!/bin/bash
CUDA_VISIBLE_DEVICES=$CHUNK_GPUS_STR python3 srt_example_llava_v.py \
--port $PORT \
...@@ -102,10 +102,10 @@ for IDX in $(seq 1 $LOCAL_CHUNKS); do
--video-dir $VIDEO_DIR \
--model-path $MODEL_PATH \
--num-frames $NUM_FRAMES #&
wait $! # Wait for the process to finish and capture its exit status
COMMAND_STATUS=$?
if [ $COMMAND_STATUS -ne 0 ]; then
echo "Execution failed for chunk $(($LOCAL_IDX - 1)), attempt $(($RETRY_COUNT + 1)). Retrying..."
RETRY_COUNT=$(($RETRY_COUNT + 1))
...@@ -124,8 +124,8 @@ done
wait
cat work_dirs/llava_next_video_inference_results/final_results_chunk_*.csv > work_dirs/llava_next_video_inference_results/final_results_node_${CUR_NODES_IDX}.csv
END_TIME=$(date +%s) # Capture end time
ELAPSED_TIME=$(($END_TIME - $START_TIME))
echo "Total execution time: $ELAPSED_TIME seconds."
\ No newline at end of file
...@@ -4,8 +4,8 @@ Usage:
Show in "assistant" the desired answer format. Each "gen" term should have a stop token.
The stream mode is not supported in speculative execution.
E.g.
correct:
sgl.assistant("\nName:" + sgl.gen("name", stop="\n") + "\nBirthday:" + sgl.gen("birthday", stop="\n") + "\nJob:" + sgl.gen("job", stop="\n"))
incorrect:
s += sgl.assistant("\nName:" + sgl.gen("name", stop="\n"))
......
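For context, here is a minimal sketch of the "correct" pattern from the docstring above, wrapped in a complete `@sgl.function`. The function name and the user prompt are illustrative assumptions; only the single `sgl.assistant` call with a stop token on every `gen` comes from the docstring.

```python
import sglang as sgl

@sgl.function
def character_info(s, name):
    # Hypothetical wrapper; only the assistant/gen structure is from the docstring.
    s += sgl.user(f"Tell me the name, birthday and job of {name}.")
    # One assistant segment; every gen() carries an explicit stop token.
    s += sgl.assistant(
        "\nName:" + sgl.gen("name", stop="\n")
        + "\nBirthday:" + sgl.gen("birthday", stop="\n")
        + "\nJob:" + sgl.gen("job", stop="\n")
    )
```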
...@@ -7,4 +7,4 @@ RUN git clone https://github.com/sgl-project/sglang.git
WORKDIR /opt/sglang
RUN pip install --upgrade pip && \
pip install -e "python[all]" && \
pip install datasets
\ No newline at end of file
...@@ -32,4 +32,4 @@ curl -X POST http://localhost:8000/v2/models/character_generation/generate \
"INPUT_TEXT": ["harry"]
}'
```
\ No newline at end of file
...@@ -21,7 +21,7 @@ def main():
# Tokenize inputs
tokenizer = get_tokenizer(MODEL_PATH)
token_ids_list = [tokenizer.encode(prompt) for prompt in prompts]
# Create an LLM.
# You can also specify `skip_tokenizer_init=True`, but it requires explicit detokenization at the end
llm = sgl.Engine(model_path=MODEL_PATH)
...@@ -36,4 +36,4 @@ def main():
# The __main__ condition is necessary here because we use "spawn" to create subprocesses
# Spawn starts a fresh program every time, if there is no __main__, it will run into infinite loop to keep spawning processes from sgl.Engine
if __name__ == "__main__":
main()
\ No newline at end of file
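A hedged sketch of how the `skip_tokenizer_init=True` option mentioned in the comment above could be used end to end. The import path, the output schema, and the model path are assumptions about this sglang version, not lines taken from the diff.

```python
import sglang as sgl
from sglang.srt.hf_transformers_utils import get_tokenizer  # import path is an assumption

MODEL_PATH = "meta-llama/Llama-3.1-8B-Instruct"  # placeholder model

def main():
    prompts = ["The capital of France is", "The future of AI is"]
    tokenizer = get_tokenizer(MODEL_PATH)
    token_ids_list = [tokenizer.encode(p) for p in prompts]

    # With skip_tokenizer_init=True the engine consumes token ids directly,
    # so the outputs must be detokenized explicitly afterwards.
    llm = sgl.Engine(model_path=MODEL_PATH, skip_tokenizer_init=True)
    outputs = llm.generate(input_ids=token_ids_list, sampling_params={"temperature": 0.8})
    for out in outputs:
        # The output key is an assumption; adjust it to your sglang version.
        print(tokenizer.decode(out["token_ids"]))

# The __main__ guard is required because "spawn" re-imports this module in each
# child process; without it, every child would create another Engine.
if __name__ == "__main__":
    main()
```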
...@@ -37,4 +37,4 @@ curl -X POST http://localhost:8000/generate -H "Content-Type: application/json"
curl -X POST http://localhost:8000/generate_stream -H "Content-Type: application/json" -d '{"prompt": "The Transformer architecture is..."}' --no-buffer
```
This will send both non-streaming and streaming requests to the server.
\ No newline at end of file
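Equivalently, the same two requests can be issued from Python. This is a hedged sketch using `requests`; the endpoints and prompt come from the commands above, while the response handling is an assumption about the server's output format.

```python
import requests

BASE = "http://localhost:8000"
payload = {"prompt": "The Transformer architecture is..."}

# Non-streaming request: one JSON response.
resp = requests.post(f"{BASE}/generate", json=payload)
print(resp.json())

# Streaming request: consume the body incrementally instead of buffering it.
with requests.post(f"{BASE}/generate_stream", json=payload, stream=True) as stream:
    for line in stream.iter_lines():
        if line:
            print(line.decode())
```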
...@@ -3,7 +3,7 @@ Usage:
# Installing latest llava-next: pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
# Installing latest sglang.
# Endpoint Service CLI:
python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000
python3 http_llama3_llava_test.py
......
...@@ -3,7 +3,7 @@ Usage:
# Installing latest llava-next: pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
# Installing latest sglang.
# Endpoint Service CLI:
python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8
python3 http_qwen_llava_test.py
......
...@@ -134,4 +134,4 @@ def method_has_implemented_embedding(
class_embedding = inspect.getattr_static(method_class, "embedding", None)
return (class_embedding is not None
and class_embedding is not base_embedding)
\ No newline at end of file
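The fragment above relies on `inspect.getattr_static` to decide whether a quantization method class really overrides `embedding`. A small self-contained sketch of that idiom, with hypothetical class names:

```python
import inspect

class Base:
    def embedding(self, x):
        raise NotImplementedError

class WithEmbedding(Base):
    def embedding(self, x):
        return x  # overridden

class WithoutEmbedding(Base):
    pass  # only inherits Base.embedding

def has_own_embedding(cls, base=Base):
    # getattr_static retrieves the attribute without invoking descriptors,
    # so an inherited method and an overridden one can be told apart by identity.
    base_embedding = inspect.getattr_static(base, "embedding", None)
    class_embedding = inspect.getattr_static(cls, "embedding", None)
    return class_embedding is not None and class_embedding is not base_embedding

print(has_own_embedding(WithEmbedding))     # True
print(has_own_embedding(WithoutEmbedding))  # False
```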
...@@ -311,7 +311,7 @@ class VocabParallelEmbedding(torch.nn.Module):
def get_sharded_to_full_mapping(self) -> Optional[List[int]]:
"""Get a mapping that can be used to reindex the gathered
logits for sampling.
During sampling, we gather logits from all ranks. The relationship
of index->token_id will follow the same format as outlined in the class
docstring. However, after the gather, we want to reindex the final
...@@ -483,4 +483,4 @@ class ParallelLMHead(VocabParallelEmbedding):
def forward(self, input_):
del input_
raise RuntimeError("LMHead's weights should be used in the sampler.")
\ No newline at end of file
...@@ -838,7 +838,7 @@ class Scheduler:
time_per_output_tokens_iter: List[float] = []
# Request stats
# Decode
gen_throughput: float = 0.0
# Latency
time_e2e_requests: List[float] = []
...@@ -866,11 +866,11 @@ class Scheduler:
time_waiting_requests.append(req.queued_time - req.created_time)
num_prompt_tokens_requests.append(len(req.origin_input_ids))
num_generation_tokens_requests.append(len(req.output_ids))
finished_reason_requests.append(
req.finished_reason.to_json()
if req.finished_reason is not None
else None)
return Stats(
new_seq=new_seq,
num_running_req=num_running_req,
......
...@@ -384,7 +384,7 @@ class TokenizerManager:
obj.load_format = self.server_args.load_format
if not self.model_update_lock.locked():
async with self.model_update_lock:
# wait for the previous generation requests to finish
while len(self.rid_to_state) > 0:
......
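The hunk above guards a weight update: if no update is already running, it takes `model_update_lock` and then waits until every in-flight request in `rid_to_state` has finished. A hedged, self-contained illustration of that pattern (not the actual TokenizerManager code):

```python
import asyncio

class MiniManager:
    def __init__(self):
        self.model_update_lock = asyncio.Lock()
        self.rid_to_state = {}  # request id -> per-request state

    async def update_weights(self, load_weights):
        if self.model_update_lock.locked():
            return False  # another update is already in progress
        async with self.model_update_lock:
            # wait for the previous generation requests to finish
            while len(self.rid_to_state) > 0:
                await asyncio.sleep(0.001)
            await load_weights()  # swap weights only once the queue has drained
            return True
```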
...@@ -151,7 +151,7 @@ class Metrics:
0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.04, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
1.0, 2.5
])
# Request Stats
# Metadata
self.num_prompt_tokens_requests = Histogram(
...@@ -253,7 +253,7 @@ class PrometheusMetricsCollector(MetricsCollector):
stats.time_to_first_tokens_iter)
self._log_histogram(self.metrics.histogram_time_per_output_token,
stats.time_per_output_tokens_iter)
# self._log_gauge(self.metrics.gpu_cache_usage_sys, stats.gpu_cache_usage_sys)
self._log_gauge(self.metrics.num_running_sys, stats.num_running_req)
self._log_gauge(self.metrics.num_waiting_sys, stats.num_waiting_req)
...@@ -294,4 +294,4 @@ def build_1_2_5_buckets(max_value: int) -> List[int]:
buckets.append(value)
else:
return buckets
exponent += 1
\ No newline at end of file
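Only the tail of `build_1_2_5_buckets` appears in the hunk above. A hedged reconstruction of the whole helper, which emits 1-2-5 spaced histogram bucket boundaries up to `max_value`, would look roughly like this:

```python
from typing import List

def build_1_2_5_buckets(max_value: int) -> List[int]:
    mantissa_lst = [1, 2, 5]
    exponent = 0
    buckets: List[int] = []
    while True:
        for m in mantissa_lst:
            value = m * 10**exponent
            if value <= max_value:
                buckets.append(value)
            else:
                return buckets
        exponent += 1

# Example: build_1_2_5_buckets(100) -> [1, 2, 5, 10, 20, 50, 100]
```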
...@@ -54,4 +54,4 @@ class Stats:
num_prompt_tokens_iter: int = 0
num_generation_tokens_iter: int = 0
time_to_first_tokens_iter: List[float] = field(default_factory=list)
time_per_output_tokens_iter: List[float] = field(default_factory=list)
\ No newline at end of file
...@@ -17,7 +17,7 @@ limitations under the License.
"""
Utilities for multi-modal models.
This python file mainly contains utilities that were used in the
image processing logic of llava-next including operations such as
anyres and anyres_max
......
...@@ -136,7 +136,7 @@ class GPT2Block(nn.Module):
layer_id: int,
config: GPT2Config,
cache_config = None,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
):
...@@ -284,4 +284,4 @@ class GPT2LMHeadModel(nn.Module):
default_weight_loader)
weight_loader(param, loaded_weight)
EntryClass = GPT2LMHeadModel
\ No newline at end of file
File mode changed from 100755 to 100644
...@@ -57,27 +57,27 @@ logger = init_logger(__name__)
class Qwen2VLImageInputs(TypedDict):
pixel_values: torch.Tensor
"""Shape:
`(num_patches, num_channels * patch_size * patch_size)`
"""
image_grid_thw: torch.Tensor
"""Shape: `(num_images, 3)`
This should be in `(grid_t, grid_h, grid_w)` format.
"""
class Qwen2VLVideoInputs(TypedDict):
pixel_values_videos: torch.Tensor
"""Shape:
`(num_patches,
num_channels * temporal_patch_size * patch_size * patch_size)`
"""
video_grid_thw: torch.Tensor
"""Shape: `(num_videos, 3)`
This should be in `(grid_t, grid_h, grid_w)` format.
"""
......
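To make the documented shapes concrete, here is a hedged example of tensors that satisfy them; the `patch_size`, `temporal_patch_size`, and grid values are illustrative assumptions, not read from the Qwen2-VL config.

```python
import torch

patch_size = 14            # assumed vision patch size
temporal_patch_size = 2    # assumed temporal patch size (unused for still images)
num_channels = 3

# One image split into a 1 x 16 x 16 grid of patches.
image_grid_thw = torch.tensor([[1, 16, 16]])  # (num_images, 3) as (grid_t, grid_h, grid_w)
num_patches = int(image_grid_thw.prod(dim=-1).sum())
pixel_values = torch.randn(num_patches, num_channels * patch_size * patch_size)

image_inputs = {"pixel_values": pixel_values, "image_grid_thw": image_grid_thw}
print(pixel_values.shape)  # torch.Size([256, 588])
```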