Improve doc strings (#518)

f6dbd240 · Lianmin Zheng · e8a2327d · f6dbd240 · f6dbd240 · f6dbd240
Commit f6dbd240 authored Jun 08, 2024 by Lianmin Zheng
20 changed files
--- a/README.md
+++ b/README.md
@@ -10,8 +10,8 @@ SGLang is a structured generation language designed for large language models (L
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.

 The core features include:
- **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction.
- **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatically reusing the KV cache across multiple calls. It can also be used as a standalone serving engine with all common techniques implemented, such as continuous batching and tensor parallelism.
+- **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).

 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -403,10 +403,10 @@ https://github.com/sgl-project/sglang/issues/157

 ## Citation And Acknowledgment
 ```
-@misc{zheng2023efficiently,
-      title={Efficiently Programming Large Language Models using SGLang},
-      author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Jeff Huang and Chuyue Sun and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
-      year={2023},
+@misc{zheng2024sglang,
+      title={SGLang: Efficient Execution of Structured Language Model Programs},
+      author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
+      year={2024},
      eprint={2312.07104},
      archivePrefix={arXiv},
      primaryClass={cs.AI}

--- a/python/sglang/api.py
+++ b/python/sglang/api.py
-"""Some Public API Definitions"""
+"""Public APIs of the language."""

 import os
 import re

--- a/python/sglang/launch_server.py
+++ b/python/sglang/launch_server.py
+"""Launch the inference server."""
+
 import argparse

 from sglang.srt.server import ServerArgs, launch_server
@@ -8,4 +10,4 @@ if __name__ == "__main__":
    args = parser.parse_args()
    server_args = ServerArgs.from_cli_args(args)

-    launch_server(server_args, None)
+    launch_server(server_args, None)
\ No newline at end of file
--- a/python/sglang/launch_server_llavavid.py
+++ b/python/sglang/launch_server_llavavid.py
+"""Launch the inference server for the Llava-video model."""
 import argparse
 import multiprocessing as mp


--- a/python/sglang/srt/constrained/fsm_cache.py
+++ b/python/sglang/srt/constrained/fsm_cache.py
+"""Cache for the compressed finite state machine."""
 from sglang.srt.constrained import RegexFSM, TransformerTokenizer
 from sglang.srt.constrained.base_cache import BaseCache


--- a/python/sglang/srt/constrained/jump_forward.py
+++ b/python/sglang/srt/constrained/jump_forward.py
+"""
+Faster constrained decoding.
+Reference: https://lmsys.org/blog/2024-02-05-compressed-fsm/
+"""
 import interegular

 from sglang.srt.constrained import FSMInfo, disk_cache, make_deterministic_fsm

--- a/python/sglang/srt/conversation.py
+++ b/python/sglang/srt/conversation.py
+"""Conversation templates."""
 # Adapted from
 # https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
 import dataclasses

--- a/python/sglang/srt/flush_cache.py
+++ b/python/sglang/srt/flush_cache.py
 """
+Flush the KV cache.
+
 Usage:
 python3 -m sglang.srt.flush_cache --url http://localhost:30000
 """

--- a/python/sglang/srt/layers/logits_processor.py
+++ b/python/sglang/srt/layers/logits_processor.py
+"""Logits processing."""
 import torch
 from torch import nn
 from vllm.distributed import (

--- a/python/sglang/srt/layers/radix_attention.py
+++ b/python/sglang/srt/layers/radix_attention.py
+"""Radix attention."""
 import torch
 import numpy as np
 from torch import nn

--- a/python/sglang/srt/managers/controller/dp_worker.py
+++ b/python/sglang/srt/managers/controller/dp_worker.py
 """A data parallel worker thread."""
+
 import asyncio
 import logging
 import queue

--- a/python/sglang/srt/managers/controller/infer_batch.py
+++ b/python/sglang/srt/managers/controller/infer_batch.py
 """Meta data for requests and batches"""
+
 from dataclasses import dataclass
 from enum import IntEnum, auto
 from typing import List

--- a/python/sglang/srt/managers/controller/model_runner.py
+++ b/python/sglang/srt/managers/controller/model_runner.py
+"""ModelRunner runs the forward passes of the models."""
 import importlib
 import importlib.resources
 import logging

--- a/python/sglang/srt/managers/controller/radix_cache.py
+++ b/python/sglang/srt/managers/controller/radix_cache.py
+"""
+The radix tree data structure for managing the KV cache.
+"""
 import heapq
 import time
 from collections import defaultdict

--- a/python/sglang/srt/managers/controller/schedule_heuristic.py
+++ b/python/sglang/srt/managers/controller/schedule_heuristic.py
+"""Request scheduler heuristic."""
 import random
 from collections import defaultdict


--- a/python/sglang/srt/managers/controller/tp_worker.py
+++ b/python/sglang/srt/managers/controller/tp_worker.py
+"""A tensor parallel worker."""
+
 import asyncio
 import logging
 import time

--- a/python/sglang/srt/managers/detokenizer_manager.py
+++ b/python/sglang/srt/managers/detokenizer_manager.py
+"""DetokenizerManager is a process that detokenizes the token ids."""
 import asyncio
 import inspect


--- a/python/sglang/srt/managers/io_struct.py
+++ b/python/sglang/srt/managers/io_struct.py
+"""
+The definition of objects transferred between different
+processes (TokenizerManager, DetokenizerManager, Controller).
+"""
+
 import uuid
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Union

--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
+"""TokenizerManager is a process that tokenizes the text."""
 import asyncio
 import concurrent.futures
 import dataclasses
@@ -283,7 +284,7 @@ class TokenizerManager:
        req = AbortReq(rid)
        self.send_to_router.send_pyobj(req)

-    def create_abort_task(self, obj):
+    def create_abort_task(self, obj: GenerateReqInput):
        # Abort the request if the client is disconnected.
        async def abort_request():
            await asyncio.sleep(3)

--- a/python/sglang/srt/openai_protocol.py
+++ b/python/sglang/srt/openai_protocol.py
-"""pydantic models for OpenAI API protocol"""
+"""Pydantic models for OpenAI API protocol."""

 import time
 from typing import Dict, List, Optional, Union