client.py 2.4 KB
Newer Older
Rayyyyy's avatar
Rayyyyy committed
1
2
3
import json
import argparse
import requests
chenych's avatar
chenych committed
4
import configparser
chenych's avatar
chenych committed
5

chenych's avatar
chenych committed
6
from typing import Iterable, List
Rayyyyy's avatar
Rayyyyy committed
7
8


chenych's avatar
chenych committed
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
def get_streaming_response(response: requests.Response) -> Iterable[List[str]]:
    """Yield the ``text`` field of each NUL-delimited JSON chunk.

    The server terminates every JSON message with a b"\\0" byte; empty
    chunks produced by the delimiter split are skipped.
    """
    raw_chunks = response.iter_lines(chunk_size=1024, decode_unicode=False,
                                     delimiter=b"\0")
    for raw in raw_chunks:
        if not raw:
            continue
        payload = json.loads(raw.decode("utf-8"))
        yield payload["text"]


def get_response(response: requests.Response) -> List[str]:
    """Parse a non-streaming server reply and return its ``text`` field."""
    body = response.content.decode("utf-8")
    return json.loads(body)["text"]


def clear_line(n: int = 1) -> None:
    """Erase the previous *n* terminal lines using ANSI escape codes."""
    # Cursor-up one line ('\033[1A') immediately followed by
    # erase-entire-line ('\x1b[2K'); no newline is emitted.
    move_up = '\033[1A'
    erase = '\x1b[2K'
    for _ in range(n):
        print(move_up, end=erase, flush=True)


if __name__ == "__main__":
    # CLI client for the local inference server: pick the endpoint from the
    # --use_hf flag and the config file, POST the query, print the reply.
    parser = argparse.ArgumentParser()
    parser.add_argument('--query', default='请写一首诗')
    parser.add_argument('--use_hf', action='store_true')
    parser.add_argument(
        '--config_path', default='../config.ini', help='config目录')
    args = parser.parse_args()

    print(args.query)
    # Encode once; both branches send the identical body.
    payload = json.dumps({"query": args.query, "history": []}).encode("utf-8")

    config = configparser.ConfigParser()
    config.read(args.config_path)
    stream_chat = config.getboolean('llm', 'stream_chat')

    # Endpoint precedence (matches the original two-step override):
    # streaming beats --use_hf, which beats the default vLLM endpoint.
    if stream_chat:
        func = 'vllm_inference_stream'
    elif args.use_hf:
        func = 'hf_inference'
    else:
        func = 'vllm_inference'
    api_url = f"http://localhost:8888/{func}"

    if stream_chat:
        headers = {
            "Content-Type": "text/event-stream",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
        }
        # verify=False is a no-op over plain http but kept for parity with
        # the server deployment that may sit behind self-signed TLS.
        response = requests.post(api_url, headers=headers, data=payload,
                                 verify=False, stream=True)
        # Each yielded item is the server's current list of candidate texts.
        for texts in get_streaming_response(response):
            for line in texts:
                print(f"{line!r}", flush=True)
    else:
        headers = {"Content-Type": "application/json"}
        response = requests.post(api_url, headers=headers, data=payload,
                                 verify=False)
        for i, line in enumerate(get_response(response)):
            print(f"Beam candidate {i}: {line!r}", flush=True)