import argparse
import configparser
import json
from typing import Iterable, List

import requests


def get_streaming_response(response: requests.Response) -> Iterable[List[str]]:
    """Yield the ``"text"`` field of each NUL-delimited JSON chunk of a streamed response."""
    for chunk in response.iter_lines(chunk_size=1024,
                                     decode_unicode=False,
                                     delimiter=b"\0"):
        if chunk:  # skip empty keep-alive chunks between delimiters
            data = json.loads(chunk.decode("utf-8"))
            yield data["text"]


def get_response(response: requests.Response) -> List[str]:
    """Return the ``"text"`` field of a non-streamed JSON response body."""
    data = json.loads(response.content.decode("utf-8"))
    return data["text"]


def clear_line(n: int = 1) -> None:
    """Move the cursor up and erase ``n`` previously printed terminal lines."""
    line_up = '\033[1A'     # ANSI escape: cursor up one line
    line_clear = '\x1b[2K'  # ANSI escape: erase entire line
    for _ in range(n):
        print(line_up, end=line_clear, flush=True)


def main() -> None:
    """Send ``--query`` to the local inference server and print beam candidates.

    Reads the ``[llm] stream_chat`` flag from ``--config_path`` to decide
    between the streaming and non-streaming endpoint.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--query', default='请写一首诗')
    parser.add_argument('--use_hf', action='store_true')
    parser.add_argument(
        '--config_path',
        default='../config.ini',
        help='config目录')
    args = parser.parse_args()
    print(args.query)

    headers = {"Content-Type": "application/json"}
    json_str = json.dumps({"query": args.query, "history": []})

    config = configparser.ConfigParser()
    config.read(args.config_path)
    stream_chat = config.getboolean('llm', 'stream_chat')

    # Endpoint selection. NOTE(review): when both are set, stream_chat silently
    # overrides --use_hf (the HF endpoint is never used in streaming mode) —
    # confirm this precedence is intended.
    func = 'vllm_inference'
    if args.use_hf:
        func = 'hf_inference'
    if stream_chat:
        func = 'vllm_inference_stream'
    api_url = f"http://localhost:8888/{func}"

    # Single request for both modes (the original duplicated this call in each
    # branch). NOTE(security): verify=False disables TLS certificate checks;
    # tolerable only because the target is localhost.
    response = requests.get(api_url,
                            headers=headers,
                            data=json_str.encode("utf-8"),
                            verify=False,
                            stream=stream_chat)

    if stream_chat:
        num_printed_lines = 0
        for h in get_streaming_response(response):
            # Each streamed chunk carries the full text so far: erase the
            # previously printed candidates and reprint in place.
            clear_line(num_printed_lines)
            num_printed_lines = 0
            for i, line in enumerate(h):
                num_printed_lines += 1
                print(f"Beam candidate {i}: {line!r}", flush=True)
    else:
        output = get_response(response)
        for i, line in enumerate(output):
            print(f"Beam candidate {i}: {line!r}", flush=True)


if __name__ == "__main__":
    main()