import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import openai  # requires openai<1.0 (legacy ChatCompletion API)


def jls_extract_def(model, messages, temperature, max_length, stream, index):
    """Send one streaming chat request and return (output_tokens, text, tokens/s)."""
    # Point the legacy openai client at the local fastllm OpenAI-compatible server.
    openai.api_base = "http://127.0.0.1:8100/v1"
    openai.api_key = "none"
    output_tokens = 0
    ret = ""
    t0 = time.time()
    result = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_length=max_length,  # passed through to the fastllm backend
        stream=stream,
    )
    for chunk in result:
        # print(chunk)
        output_tokens += 1  # approximate: one token per streamed chunk
        content = getattr(chunk.choices[0].delta, "content", None)
        if content:
            if index == 0:
                # Only echo the first worker's stream to avoid interleaved output.
                print(content, end="", flush=True)
            ret += content
    t1 = time.time()
    # print("\ntoken/s: {:.2f}, output_tokens: {}".format(output_tokens / (t1 - t0), output_tokens))
    return output_tokens, ret, output_tokens / (t1 - t0)


if __name__ == "__main__":
    prompt = "满江红全文"  # ask for the full text of the poem "Man Jiang Hong"
    concurrencys = [1]  # concurrency levels to benchmark
    temperature = 0.1
    max_length = 4096
    stream = True
    prompts = [prompt]
    model = "chatglm3-6b-fastllm"
    pool = ThreadPoolExecutor(max_workers=32)

    for i in range(len(concurrencys)):
        cur_prompts = prompts * concurrencys[i]
        token_count = 0
        threads = []
        t0 = time.time()
        for index, prompt in enumerate(cur_prompts):
            # Build a fresh messages list per request so concurrent workers
            # do not share (and mutate) the same dict.
            messages = [{"role": "user", "content": prompt}]
            t = pool.submit(jls_extract_def, model, messages, temperature, max_length, stream, index)
            t.index = index
            threads.append(t)
        for future in as_completed(threads):
            result = future.result()
            print(future.index)
            print(result)
            print("\n")
            token_count += result[0]
        t1 = time.time()
        print("\n---------------------------------------------\n")
        print("\nconcurrency: {}".format(concurrencys[i]))
        print("\ntotal use: {:.2f}".format(t1 - t0))
        print("\ntoken/s: {:.2f}, token_count: {}".format(token_count / (t1 - t0), token_count))
        print("\n---------------------------------------------\n")