Commit 2397728d authored by chenych's avatar chenych
Browse files

Update infos and codes

parent 405b3897
......@@ -279,12 +279,10 @@ def vllm_inference_stream(bind_port, model, tokenizer, sampling_params):
logger.info("****************** in stream chat ******************")
response = web.StreamResponse()
await response.prepare(request)
text_outputs = None
async for request_output in results_generator:
prompt = request_output.prompt
text_outputs = [output.text for output in request_output.outputs]
ret = {"text": text_outputs}
await response.write((json.dumps(ret)+"\0").encode("utf-8"))
await response.write((json.dumps({"text": text_outputs})+"\0").encode("utf-8"))
response.write_eof()
return response
......@@ -293,28 +291,6 @@ def vllm_inference_stream(bind_port, model, tokenizer, sampling_params):
web.run_app(app, host='0.0.0.0', port=bind_port)
def infer_test(args):
    """Run a one-shot local inference test against the configured LLM.

    Reads the model settings from the INI file at ``args.config_path``,
    initializes the model/tokenizer pair, sends ``args.query`` through
    ``LLMInference.chat`` and logs the answer together with the latency.

    Args:
        args: parsed CLI namespace; must provide ``config_path`` (path to the
            INI config with an ``[llm]`` section) and ``query`` (the prompt).

    Returns:
        None. Results are emitted via the module-level ``logger``.
    """
    config = configparser.ConfigParser()
    config.read(args.config_path)

    # [llm] section: model location plus vLLM/stream toggles.
    model_path = config['llm']['local_llm_path']
    use_vllm = config.getboolean('llm', 'use_vllm')
    tensor_parallel_size = config.getint('llm', 'tensor_parallel_size')
    stream_chat = config.getboolean('llm', 'stream_chat')
    logger.info(f"Get params: model_path {model_path}, use_vllm {use_vllm}, tensor_parallel_size {tensor_parallel_size}, stream_chat {stream_chat}")

    # init_model / LLMInference are defined elsewhere in this module.
    model, tokenizer = init_model(model_path, use_vllm, tensor_parallel_size)
    llm_infer = LLMInference(model,
                             tokenizer,
                             use_vllm=use_vllm)

    # Time a single chat round-trip to report end-to-end latency.
    time_first = time.time()
    output_text = llm_infer.chat(args.query)
    time_second = time.time()
    logger.debug('问题:{} 回答:{} \ntimecost {} '.format(
        args.query, output_text, time_second - time_first))
def set_envs(dcu_ids):
try:
os.environ["CUDA_VISIBLE_DEVICES"] = dcu_ids
......@@ -366,7 +342,6 @@ def main():
vllm_inference(bind_port, model, tokenizer, sampling_params)
else:
hf_inference(bind_port, model, tokenizer, stream_chat)
# infer_test(args)
if __name__ == '__main__':
......
......@@ -20,6 +20,6 @@ textract==1.6.5
tiktoken==0.7.0
tenacity==8.3.0
tokenizers==0.15.2
transformers==4.38.0
transformers
unstructured==0.11.2
PyMuPDF==1.24.3
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment