first commit

efadc3bc · zzg_666 · efadc3bc · efadc3bc · efadc3bc · efadc3bc
Commit efadc3bc authored Dec 15, 2025 by zzg_666
9 changed files
--- a/.nfs000000005d9996bb0000004b
+++ b/.nfs000000005d9996bb0000004b
+nohup: ignoring input
+INFO 12-15 14:08:06 [__init__.py:245] Automatically detected platform rocm.
+INFO 12-15 14:08:09 [api_server.py:1395] vLLM API server version 0.9.2
+INFO 12-15 14:08:09 [cli_args.py:325] non-default args: {'model': '../OctoMed/OctoMed-7B/', 'trust_remote_code': True, 'dtype': 'bfloat16', 'max_model_len': 32768, 'max_seq_len_to_capture': 32768}
+INFO 12-15 14:08:16 [config.py:850] This model supports multiple tasks: {'reward', 'classify', 'generate', 'embed'}. Defaulting to 'generate'.
+`torch_dtype` is deprecated! Use `dtype` instead!
+INFO 12-15 14:08:16 [config.py:1488] Using max model len 32768
+INFO 12-15 14:08:16 [config.py:2301] Chunked prefill is enabled with max_num_batched_tokens=2048.
+INFO 12-15 14:08:20 [__init__.py:245] Automatically detected platform rocm.
+INFO 12-15 14:08:22 [core.py:529] Waiting for init message from front-end.
+INFO 12-15 14:08:22 [core.py:71] Initializing a V1 LLM engine (v0.9.2) with config: model='../OctoMed/OctoMed-7B/', speculative_config=None, tokenizer='../OctoMed/OctoMed-7B/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=../OctoMed/OctoMed-7B/, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{},"inductor_passes":{},"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"max_capture_size":512,"local_cache_dir":null}
+WARNING 12-15 14:08:22 [worker_base.py:42] VLLM_RANK0_NUMA is unset or set incorrectly, vllm will not bind to numa! VLLM_RANK0_NUMA = -1
+INFO 12-15 14:08:22 [worker_base.py:654] ########## 488 process(rank0) is running on CPU(s): {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175}
+INFO 12-15 14:08:22 [worker_base.py:655] ########## 488 process(rank0) is running on memnode(s): {0, 1}
+INFO 12-15 14:08:32 [parallel_state.py:1077] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+INFO 12-15 14:08:33 [gpu_model_runner.py:1819] Starting to load model ../OctoMed/OctoMed-7B/...
+INFO 12-15 14:08:33 [gpu_model_runner.py:1824] Loading model from scratch...
+INFO 12-15 14:08:33 [rocm.py:288] Using Flash Attention backend on V1 engine. (only supports block size 64)
+
Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
+
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:08<00:24,  8.02s/it]
+
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:17<00:18,  9.11s/it]
+
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:28<00:09,  9.60s/it]
+
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:30<00:00,  6.68s/it]
+
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:30<00:00,  7.57s/it]
+
+INFO 12-15 14:09:05 [default_loader.py:272] Loading weights took 31.55 seconds
+INFO 12-15 14:09:05 [gpu_model_runner.py:1850] Model loading took 15.6271 GiB and 31.809581 seconds
+INFO 12-15 14:09:05 [gpu_model_runner.py:2302] Encoder cache will be initialized with a budget of 16384 tokens, and profiled with 1 image items of the maximum feature size.
+INFO 12-15 14:09:35 [backends.py:508] Using cache directory: /root/.cache/vllm/torch_compile_cache/a4c96d359e/rank_0_0/backbone for vLLM's torch.compile
+INFO 12-15 14:09:35 [backends.py:519] Dynamo bytecode transform time: 5.88 s
+INFO 12-15 14:09:39 [backends.py:181] Cache the graph of shape None for later use
+INFO 12-15 14:09:57 [backends.py:193] Compiling a graph for general shape takes 21.09 s
+INFO 12-15 14:10:00 [monitor.py:34] torch.compile takes 26.97 s in total
+INFO 12-15 14:10:01 [gpu_worker.py:239] Available KV cache memory: 38.03 GiB
+INFO 12-15 14:10:01 [kv_cache_utils.py:716] GPU KV cache size: 712,064 tokens
+INFO 12-15 14:10:01 [kv_cache_utils.py:720] Maximum concurrency for 32,768 tokens per request: 21.73x
+
Capturing CUDA graph shapes:   0%|          | 0/67 [00:00<?, ?it/s]
Capturing CUDA graph shapes:   1%|▏         | 1/67 [00:00<00:24,  2.67it/s]
Capturing CUDA graph shapes:   3%|▎         | 2/67 [00:00<00:23,  2.77it/s]
Capturing CUDA graph shapes:   4%|▍         | 3/67 [00:01<00:22,  2.81it/s]
Capturing CUDA graph shapes:   6%|▌         | 4/67 [00:01<00:22,  2.81it/s]
Capturing CUDA graph shapes:   7%|▋         | 5/67 [00:01<00:22,  2.78it/s]
Capturing CUDA graph shapes:   9%|▉         | 6/67 [00:02<00:21,  2.81it/s]
Capturing CUDA graph shapes:  10%|█         | 7/67 [00:02<00:21,  2.80it/s]
Capturing CUDA graph shapes:  12%|█▏        | 8/67 [00:02<00:21,  2.73it/s]
Capturing CUDA graph shapes:  13%|█▎        | 9/67 [00:03<00:20,  2.78it/s]
Capturing CUDA graph shapes:  15%|█▍        | 10/67 [00:03<00:20,  2.79it/s]
Capturing CUDA graph shapes:  16%|█▋        | 11/67 [00:03<00:19,  2.83it/s]
Capturing CUDA graph shapes:  18%|█▊        | 12/67 [00:04<00:19,  2.83it/s]
Capturing CUDA graph shapes:  19%|█▉        | 13/67 [00:04<00:18,  2.84it/s]
Capturing CUDA graph shapes:  21%|██        | 14/67 [00:04<00:18,  2.82it/s]
Capturing CUDA graph shapes:  22%|██▏       | 15/67 [00:05<00:18,  2.81it/s]
Capturing CUDA graph shapes:  24%|██▍       | 16/67 [00:05<00:18,  2.83it/s]
Capturing CUDA graph shapes:  25%|██▌       | 17/67 [00:06<00:17,  2.81it/s]
Capturing CUDA graph shapes:  27%|██▋       | 18/67 [00:06<00:17,  2.82it/s]
Capturing CUDA graph shapes:  28%|██▊       | 19/67 [00:06<00:16,  2.87it/s]
Capturing CUDA graph shapes:  30%|██▉       | 20/67 [00:07<00:16,  2.91it/s]
Capturing CUDA graph shapes:  31%|███▏      | 21/67 [00:07<00:15,  2.92it/s]
Capturing CUDA graph shapes:  33%|███▎      | 22/67 [00:07<00:15,  2.95it/s]
Capturing CUDA graph shapes:  34%|███▍      | 23/67 [00:08<00:14,  2.97it/s]
Capturing CUDA graph shapes:  36%|███▌      | 24/67 [00:08<00:14,  2.94it/s]
Capturing CUDA graph shapes:  37%|███▋      | 25/67 [00:08<00:14,  2.93it/s]
Capturing CUDA graph shapes:  39%|███▉      | 26/67 [00:09<00:14,  2.90it/s]
Capturing CUDA graph shapes:  40%|████      | 27/67 [00:09<00:13,  2.89it/s]
Capturing CUDA graph shapes:  42%|████▏     | 28/67 [00:09<00:13,  2.92it/s]
Capturing CUDA graph shapes:  43%|████▎     | 29/67 [00:10<00:12,  2.94it/s]
Capturing CUDA graph shapes:  45%|████▍     | 30/67 [00:10<00:12,  2.93it/s]
Capturing CUDA graph shapes:  46%|████▋     | 31/67 [00:10<00:12,  2.95it/s]
Capturing CUDA graph shapes:  48%|████▊     | 32/67 [00:11<00:11,  2.98it/s]
Capturing CUDA graph shapes:  49%|████▉     | 33/67 [00:11<00:11,  2.98it/s]
Capturing CUDA graph shapes:  51%|█████     | 34/67 [00:11<00:10,  3.01it/s]
Capturing CUDA graph shapes:  52%|█████▏    | 35/67 [00:12<00:10,  3.04it/s]
Capturing CUDA graph shapes:  54%|█████▎    | 36/67 [00:12<00:10,  3.04it/s]
Capturing CUDA graph shapes:  55%|█████▌    | 37/67 [00:12<00:09,  3.06it/s]
Capturing CUDA graph shapes:  57%|█████▋    | 38/67 [00:13<00:09,  3.09it/s]
Capturing CUDA graph shapes:  58%|█████▊    | 39/67 [00:13<00:09,  3.08it/s]
Capturing CUDA graph shapes:  60%|█████▉    | 40/67 [00:13<00:08,  3.09it/s]
Capturing CUDA graph shapes:  61%|██████    | 41/67 [00:14<00:08,  3.06it/s]
Capturing CUDA graph shapes:  63%|██████▎   | 42/67 [00:14<00:08,  3.06it/s]
Capturing CUDA graph shapes:  64%|██████▍   | 43/67 [00:14<00:07,  3.09it/s]
Capturing CUDA graph shapes:  66%|██████▌   | 44/67 [00:15<00:07,  3.13it/s]
Capturing CUDA graph shapes:  67%|██████▋   | 45/67 [00:15<00:07,  3.11it/s]
Capturing CUDA graph shapes:  69%|██████▊   | 46/67 [00:15<00:06,  3.11it/s]
Capturing CUDA graph shapes:  70%|███████   | 47/67 [00:15<00:06,  3.13it/s]
Capturing CUDA graph shapes:  72%|███████▏  | 48/67 [00:16<00:06,  3.07it/s]
Capturing CUDA graph shapes:  73%|███████▎  | 49/67 [00:16<00:05,  3.13it/s]
Capturing CUDA graph shapes:  75%|███████▍  | 50/67 [00:16<00:05,  3.18it/s]
Capturing CUDA graph shapes:  76%|███████▌  | 51/67 [00:17<00:04,  3.20it/s]
Capturing CUDA graph shapes:  78%|███████▊  | 52/67 [00:17<00:04,  3.21it/s]
Capturing CUDA graph shapes:  79%|███████▉  | 53/67 [00:17<00:04,  3.25it/s]
Capturing CUDA graph shapes:  81%|████████  | 54/67 [00:18<00:04,  3.17it/s]
Capturing CUDA graph shapes:  82%|████████▏ | 55/67 [00:18<00:03,  3.16it/s]
Capturing CUDA graph shapes:  84%|████████▎ | 56/67 [00:18<00:03,  3.16it/s]
Capturing CUDA graph shapes:  85%|████████▌ | 57/67 [00:19<00:03,  3.18it/s]
Capturing CUDA graph shapes:  87%|████████▋ | 58/67 [00:19<00:02,  3.20it/s]
Capturing CUDA graph shapes:  88%|████████▊ | 59/67 [00:19<00:02,  3.24it/s]
Capturing CUDA graph shapes:  90%|████████▉ | 60/67 [00:20<00:02,  3.25it/s]
Capturing CUDA graph shapes:  91%|█████████ | 61/67 [00:20<00:01,  3.23it/s]
Capturing CUDA graph shapes:  93%|█████████▎| 62/67 [00:20<00:01,  3.26it/s]
Capturing CUDA graph shapes:  94%|█████████▍| 63/67 [00:20<00:01,  3.31it/s]
Capturing CUDA graph shapes:  96%|█████████▌| 64/67 [00:21<00:00,  3.30it/s]
Capturing CUDA graph shapes:  97%|█████████▋| 65/67 [00:21<00:00,  3.31it/s]
Capturing CUDA graph shapes:  99%|█████████▊| 66/67 [00:21<00:00,  3.27it/s]
Capturing CUDA graph shapes: 100%|██████████| 67/67 [00:22<00:00,  3.27it/s]
Capturing CUDA graph shapes: 100%|██████████| 67/67 [00:22<00:00,  3.02it/s]
+INFO 12-15 14:10:23 [gpu_model_runner.py:2391] Graph capturing finished in 22 secs, took 0.37 GiB
+INFO 12-15 14:10:24 [core.py:174] init engine (profile, create kv cache, warmup model) took 78.36 seconds
+INFO 12-15 14:10:24 [loggers.py:137] Engine 000: vllm cache_config_info with initialization after num_gpu_blocks is: 11126
+WARNING 12-15 14:10:24 [config.py:1408] Default sampling parameters have been overridden by the model's Hugging Face generation config recommended from the model creator. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`.
+INFO 12-15 14:10:24 [serving_chat.py:125] Using default chat sampling params from model: {'repetition_penalty': 1.05, 'temperature': 1e-06}
+INFO 12-15 14:10:24 [serving_completion.py:72] Using default completion sampling params from model: {'repetition_penalty': 1.05, 'temperature': 1e-06}
+INFO 12-15 14:10:24 [api_server.py:1457] Starting vLLM API server 0 on http://0.0.0.0:8000
+INFO 12-15 14:10:24 [launcher.py:29] Available routes are:
+INFO 12-15 14:10:24 [launcher.py:37] Route: /openapi.json, Methods: HEAD, GET
+INFO 12-15 14:10:24 [launcher.py:37] Route: /docs, Methods: HEAD, GET
+INFO 12-15 14:10:24 [launcher.py:37] Route: /docs/oauth2-redirect, Methods: HEAD, GET
+INFO 12-15 14:10:24 [launcher.py:37] Route: /redoc, Methods: HEAD, GET
+INFO 12-15 14:10:24 [launcher.py:37] Route: /health, Methods: GET
+INFO 12-15 14:10:24 [launcher.py:37] Route: /load, Methods: GET
+INFO 12-15 14:10:24 [launcher.py:37] Route: /ping, Methods: POST
+INFO 12-15 14:10:24 [launcher.py:37] Route: /ping, Methods: GET
+INFO 12-15 14:10:24 [launcher.py:37] Route: /tokenize, Methods: POST
+INFO 12-15 14:10:24 [launcher.py:37] Route: /detokenize, Methods: POST
+INFO 12-15 14:10:24 [launcher.py:37] Route: /v1/models, Methods: GET
+INFO 12-15 14:10:24 [launcher.py:37] Route: /version, Methods: GET
+INFO 12-15 14:10:24 [launcher.py:37] Route: /v1/chat/completions, Methods: POST
+INFO 12-15 14:10:24 [launcher.py:37] Route: /v1/completions, Methods: POST
+INFO 12-15 14:10:24 [launcher.py:37] Route: /v1/embeddings, Methods: POST
+INFO 12-15 14:10:24 [launcher.py:37] Route: /pooling, Methods: POST
+INFO 12-15 14:10:24 [launcher.py:37] Route: /classify, Methods: POST
+INFO 12-15 14:10:24 [launcher.py:37] Route: /score, Methods: POST
+INFO 12-15 14:10:24 [launcher.py:37] Route: /v1/score, Methods: POST
+INFO 12-15 14:10:24 [launcher.py:37] Route: /v1/audio/transcriptions, Methods: POST
+INFO 12-15 14:10:24 [launcher.py:37] Route: /v1/audio/translations, Methods: POST
+INFO 12-15 14:10:24 [launcher.py:37] Route: /rerank, Methods: POST
+INFO 12-15 14:10:24 [launcher.py:37] Route: /v1/rerank, Methods: POST
+INFO 12-15 14:10:24 [launcher.py:37] Route: /v2/rerank, Methods: POST
+INFO 12-15 14:10:24 [launcher.py:37] Route: /invocations, Methods: POST
+INFO 12-15 14:10:24 [launcher.py:37] Route: /metrics, Methods: GET
+INFO:     Started server process [38]
+INFO:     Waiting for application startup.
+INFO:     Application startup complete.
+INFO 12-15 14:21:45 [chat_utils.py:444] Detected the chat template content format to be 'openai'. You can set `--chat-template-content-format` to override this.
+WARNING 12-15 14:21:45 [sampling_params.py:344] temperature 1e-06 is less than 0.01, which may cause numerical errors nan or inf in tensors. We have maxed it out to 0.01.
+INFO 12-15 14:21:45 [logger.py:43] Received request chatcmpl-46c2e53655184f288c755e4c36c0d5a6: prompt: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nDescribe this image in one sentence.<|vision_start|><|image_pad|><|vision_end|><|im_end|>\n<|im_start|>assistant\n', params: SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.05, temperature=0.01, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=32739, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None, extra_args=None), prompt_token_ids: None, prompt_embeds shape: None, lora_request: None, prompt_adapter_request: None.
+INFO 12-15 14:21:47 [async_llm.py:270] Added request chatcmpl-46c2e53655184f288c755e4c36c0d5a6.
+INFO 12-15 14:21:54 [loggers.py:118] Engine 000: Avg prompt throughput: 37.3 tokens/s, Avg generation throughput: 21.5 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.1%, Prefix cache hit rate: 0.0%
+INFO:     127.0.0.1:39298 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+INFO 12-15 14:22:04 [loggers.py:118] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 14.9 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0%
+INFO 12-15 14:22:14 [loggers.py:118] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0%
--- a/LICENSE
+++ b/LICENSE
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/README.md
+++ b/README.md
+# OctoMed-7B
+## 论文
+[OctoMed-7B](https://arxiv.org/pdf/2511.23269)
+## 模型简介
+OctoMed-7B是一款高性能多模态医学推理模型，通过大规模数据治理和基于监督微调（SFT）的方法构建。为支撑可靠的临床推理能力，开发了可扩展的数据处理流程，从DeepSeek-R1和GPT-4o中蒸馏出结构化推理轨迹，构建了迄今规模最大的多模态医学推理数据集，包含超过800万条推理轨迹和68亿响应token。  
+OctoMed-7B以Qwen2.5-VL-7B-Instruct为基座模型，在该精炼数据集上进行训练，在多项分布外医学基准测试中均实现了优异且稳健的性能表现。OctoMed-7B在输出最终答案前，会通过`<think>...</think>`标记生成内部推理轨迹。通常，该模型在面对难度较高或定义不明确的问题时倾向于延长推理过程，而对于简单查询则保持较短的推理轨迹。医学基准测试性能表现如下：
+
+<div align=center>
+    <img src="./doc/perf.png"/>
+</div>
+
+## 环境依赖
+
+| 软件 | 版本 |
+| :------: | :------: |
+| DTK | 25.04.2 |
+| python | 3.10.12 |
+| transformers | >=4.57.1 |
+| vllm |  0.9.2+das.opt1.dtk25042 |
+| torch | 2.5.1+das.opt1.dtk25042 |
+| triton | 3.1+das.opt1.3c5d12d.dtk25041 |
+| flash_attn | 2.6.1+das.opt1.dtk2504 |
+| flash_mla | 1.0.0+das.opt1.dtk25042 |
+
+当前仅支持镜像:
+- 挂载地址`-v`根据实际模型情况修改
+
+```bash
+docker run -it --shm-size 60g --network=host --name OctoMed --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /opt/hyhal/:/opt/hyhal/:ro -v /path/your_code_path/:/path/your_code_path/  image.sourcefind.cn:5000/dcu/admin/base/vllm:0.9.2-ubuntu22.04-dtk25.04.2-py3.10 bash
+```
+更多镜像可前往[光源](https://sourcefind.cn/#/service-list)下载使用。
+
+关于本项目DCU显卡所需的特殊深度学习库可从[光合](https://developer.sourcefind.cn/tool/)开发者社区下载安装。
+
+## 数据集
+暂无
+
+## 训练
+暂无
+
+## 推理
+
+### vllm
+#### 单机推理
+可参考vllm_serve.sh脚本
+```bash
+## serve启动
+vllm serve OctoMed/OctoMed-7B/    --trust-remote-code --dtype bfloat16 --max-seq-len-to-capture 32768 -tp 1   --max-model-len 32768
+## client访问
+## 可参考vllm_cilent.sh
+curl -X POST "http://localhost:8000/v1/chat/completions"   -H "Content-Type: application/json"     --data '{
+                "model": "OctoMed/OctoMed-7B/",
+                "messages": [
+                        {
+                                "role": "user",
+                                "content": [
+                                        {
+                                                "type": "text",
+                                                "text": "Describe this image in one sentence."
+                                        },
+                                        {
+                                                "type": "image_url",
+                                                "image_url": {
+                                                        "url": "https://img-s.msn.cn/tenant/amp/entityid/AA1S6LMz.img?w=640&h=427&m=6"
+                                                }
+                                        }
+                                ]
+                        }
+                ]
+        }'
+```
+
+## 效果展示
+<div align=center>
+    <img src="./doc/result.png"/>
+</div>
+
+### 精度
+DCU与GPU精度一致，推理框架：vllm。
+
+## 预训练权重
+| 模型名称  | 权重大小  | DCU型号  | 最低卡数需求 |下载地址|
+|:-----:|:----------:|:----------:|:---------------------:|:----------:|
+| OctoMed-7B | 7B | K100AI | 1 | [下载地址](https://huggingface.co/OctoMed/OctoMed-7B) |
+
+## 源码仓库及问题反馈
+- https://developer.sourcefind.cn/codes/modelzoo/octomed_vllm
+
+## 参考资料
+- https://huggingface.co/OctoMed/OctoMed-7B
--- a/doc/perf.png
+++ b/doc/perf.png
--- a/doc/result.png
+++ b/doc/result.png
--- a/icon.png
+++ b/icon.png
--- a/model.properties
+++ b/model.properties
+# 模型唯一标识
+modelCode=1896
+# 模型名称
+modelName=OctoMed_vllm
+# 模型描述
+modelDescription=OctoMed-7B是一款高性能多模态医学推理模型，通过大规模数据治理和基于监督微调（SFT）的方法构建
+processType=推理
+# 算法类别
+appScenario=多模态
+# 框架类型
+frameType=vllm
+# 加速卡类型
+accelerateType=K100AI
--- a/vllm_cilent.sh
+++ b/vllm_cilent.sh
+curl -X POST "http://localhost:8000/v1/chat/completions"   -H "Content-Type: application/json"     --data '{
+                "model": "OctoMed/OctoMed-7B/",
+                "messages": [
+                        {
+                                "role": "user",
+                                "content": [
+                                        {
+                                                "type": "text",
+                                                "text": "Describe this image in one sentence."
+                                        },
+                                        {
+                                                "type": "image_url",
+                                                "image_url": {
+                                                        "url": "https://img-s.msn.cn/tenant/amp/entityid/AA1S6LMz.img?w=640&h=427&m=6"
+                                                }
+                                        }
+                                ]
+                        }
+                ]
+        }'
\ No newline at end of file
--- a/vllm_serve.sh
+++ b/vllm_serve.sh
+vllm serve OctoMed/OctoMed-7B/    --trust-remote-code --dtype bfloat16 --max-seq-len-to-capture 32768 -tp 1   --max-model-len 32768
\ No newline at end of file