change / sglang / Commits

Unverified commit c77c1e05, authored Nov 07, 2024 by Chayenne, committed by GitHub Nov 08, 2024
fix black in pre-commit (#1940)
Parent: dca87ec3

Showing 20 changed files with 617 additions and 494 deletions
.pre-commit-config.yaml (+3, -3)
docs/backend/native_api.ipynb (+61, -64)
docs/backend/offline_engine_api.ipynb (+24, -24)
docs/backend/openai_api_completions.ipynb (+56, -53)
docs/backend/openai_api_embeddings.ipynb (+24, -24)
docs/backend/openai_api_vision.ipynb (+24, -24)
docs/conf.py (+5, -3)
docs/start/send_request.ipynb (+35, -37)
examples/frontend_language/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb (+26, -19)
examples/runtime/engine/input_ids.py (+1, -0)
python/sglang/srt/configs/model_config.py (+4, -2)
python/sglang/srt/layers/quantization/base_config.py (+4, -6)
python/sglang/srt/layers/vocab_parallel_embedding.py (+214, -148)
python/sglang/srt/managers/io_struct.py (+4, -2)
python/sglang/srt/managers/schedule_batch.py (+2, -2)
python/sglang/srt/managers/scheduler.py (+25, -22)
python/sglang/srt/managers/tokenizer_manager.py (+11, -7)
python/sglang/srt/metrics/metrics_collector.py (+59, -16)
python/sglang/srt/models/gpt2.py (+30, -36)
python/sglang/srt/server.py (+5, -2)
.pre-commit-config.yaml

@@ -30,6 +30,6 @@ repos:
     rev: 24.10.0
     hooks:
       - id: black
-        additional_dependencies: ['.[jupyter]']
-        types: [python, jupyter]
-        types_or: [python, jupyter]
+        types: [python]
+      - id: black-jupyter
+        types: [jupyter]
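The bug this fixes is in pre-commit's filter semantics: `types` requires a file to carry every listed tag, while `types_or` accepts any of them, so a single hook filtered with `types: [python, jupyter]` (as in the old config above) matches no files at all. A minimal sketch of the two filters, using the identify library that pre-commit itself relies on for file classification (requires `pip install identify`):

from identify import identify

py_tags = identify.tags_from_filename("conf.py")  # e.g. {'python', 'text'}
nb_tags = identify.tags_from_filename("native_api.ipynb")  # includes 'jupyter'

# `types` is an AND filter: no single file is both python and jupyter.
print(all(t in py_tags for t in ["python", "jupyter"]))  # False
print(all(t in nb_tags for t in ["python", "jupyter"]))  # False

# The fixed config gives each hook exactly one type instead.
print("python" in py_tags)  # True: the `black` hook now runs on .py files
print("jupyter" in nb_tags)  # True: the `black-jupyter` hook runs on notebooks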
docs/backend/native_api.ipynb

Fifteen hunks (@@ -34, -73, -101, -132, -164, -183, -211, -241, -271, -296, -335, -360, -392, -419, and -460) touch only notebook execution metadata: the "iopub.execute_input", "iopub.status.busy", "iopub.status.idle", and "shell.execute_reply" timestamps in each cell move from the 2024-11-05T05:08–05:09 run to the 2024-11-07T18:44–18:46 run.

The single source change collapses a request dict that black can fit on one line:

@@ -445,10 +445,7 @@
 "prompts = tokenizer.apply_chat_template(CONVS, tokenize=False)\n",
 "\n",
 "url = \"http://localhost:30030/classify\"\n",
-"data = {\n",
-"    \"model\": \"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\", \n",
-"    \"text\": prompts\n",
-"}\n",
+"data = {\"model\": \"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\", \"text\": prompts}\n",
 "\n",
 "responses = requests.post(url, json=data).json()\n",
 "for response in responses:\n",
docs/backend/offline_engine_api.ipynb

Six hunks (@@ -35, -64, -99, -137, -179, and -216) touch only notebook execution metadata: the four execution timestamps per cell move from the 2024-11-05T05:21–05:22 run to the 2024-11-07T18:46 run.
docs/backend/openai_api_completions.ipynb

Thirteen hunks (@@ -39, -79, -119, -165, -198, -234, -273, -322, -365, -427, -482, -565, and -660) touch only notebook execution metadata: timestamps move from the 2024-11-05T05:09–05:11 run to the 2024-11-07T18:46–18:48 run.

The single source change is black expanding a message dict that no longer fits on one line:

@@ -297,7 +297,10 @@
 "response = client.chat.completions.create(\n",
 "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
 "    messages=[\n",
-"        {\"role\": \"user\", \"content\": \"Give me the information of the capital of France in the JSON format.\"},\n",
+"        {\n",
+"            \"role\": \"user\",\n",
+"            \"content\": \"Give me the information of the capital of France in the JSON format.\",\n",
+"        },\n",
 "    ],\n",
 "    temperature=0,\n",
 "    max_tokens=128,\n",
docs/backend/openai_api_embeddings.ipynb

Six hunks (@@ -35, -72, -106, -140, -176, and -208) touch only notebook execution metadata: timestamps move from the 2024-11-05T05:22 run to the 2024-11-07T18:48 run.
docs/backend/openai_api_vision.ipynb

Six hunks (@@ -39, -78, -129, -176, -227, and -276) touch only notebook execution metadata: timestamps move from the 2024-11-05T05:22–05:23 run to the 2024-11-07T18:43–18:44 run.
docs/conf.py

@@ -31,7 +31,7 @@ extensions = [
 ]
 
 nbsphinx_allow_errors = True
-nbsphinx_execute = 'never'
+nbsphinx_execute = "never"
 autosectionlabel_prefix_document = True
 nbsphinx_allow_directives = True

@@ -49,7 +49,7 @@ myst_enable_extensions = [
 myst_heading_anchors = 3
 
-nbsphinx_kernel_name = 'python3'
+nbsphinx_kernel_name = "python3"
 nbsphinx_execute_arguments = [
     "--InlineBackend.figure_formats={'svg', 'pdf'}",
     "--InlineBackend.rc={'figure.dpi': 96}",

@@ -130,8 +130,10 @@ html_context = {
 html_static_path = ["_static"]
 html_css_files = ["css/custom_log.css"]
 
+
 def setup(app):
-    app.add_css_file('css/custom_log.css')
+    app.add_css_file("css/custom_log.css")
+
 
 myst_enable_extensions = [
     "dollarmath",
docs/start/send_request.ipynb

Eight hunks (@@ -33, -70, -101, -136, -171, -214, -250, and -290) touch only notebook execution metadata: timestamps move from the 2024-11-05T05:11 run to the 2024-11-07T18:48–18:49 run.

Three source changes:

@@ -49,7 +49,7 @@
 ")\n",
 "\n",
 "server_process = execute_shell_command(\n",
-"\"\"\"\n",
+"    \"\"\"\n",
 "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
 "--port 30000 --host 0.0.0.0\n",
 "\"\"\"\n",

@@ -115,9 +115,7 @@
 "\n",
 "data = {\n",
 "    \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
-"    \"messages\": [\n",
-"        {\"role\": \"user\", \"content\": \"What is the capital of France?\"}\n",
-"    ]\n",
+"    \"messages\": [{\"role\": \"user\", \"content\": \"What is the capital of France?\"}],\n",
 "}\n",
 "\n",
 "response = requests.post(url, json=data)\n",

@@ -197,7 +195,7 @@
 "# Handle the streaming output\n",
 "for chunk in response:\n",
 "    if chunk.choices[0].delta.content:\n",
-"        print(chunk.choices[0].delta.content, end='', flush=True)"
+"        print(chunk.choices[0].delta.content, end=\"\", flush=True)"
 ]
 },
 {
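The collapsed request body in the @@ -115 hunk above can be exercised directly. A minimal sketch, assuming a local server started as in the notebook's first cell; the /v1/chat/completions path is an assumption here, since the cell that defines `url` falls outside the hunk:

import requests

# Assumed endpoint; the notebook cell defining `url` is not shown in this diff.
url = "http://localhost:30000/v1/chat/completions"

data = {
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "messages": [{"role": "user", "content": "What is the capital of France?"}],
}

response = requests.post(url, json=data)
print(response.json())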
examples/frontend_language/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb

@@ -80,7 +80,7 @@
 "if not os.path.exists(path_qca):\n",
 "    !wget https://virattt.github.io/datasets/abnb-2023-10k.json -O airbnb-2023-10k-qca.json\n",
 "\n",
-"with open(path_qca, 'r') as f:\n",
+"with open(path_qca, \"r\") as f:\n",
 "    question_context_answers = json.load(f)\n",
 "\n",
 "chroma_client = chromadb.PersistentClient()\n",

@@ -88,7 +88,7 @@
 "if collection.count() == 0:\n",
 "    collection.add(\n",
 "        documents=[qca[\"context\"] for qca in question_context_answers],\n",
-"        ids=[str(i) for i in range(len(question_context_answers))]\n",
+"        ids=[str(i) for i in range(len(question_context_answers))],\n",
 "    )"

@@ -123,7 +123,7 @@
 "\n",
 "load_dotenv()\n",
 "\n",
-"os.environ['TOKENIZERS_PARALLELISM'] = \"false\"\n",
+"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
 "\n",
 "p = Parea(api_key=os.getenv(\"PAREA_API_KEY\"), project_name=\"rag_sglang\")\n",
 "p.integrate_with_sglang()\n",

@@ -150,10 +150,7 @@
 "source": [
 "@trace\n",
 "def retrieval(question: str) -> List[str]:\n",
-"    return collection.query(\n",
-"        query_texts=[question],\n",
-"        n_results=1\n",
-"    )['documents'][0]"
+"    return collection.query(query_texts=[question], n_results=1)[\"documents\"][0]"
 ],
 "metadata": {
 "collapsed": false

@@ -176,7 +173,9 @@
 "@function\n",
 "def generation_sglang(s, question: str, *context: str):\n",
 "    context = \"\\n\".join(context)\n",
-"    s += user(f'Given this question:\\n{question}\\n\\nAnd this context:\\n{context}\\n\\nAnswer the question.')\n",
+"    s += user(\n",
+"        f\"Given this question:\\n{question}\\n\\nAnd this context:\\n{context}\\n\\nAnswer the question.\"\n",
+"    )\n",
 "    s += assistant(gen(\"answer\"))\n",
 "\n",
 "\n",

@@ -223,7 +222,9 @@
 "    return generation(question, *contexts)\n",
 "\n",
 "\n",
-"rag_pipeline(\"When did the World Health Organization formally declare an end to the COVID-19 global health emergency?\")"
+"rag_pipeline(\n",
+"    \"When did the World Health Organization formally declare an end to the COVID-19 global health emergency?\"\n",
+")"
 ]
 },
 {

@@ -271,7 +272,10 @@
 "execution_count": null,
 "outputs": [],
 "source": [
-"from parea.evals.rag import context_query_relevancy_factory, percent_target_supported_by_context_factory\n",
+"from parea.evals.rag import (\n",
+"    context_query_relevancy_factory,\n",
+"    percent_target_supported_by_context_factory,\n",
+")\n",
 "\n",
 "\n",
 "context_relevancy_eval = context_query_relevancy_factory()\n",

@@ -280,10 +284,7 @@
 "\n",
 "@trace(eval_funcs=[context_relevancy_eval, percent_target_supported_by_context])\n",
 "def retrieval(question: str) -> List[str]:\n",
-"    return collection.query(\n",
-"        query_texts=[question],\n",
-"        n_results=1\n",
-"    )['documents'][0]"
+"    return collection.query(query_texts=[question], n_results=1)[\"documents\"][0]"
 ],
 "metadata": {
 "collapsed": false

@@ -310,10 +311,13 @@
 "answer_context_faithfulness = answer_context_faithfulness_statement_level_factory()\n",
 "answer_matches_target_llm_grader = answer_matches_target_llm_grader_factory()\n",
 "\n",
+"\n",
 "@function\n",
 "def generation_sglang(s, question: str, *context: str):\n",
 "    context = \"\\n\".join(context)\n",
-"    s += user(f'Given this question:\\n{question}\\n\\nAnd this context:\\n{context}\\n\\nAnswer the question.')\n",
+"    s += user(\n",
+"        f\"Given this question:\\n{question}\\n\\nAnd this context:\\n{context}\\n\\nAnswer the question.\"\n",
+"    )\n",
 "    s += assistant(gen(\"answer\", max_tokens=1_000))\n",
 "\n",
 "\n",

@@ -357,7 +361,9 @@
 "    return generation(question, *contexts)\n",
 "\n",
 "\n",
-"rag_pipeline(\"When did the World Health Organization formally declare an end to the COVID-19 global health emergency?\")"
+"rag_pipeline(\n",
+"    \"When did the World Health Organization formally declare an end to the COVID-19 global health emergency?\"\n",
+")"
 ],
 "metadata": {
 "collapsed": false

@@ -402,6 +408,7 @@
 "source": [
 "!pip install nest-asyncio\n",
 "import nest_asyncio\n",
+"\n",
 "nest_asyncio.apply()"
 ],
 "metadata": {

@@ -461,7 +468,7 @@
 ],
 "source": [
 "e = p.experiment(\n",
-"    'RAG',\n",
+"    \"RAG\",\n",
 "    data=[\n",
 "        {\n",
 "            \"question\": qca[\"question\"],\n",

@@ -469,7 +476,7 @@
 "        }\n",
 "        for qca in question_context_answers\n",
 "    ],\n",
-"    func=rag_pipeline\n",
+"    func=rag_pipeline,\n",
 ").run()"
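Every hunk in this notebook is a mechanical black rewrite: single quotes become double quotes, long calls split or collapse around the 88-column limit, and trailing commas appear on multi-line literals. A quick way to reproduce one locally, a sketch using black's library entry point (requires `pip install black`):

import black

# The pre-fix source of the retrieval() hunk above, as black receives it.
src = (
    "def retrieval(question):\n"
    "    return collection.query(\n"
    "        query_texts=[question],\n"
    "        n_results=1\n"
    "    )['documents'][0]\n"
)

# black collapses the call because it fits within the default 88-column limit
# and carries no magic trailing comma; it also normalizes the quote style.
print(black.format_str(src, mode=black.Mode()))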
examples/runtime/engine/input_ids.py

@@ -7,6 +7,7 @@ from sglang.srt.hf_transformers_utils import get_tokenizer
 MODEL_PATH = "meta-llama/Llama-3.1-8B-Instruct"
 
+
 def main():
     # Sample prompts.
     prompts = [
python/sglang/srt/configs/model_config.py

@@ -39,7 +39,7 @@ class ModelConfig:
         revision: Optional[str] = None,
         context_length: Optional[int] = None,
         model_override_args: Optional[dict] = None,
-        is_embedding: Optional[bool] = None
+        is_embedding: Optional[bool] = None,
     ) -> None:
         # Parse args
         self.model_override_args = json.loads(model_override_args)

@@ -52,7 +52,9 @@ class ModelConfig:
         self.hf_text_config = get_hf_text_config(self.hf_config)
 
         # Check model type
-        self.is_generation = is_generation_model(self.hf_config.architectures, is_embedding)
+        self.is_generation = is_generation_model(
+            self.hf_config.architectures, is_embedding
+        )
         self.is_multimodal = is_multimodal_model(self.hf_config.architectures)
         self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
python/sglang/srt/layers/quantization/base_config.py

@@ -122,16 +122,14 @@ class QuantizationConfig(ABC):
     """
     raise NotImplementedError
 
 
 def method_has_implemented_embedding(method_class: Type[QuantizeMethodBase]) -> bool:
     """
     Not all quant methods have embedding implemented, so we need to check that
     it exists for our given method. We check this by making sure the function
     has been changed from the base implementation.
     """
-    base_embedding = inspect.getattr_static(QuantizeMethodBase, "embedding",
-                                            None)
+    base_embedding = inspect.getattr_static(QuantizeMethodBase, "embedding", None)
     class_embedding = inspect.getattr_static(method_class, "embedding", None)
-    return (class_embedding is not None
-            and class_embedding is not base_embedding)
+    return class_embedding is not None and class_embedding is not base_embedding
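The reflow above does not change behavior; the check itself is a general override-detection idiom worth seeing in isolation. A self-contained sketch, with a stub base class standing in for the real QuantizeMethodBase:

import inspect


class QuantizeMethodBase:
    def embedding(self):
        raise NotImplementedError


class ImplementsEmbedding(QuantizeMethodBase):
    def embedding(self):
        return "ok"


class InheritsEmbedding(QuantizeMethodBase):
    pass


def method_has_implemented_embedding(method_class) -> bool:
    # getattr_static reads the attribute without triggering descriptors or
    # __getattr__, so the comparison sees the raw function objects: a subclass
    # "implements" the method only if its attribute differs from the base's.
    base_embedding = inspect.getattr_static(QuantizeMethodBase, "embedding", None)
    class_embedding = inspect.getattr_static(method_class, "embedding", None)
    return class_embedding is not None and class_embedding is not base_embedding


print(method_has_implemented_embedding(ImplementsEmbedding))  # True
print(method_has_implemented_embedding(InheritsEmbedding))  # False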
python/sglang/srt/layers/vocab_parallel_embedding.py

@@ -27,59 +27,67 @@ DEFAULT_VOCAB_PADDING_SIZE = 64
 class UnquantizedEmbeddingMethod(QuantizeMethodBase):
     """Unquantized method for embeddings."""
 
-    def create_weights(self, layer: torch.nn.Module,
-                       input_size_per_partition: int,
-                       output_partition_sizes: List[int], input_size: int,
-                       output_size: int, params_dtype: torch.dtype,
-                       **extra_weight_attrs):
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
         """Create weights for embedding layer."""
-        weight = Parameter(torch.empty(sum(output_partition_sizes),
-                                       input_size_per_partition,
-                                       dtype=params_dtype),
-                           requires_grad=False)
+        weight = Parameter(
+            torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
         set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
         layer.register_parameter("weight", weight)
         set_weight_attrs(weight, extra_weight_attrs)
 
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         return F.linear(x, layer.weight, bias)
 
-    def embedding(self, layer: torch.nn.Module,
-                  input_: torch.Tensor) -> torch.Tensor:
+    def embedding(self, layer: torch.nn.Module, input_: torch.Tensor) -> torch.Tensor:
         return F.embedding(input_, layer.weight)
 
 
-def pad_vocab_size(vocab_size: int,
-                   pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int:
+def pad_vocab_size(vocab_size: int, pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int:
     """Pad the vocab size to the given value."""
     return ((vocab_size + pad_to - 1) // pad_to) * pad_to
 
 
-def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size: int,
-                                              rank: int,
-                                              offset: int = 0) -> Sequence[int]:
+def vocab_range_from_per_partition_vocab_size(
+    per_partition_vocab_size: int, rank: int, offset: int = 0
+) -> Sequence[int]:
     index_f = rank * per_partition_vocab_size
     index_l = index_f + per_partition_vocab_size
     return index_f + offset, index_l + offset
 
 
-def vocab_range_from_global_vocab_size(global_vocab_size: int,
-                                       rank: int,
-                                       world_size: int,
-                                       offset: int = 0) -> Sequence[int]:
+def vocab_range_from_global_vocab_size(
+    global_vocab_size: int, rank: int, world_size: int, offset: int = 0
+) -> Sequence[int]:
     per_partition_vocab_size = divide(global_vocab_size, world_size)
-    return vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
-                                                     rank,
-                                                     offset=offset)
+    return vocab_range_from_per_partition_vocab_size(
+        per_partition_vocab_size, rank, offset=offset
+    )
 
 
 @dataclass
 class VocabParallelEmbeddingShardIndices:
     """Indices for a shard of a vocab parallel embedding."""
 
     padded_org_vocab_start_index: int
     padded_org_vocab_end_index: int
     padded_added_vocab_start_index: int

@@ -100,13 +108,11 @@ class VocabParallelEmbeddingShardIndices:
     @property
     def num_org_elements_padded(self) -> int:
-        return (self.padded_org_vocab_end_index -
-                self.padded_org_vocab_start_index)
+        return self.padded_org_vocab_end_index - self.padded_org_vocab_start_index
 
     @property
     def num_added_elements_padded(self) -> int:
-        return (self.padded_added_vocab_end_index -
-                self.padded_added_vocab_start_index)
+        return self.padded_added_vocab_end_index - self.padded_added_vocab_start_index
 
     @property
     def num_org_vocab_padding(self) -> int:

@@ -122,17 +128,14 @@ class VocabParallelEmbeddingShardIndices:
     def __post_init__(self):
         # sanity checks
-        assert (self.padded_org_vocab_start_index <=
-                self.padded_org_vocab_end_index)
-        assert self.padded_added_vocab_start_index <= self.padded_added_vocab_end_index
+        assert self.padded_org_vocab_start_index <= self.padded_org_vocab_end_index
+        assert (
+            self.padded_added_vocab_start_index <= self.padded_added_vocab_end_index
+        )
 
         assert self.org_vocab_start_index <= self.org_vocab_end_index
         assert self.added_vocab_start_index <= self.added_vocab_end_index
 
         assert self.org_vocab_start_index <= self.padded_org_vocab_start_index
-        assert (self.added_vocab_start_index <=
-                self.padded_added_vocab_start_index)
+        assert self.added_vocab_start_index <= self.padded_added_vocab_start_index
         assert self.org_vocab_end_index <= self.padded_org_vocab_end_index
         assert self.added_vocab_end_index <= self.padded_added_vocab_end_index

@@ -142,20 +145,27 @@ class VocabParallelEmbeddingShardIndices:
 @torch.jit.script
 def get_masked_input_and_mask(
-        input_: torch.Tensor, org_vocab_start_index: int,
-        org_vocab_end_index: int, num_org_vocab_padding: int,
-        added_vocab_start_index: int,
-        added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]:
+    input_: torch.Tensor,
+    org_vocab_start_index: int,
+    org_vocab_end_index: int,
+    num_org_vocab_padding: int,
+    added_vocab_start_index: int,
+    added_vocab_end_index: int,
+) -> Tuple[torch.Tensor, torch.Tensor]:
     # torch.jit.script will fuse all of the pointwise ops below
     # into a single kernel, making it very fast
-    org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ <
-                                                          org_vocab_end_index)
+    org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index)
     added_vocab_mask = (input_ >= added_vocab_start_index) & (
-        input_ < added_vocab_end_index)
-    added_offset = added_vocab_start_index - (
-        org_vocab_end_index - org_vocab_start_index) - num_org_vocab_padding
-    valid_offset = (org_vocab_start_index *
-                    org_vocab_mask) + (added_offset * added_vocab_mask)
+        input_ < added_vocab_end_index
+    )
+    added_offset = (
+        added_vocab_start_index
+        - (org_vocab_end_index - org_vocab_start_index)
+        - num_org_vocab_padding
+    )
+    valid_offset = (org_vocab_start_index * org_vocab_mask) + (
+        added_offset * added_vocab_mask
+    )
     vocab_mask = org_vocab_mask | added_vocab_mask
     input_ = vocab_mask * (input_ - valid_offset)
     return input_, ~vocab_mask

@@ -200,7 +210,8 @@ class VocabParallelEmbedding(torch.nn.Module):
         prefix: full name of the layer in the state dict
     """  # noqa: E501
 
-    def __init__(self,
-                 num_embeddings: int,
-                 embedding_dim: int,
-                 params_dtype: Optional[torch.dtype] = None,
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        params_dtype: Optional[torch.dtype] = None,

@@ -208,7 +219,8 @@ class VocabParallelEmbedding(torch.nn.Module):
-                 padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
-                 quant_config: Optional[QuantizationConfig] = None,
+        padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
+        quant_config:
Optional
[
QuantizationConfig
]
=
None
,
prefix
:
str
=
""
,
prefix
:
str
=
""
,
enable_tp
:
bool
=
True
):
enable_tp
:
bool
=
True
,
):
super
().
__init__
()
super
().
__init__
()
self
.
enable_tp
=
enable_tp
self
.
enable_tp
=
enable_tp
...
@@ -223,18 +235,22 @@ class VocabParallelEmbedding(torch.nn.Module):
...
@@ -223,18 +235,22 @@ class VocabParallelEmbedding(torch.nn.Module):
self
.
padding_size
=
padding_size
self
.
padding_size
=
padding_size
self
.
org_vocab_size
=
org_num_embeddings
or
num_embeddings
self
.
org_vocab_size
=
org_num_embeddings
or
num_embeddings
num_added_embeddings
=
num_embeddings
-
self
.
org_vocab_size
num_added_embeddings
=
num_embeddings
-
self
.
org_vocab_size
self
.
org_vocab_size_padded
=
pad_vocab_size
(
self
.
org_vocab_size
,
self
.
org_vocab_size_padded
=
pad_vocab_size
(
self
.
padding_size
)
self
.
org_vocab_size
,
self
.
padding_size
)
self
.
num_embeddings_padded
=
pad_vocab_size
(
self
.
num_embeddings_padded
=
pad_vocab_size
(
self
.
org_vocab_size_padded
+
num_added_embeddings
,
self
.
org_vocab_size_padded
+
num_added_embeddings
,
self
.
padding_size
self
.
padding_size
)
)
assert
self
.
org_vocab_size_padded
<=
self
.
num_embeddings_padded
assert
self
.
org_vocab_size_padded
<=
self
.
num_embeddings_padded
self
.
shard_indices
=
self
.
_get_indices
(
self
.
num_embeddings_padded
,
self
.
shard_indices
=
self
.
_get_indices
(
self
.
num_embeddings_padded
,
self
.
org_vocab_size_padded
,
self
.
org_vocab_size_padded
,
self
.
num_embeddings
,
self
.
num_embeddings
,
self
.
org_vocab_size
,
tp_rank
,
self
.
org_vocab_size
,
self
.
tp_size
)
tp_rank
,
self
.
tp_size
,
)
self
.
embedding_dim
=
embedding_dim
self
.
embedding_dim
=
embedding_dim
linear_method
=
None
linear_method
=
None
...
@@ -248,11 +264,13 @@ class VocabParallelEmbedding(torch.nn.Module):
...
@@ -248,11 +264,13 @@ class VocabParallelEmbedding(torch.nn.Module):
# layer type like ParallelLMHead, this is not important.
# layer type like ParallelLMHead, this is not important.
is_embedding_layer
=
type
(
self
.
__class__
)
is
VocabParallelEmbedding
is_embedding_layer
=
type
(
self
.
__class__
)
is
VocabParallelEmbedding
linear_method_implements_embedding
=
method_has_implemented_embedding
(
linear_method_implements_embedding
=
method_has_implemented_embedding
(
type
(
linear_method
))
type
(
linear_method
)
)
if
is_embedding_layer
and
not
linear_method_implements_embedding
:
if
is_embedding_layer
and
not
linear_method_implements_embedding
:
raise
NotImplementedError
(
raise
NotImplementedError
(
f
"The class
{
type
(
linear_method
).
__name__
}
must implement "
f
"The class
{
type
(
linear_method
).
__name__
}
must implement "
"the 'embedding' method, see UnquantizedEmbeddingMethod."
)
"the 'embedding' method, see UnquantizedEmbeddingMethod."
)
self
.
linear_method
:
QuantizeMethodBase
=
linear_method
self
.
linear_method
:
QuantizeMethodBase
=
linear_method
...
@@ -260,53 +278,68 @@ class VocabParallelEmbedding(torch.nn.Module):
...
@@ -260,53 +278,68 @@ class VocabParallelEmbedding(torch.nn.Module):
params_dtype
=
torch
.
get_default_dtype
()
params_dtype
=
torch
.
get_default_dtype
()
# Divide the weight matrix along the vocaburaly dimension.
# Divide the weight matrix along the vocaburaly dimension.
self
.
num_added_embeddings
=
self
.
num_embeddings
-
self
.
org_vocab_size
self
.
num_added_embeddings
=
self
.
num_embeddings
-
self
.
org_vocab_size
self
.
num_embeddings_per_partition
=
divide
(
self
.
num_embeddings_padded
,
self
.
num_embeddings_per_partition
=
divide
(
self
.
tp_size
)
self
.
num_embeddings_padded
,
self
.
tp_size
assert
(
self
.
shard_indices
.
num_elements_padded
==
)
self
.
num_embeddings_per_partition
)
assert
(
self
.
shard_indices
.
num_elements_padded
==
self
.
num_embeddings_per_partition
)
self
.
num_org_embeddings_per_partition
=
(
self
.
num_org_embeddings_per_partition
=
(
self
.
shard_indices
.
org_vocab_end_index
-
self
.
shard_indices
.
org_vocab_end_index
self
.
shard_indices
.
org_vocab_start_index
)
-
self
.
shard_indices
.
org_vocab_start_index
)
self
.
num_added_embeddings_per_partition
=
(
self
.
num_added_embeddings_per_partition
=
(
self
.
shard_indices
.
added_vocab_end_index
-
self
.
shard_indices
.
added_vocab_end_index
self
.
shard_indices
.
added_vocab_start_index
)
-
self
.
shard_indices
.
added_vocab_start_index
)
self
.
linear_method
.
create_weights
(
self
,
self
.
linear_method
.
create_weights
(
self
,
self
.
embedding_dim
,
self
.
embedding_dim
,
[
self
.
num_embeddings_per_partition
],
[
self
.
num_embeddings_per_partition
],
self
.
embedding_dim
,
self
.
embedding_dim
,
self
.
num_embeddings_padded
,
self
.
num_embeddings_padded
,
params_dtype
=
params_dtype
,
params_dtype
=
params_dtype
,
weight_loader
=
self
.
weight_loader
)
weight_loader
=
self
.
weight_loader
,
)
@
classmethod
@
classmethod
def
_get_indices
(
cls
,
vocab_size_padded
:
int
,
org_vocab_size_padded
:
int
,
def
_get_indices
(
vocab_size
:
int
,
org_vocab_size
:
int
,
tp_rank
:
int
,
cls
,
tp_size
:
int
)
->
VocabParallelEmbeddingShardIndices
:
vocab_size_padded
:
int
,
org_vocab_size_padded
:
int
,
vocab_size
:
int
,
org_vocab_size
:
int
,
tp_rank
:
int
,
tp_size
:
int
,
)
->
VocabParallelEmbeddingShardIndices
:
"""Get start and end indices for vocab parallel embedding, following the
"""Get start and end indices for vocab parallel embedding, following the
layout outlined in the class docstring, based on the given tp_rank and
layout outlined in the class docstring, based on the given tp_rank and
tp_size."""
tp_size."""
num_added_embeddings_padded
=
vocab_size_padded
-
org_vocab_size_padded
num_added_embeddings_padded
=
vocab_size_padded
-
org_vocab_size_padded
padded_org_vocab_start_index
,
padded_org_vocab_end_index
=
(
padded_org_vocab_start_index
,
padded_org_vocab_end_index
=
(
vocab_range_from_global_vocab_size
(
org_vocab_size_padded
,
tp_rank
,
vocab_range_from_global_vocab_size
(
org_vocab_size_padded
,
tp_rank
,
tp_size
)
tp_size
)
)
)
padded_added_vocab_start_index
,
padded_added_vocab_end_index
=
(
padded_added_vocab_start_index
,
padded_added_vocab_end_index
=
(
vocab_range_from_global_vocab_size
(
num_added_embeddings_padded
,
vocab_range_from_global_vocab_size
(
tp_rank
,
num_added_embeddings_padded
,
tp_rank
,
tp_size
,
offset
=
org_vocab_size
tp_size
,
)
offset
=
org_vocab_size
)
)
)
# remove padding
# remove padding
org_vocab_start_index
=
min
(
padded_org_vocab_start_index
,
org_vocab_start_index
=
min
(
padded_org_vocab_start_index
,
org_vocab_size
)
org_vocab_size
)
org_vocab_end_index
=
min
(
padded_org_vocab_end_index
,
org_vocab_size
)
org_vocab_end_index
=
min
(
padded_org_vocab_end_index
,
org_vocab_size
)
added_vocab_start_index
=
min
(
padded_added_vocab_start_index
,
added_vocab_start_index
=
min
(
padded_added_vocab_start_index
,
vocab_size
)
vocab_size
)
added_vocab_end_index
=
min
(
padded_added_vocab_end_index
,
vocab_size
)
added_vocab_end_index
=
min
(
padded_added_vocab_end_index
,
vocab_size
)
return
VocabParallelEmbeddingShardIndices
(
return
VocabParallelEmbeddingShardIndices
(
padded_org_vocab_start_index
,
padded_org_vocab_end_index
,
padded_org_vocab_start_index
,
padded_added_vocab_start_index
,
padded_added_vocab_end_index
,
padded_org_vocab_end_index
,
org_vocab_start_index
,
org_vocab_end_index
,
padded_added_vocab_start_index
,
added_vocab_start_index
,
added_vocab_end_index
)
padded_added_vocab_end_index
,
org_vocab_start_index
,
org_vocab_end_index
,
added_vocab_start_index
,
added_vocab_end_index
,
)
def
get_sharded_to_full_mapping
(
self
)
->
Optional
[
List
[
int
]]:
def
get_sharded_to_full_mapping
(
self
)
->
Optional
[
List
[
int
]]:
"""Get a mapping that can be used to reindex the gathered
"""Get a mapping that can be used to reindex the gathered
...
@@ -326,32 +359,49 @@ class VocabParallelEmbedding(torch.nn.Module):
...
@@ -326,32 +359,49 @@ class VocabParallelEmbedding(torch.nn.Module):
added_embeddings
:
List
[
int
]
=
[]
added_embeddings
:
List
[
int
]
=
[]
padding
:
List
[
int
]
=
[]
padding
:
List
[
int
]
=
[]
for
tp_rank
in
range
(
self
.
tp_size
):
for
tp_rank
in
range
(
self
.
tp_size
):
shard_indices
=
self
.
_get_indices
(
self
.
num_embeddings_padded
,
shard_indices
=
self
.
_get_indices
(
self
.
num_embeddings_padded
,
self
.
org_vocab_size_padded
,
self
.
org_vocab_size_padded
,
self
.
num_embeddings
,
self
.
num_embeddings
,
self
.
org_vocab_size
,
tp_rank
,
self
.
org_vocab_size
,
self
.
tp_size
)
tp_rank
,
self
.
tp_size
,
)
range_start
=
self
.
num_embeddings_per_partition
*
tp_rank
range_start
=
self
.
num_embeddings_per_partition
*
tp_rank
range_end
=
self
.
num_embeddings_per_partition
*
(
tp_rank
+
1
)
range_end
=
self
.
num_embeddings_per_partition
*
(
tp_rank
+
1
)
base_embeddings
.
extend
(
base_embeddings
.
extend
(
range
(
range_start
,
range
(
range_start
,
range_start
+
shard_indices
.
num_org_elements
)
range_start
+
shard_indices
.
num_org_elements
)
)
)
padding
.
extend
(
padding
.
extend
(
range
(
range_start
+
shard_indices
.
num_org_elements
,
range
(
range_start
+
shard_indices
.
num_org_elements_padded
))
range_start
+
shard_indices
.
num_org_elements
,
range_start
+
shard_indices
.
num_org_elements_padded
,
)
)
added_embeddings
.
extend
(
added_embeddings
.
extend
(
range
(
range
(
range_start
+
shard_indices
.
num_org_elements_padded
,
range_start
+
shard_indices
.
num_org_elements_padded
,
range_start
+
shard_indices
.
num_org_elements_padded
+
range_start
shard_indices
.
num_added_elements
))
+
shard_indices
.
num_org_elements_padded
+
shard_indices
.
num_added_elements
,
)
)
padding
.
extend
(
padding
.
extend
(
range
(
range
(
range_start
+
shard_indices
.
num_org_elements_padded
+
range_start
shard_indices
.
num_added_elements
,
+
shard_indices
.
num_org_elements_padded
range_start
+
shard_indices
.
num_org_elements_padded
+
+
shard_indices
.
num_added_elements
,
shard_indices
.
num_added_elements_padded
))
range_start
assert
(
range_start
+
shard_indices
.
num_org_elements_padded
+
+
shard_indices
.
num_org_elements_padded
shard_indices
.
num_added_elements_padded
==
range_end
)
+
shard_indices
.
num_added_elements_padded
,
)
)
assert
(
range_start
+
shard_indices
.
num_org_elements_padded
+
shard_indices
.
num_added_elements_padded
==
range_end
)
ret
=
base_embeddings
+
added_embeddings
+
padding
ret
=
base_embeddings
+
added_embeddings
+
padding
assert
len
(
ret
)
==
self
.
num_embeddings_padded
assert
len
(
ret
)
==
self
.
num_embeddings_padded
return
ret
return
ret
...
@@ -385,10 +435,14 @@ class VocabParallelEmbedding(torch.nn.Module):
...
@@ -385,10 +435,14 @@ class VocabParallelEmbedding(torch.nn.Module):
# If param packed on the same dim we are sharding on, then
# If param packed on the same dim we are sharding on, then
# need to adjust offsets of loaded weight by pack_factor.
# need to adjust offsets of loaded weight by pack_factor.
if
packed_dim
is
not
None
and
packed_dim
==
output_dim
:
if
packed_dim
is
not
None
and
packed_dim
==
output_dim
:
packed_factor
=
param
.
packed_factor
if
isinstance
(
packed_factor
=
(
param
,
BasevLLMParameter
)
else
param
.
pack_factor
param
.
packed_factor
assert
loaded_weight
.
shape
[
output_dim
]
==
(
self
.
org_vocab_size
//
if
isinstance
(
param
,
BasevLLMParameter
)
param
.
packed_factor
)
else
param
.
pack_factor
)
assert
loaded_weight
.
shape
[
output_dim
]
==
(
self
.
org_vocab_size
//
param
.
packed_factor
)
start_idx
=
start_idx
//
packed_factor
start_idx
=
start_idx
//
packed_factor
shard_size
=
shard_size
//
packed_factor
shard_size
=
shard_size
//
packed_factor
else
:
else
:
...
@@ -396,23 +450,24 @@ class VocabParallelEmbedding(torch.nn.Module):
...
@@ -396,23 +450,24 @@ class VocabParallelEmbedding(torch.nn.Module):
# Copy the data.
# Copy the data.
loaded_weight
=
loaded_weight
.
narrow
(
output_dim
,
start_idx
,
shard_size
)
loaded_weight
=
loaded_weight
.
narrow
(
output_dim
,
start_idx
,
shard_size
)
param
[:
loaded_weight
.
shape
[
0
]].
data
.
copy_
(
loaded_weight
)
param
[:
loaded_weight
.
shape
[
0
]].
data
.
copy_
(
loaded_weight
)
param
[
loaded_weight
.
shape
[
0
]:].
data
.
fill_
(
0
)
param
[
loaded_weight
.
shape
[
0
]
:].
data
.
fill_
(
0
)
def
forward
(
self
,
input_
):
def
forward
(
self
,
input_
):
if
self
.
tp_size
>
1
:
if
self
.
tp_size
>
1
:
# Build the mask.
# Build the mask.
masked_input
,
input_mask
=
get_masked_input_and_mask
(
masked_input
,
input_mask
=
get_masked_input_and_mask
(
input_
,
self
.
shard_indices
.
org_vocab_start_index
,
input_
,
self
.
shard_indices
.
org_vocab_start_index
,
self
.
shard_indices
.
org_vocab_end_index
,
self
.
shard_indices
.
org_vocab_end_index
,
self
.
shard_indices
.
num_org_vocab_padding
,
self
.
shard_indices
.
num_org_vocab_padding
,
self
.
shard_indices
.
added_vocab_start_index
,
self
.
shard_indices
.
added_vocab_start_index
,
self
.
shard_indices
.
added_vocab_end_index
)
self
.
shard_indices
.
added_vocab_end_index
,
)
else
:
else
:
masked_input
=
input_
masked_input
=
input_
# Get the embeddings.
# Get the embeddings.
output_parallel
=
self
.
linear_method
.
embedding
(
self
,
output_parallel
=
self
.
linear_method
.
embedding
(
self
,
masked_input
.
long
())
masked_input
.
long
())
# Mask the output embedding.
# Mask the output embedding.
if
self
.
tp_size
>
1
:
if
self
.
tp_size
>
1
:
output_parallel
.
masked_fill_
(
input_mask
.
unsqueeze
(
-
1
),
0
)
output_parallel
.
masked_fill_
(
input_mask
.
unsqueeze
(
-
1
),
0
)
...
@@ -426,9 +481,9 @@ class VocabParallelEmbedding(torch.nn.Module):
...
@@ -426,9 +481,9 @@ class VocabParallelEmbedding(torch.nn.Module):
s
=
f
"num_embeddings=
{
self
.
num_embeddings_per_partition
}
"
s
=
f
"num_embeddings=
{
self
.
num_embeddings_per_partition
}
"
s
+=
f
", embedding_dim=
{
self
.
embedding_dim
}
"
s
+=
f
", embedding_dim=
{
self
.
embedding_dim
}
"
s
+=
f
", org_vocab_size=
{
self
.
org_vocab_size
}
"
s
+=
f
", org_vocab_size=
{
self
.
org_vocab_size
}
"
s
+=
f
'
, num_embeddings_padded=
{
self
.
num_embeddings_padded
}
'
s
+=
f
"
, num_embeddings_padded=
{
self
.
num_embeddings_padded
}
"
if
self
.
enable_tp
:
if
self
.
enable_tp
:
s
+=
f
'
, tp_size=
{
self
.
tp_size
}
'
s
+=
f
"
, tp_size=
{
self
.
tp_size
}
"
return
s
return
s
...
@@ -448,7 +503,8 @@ class ParallelLMHead(VocabParallelEmbedding):
...
@@ -448,7 +503,8 @@ class ParallelLMHead(VocabParallelEmbedding):
padding_size: padding size for the vocabulary.
padding_size: padding size for the vocabulary.
"""
"""
def
__init__
(
self
,
def
__init__
(
self
,
num_embeddings
:
int
,
num_embeddings
:
int
,
embedding_dim
:
int
,
embedding_dim
:
int
,
bias
:
bool
=
False
,
bias
:
bool
=
False
,
...
@@ -456,19 +512,29 @@ class ParallelLMHead(VocabParallelEmbedding):
...
@@ -456,19 +512,29 @@ class ParallelLMHead(VocabParallelEmbedding):
org_num_embeddings
:
Optional
[
int
]
=
None
,
org_num_embeddings
:
Optional
[
int
]
=
None
,
padding_size
:
int
=
DEFAULT_VOCAB_PADDING_SIZE
,
padding_size
:
int
=
DEFAULT_VOCAB_PADDING_SIZE
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
prefix
:
str
=
""
):
prefix
:
str
=
""
,
super
().
__init__
(
num_embeddings
,
embedding_dim
,
params_dtype
,
):
org_num_embeddings
,
padding_size
,
quant_config
,
super
().
__init__
(
prefix
)
num_embeddings
,
embedding_dim
,
params_dtype
,
org_num_embeddings
,
padding_size
,
quant_config
,
prefix
,
)
self
.
quant_config
=
quant_config
self
.
quant_config
=
quant_config
if
bias
:
if
bias
:
self
.
bias
=
Parameter
(
self
.
bias
=
Parameter
(
torch
.
empty
(
self
.
num_embeddings_per_partition
,
torch
.
empty
(
self
.
num_embeddings_per_partition
,
dtype
=
params_dtype
)
dtype
=
params_dtype
))
)
set_weight_attrs
(
self
.
bias
,
{
set_weight_attrs
(
self
.
bias
,
{
"output_dim"
:
0
,
"output_dim"
:
0
,
"weight_loader"
:
self
.
weight_loader
,
"weight_loader"
:
self
.
weight_loader
,
})
},
)
else
:
else
:
self
.
register_parameter
(
"bias"
,
None
)
self
.
register_parameter
(
"bias"
,
None
)
...
...
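The hunks above only re-wrap this file for black; the logic is untouched. For readers new to the sharding scheme, the following standalone sketch (mine, not part of the commit) shows the core trick of get_masked_input_and_mask in the simplest case of one original vocab with no added tokens and no padding: token ids owned by other ranks are clamped to row 0 and remembered in a mask so their embeddings can be zeroed after the lookup.

import torch

# Illustrative ranges; a real shard gets these from VocabParallelEmbeddingShardIndices.
org_start, org_end = 1000, 2000  # this rank owns vocab ids [1000, 2000)
input_ = torch.tensor([5, 1000, 1999, 2500])

mask = (input_ >= org_start) & (input_ < org_end)
local = mask * (input_ - org_start)  # out-of-shard ids collapse to local row 0
print(local)   # tensor([  0,   0, 999,   0])
print(~mask)   # rows to zero out after the lookup: tensor([ True, False, False,  True])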
python/sglang/srt/managers/io_struct.py
View file @
c77c1e05
...
@@ -86,8 +86,10 @@ class GenerateReqInput:
                 self.parallel_sample_num = self.sampling_params.get("n", 1)
             else:  # isinstance(self.sampling_params, list):
                 self.parallel_sample_num = self.sampling_params[0].get("n", 1)
-                assert all(self.parallel_sample_num == sampling_params.get("n", 1) for sampling_params in self.sampling_params), (
-                    "The parallel_sample_num should be the same for all samples in sample params.")
+                assert all(
+                    self.parallel_sample_num == sampling_params.get("n", 1)
+                    for sampling_params in self.sampling_params
+                ), "The parallel_sample_num should be the same for all samples in sample params."

             if self.parallel_sample_num > 1 and self.is_single:
                 self.is_single = False
...
python/sglang/srt/managers/schedule_batch.py
View file @
c77c1e05
...
@@ -911,8 +911,7 @@ class ScheduleBatch:
         keep_indices = [
             i
             for i in range(len(self.reqs))
-            if not self.reqs[i].finished()
-            and self.reqs[i] is not being_chunked_req
+            if not self.reqs[i].finished() and self.reqs[i] is not being_chunked_req
         ]

         if keep_indices is None or len(keep_indices) == 0:
...
@@ -1043,6 +1042,7 @@ class ScheduleBatch:
         for req in self.reqs:
             req.started_time = time.time()

+
 @dataclasses.dataclass
 class ModelWorkerBatch:
     # The batch id
...
python/sglang/srt/managers/scheduler.py
View file @
c77c1e05
...
@@ -566,9 +566,7 @@ class Scheduler:
             and not self.last_batch.is_empty()
         ):
             if self.being_chunked_req:
-                self.last_batch.filter_batch(
-                    being_chunked_req=self.being_chunked_req
-                )
+                self.last_batch.filter_batch(being_chunked_req=self.being_chunked_req)
                 self.tree_cache.cache_unfinished_req(self.being_chunked_req)
                 # Inflight request keeps its rid but will get a new req_pool_idx.
                 self.req_to_token_pool.free(self.being_chunked_req.req_pool_idx)
...
@@ -628,9 +626,7 @@ class Scheduler:
         has_inflight = self.being_chunked_req is not None
         if has_inflight:
             self.being_chunked_req.init_next_round_input()
-            self.being_chunked_req = adder.add_inflight_req(
-                self.being_chunked_req
-            )
+            self.being_chunked_req = adder.add_inflight_req(self.being_chunked_req)

         if self.lora_paths:
             lora_set = (
...
@@ -813,7 +809,8 @@ class Scheduler:
         embeddings = self.tp_worker.forward_batch_embedding(model_worker_batch)
         ret = embeddings, model_worker_batch.bid
         return ret

     def get_stats(self, batch: ScheduleBatch):
         # TODO: get stats for chunked prefill
         now = time.time()
...
@@ -829,8 +826,8 @@ class Scheduler:
         # set stats from prefill
         if self.stats is not None:
             # new_seq=self.stats.new_seq
             cache_hit_rate = self.stats.cache_hit_rate
             token_usage = self.stats.token_usage
         # Iteration stats
         num_prompt_tokens_iter = 0
         num_generation_tokens_iter = 0
...
@@ -851,15 +848,19 @@ class Scheduler:
         # _, next_token_ids, _ = result
         if batch is not None:
             num_generation_tokens_iter = len(batch.output_ids)
-            gen_throughput = round(num_generation_tokens_iter / (now - self.last_stats_tic), 2)
+            gen_throughput = round(
+                num_generation_tokens_iter / (now - self.last_stats_tic), 2
+            )

             for i, req in enumerate(batch.reqs):
                 # NOTE: Batch forward mode is extend befor start decode,
                 if batch.forward_mode.is_extend():
-                    num_prompt_tokens_iter = len(batch.input_ids) + sum(batch.prefix_lens)
+                    num_prompt_tokens_iter = len(batch.input_ids) + sum(
+                        batch.prefix_lens
+                    )
                     time_to_first_tokens_iter.append(now - req.started_time)
                 else:
                     time_per_output_tokens_iter.append(now - self.last_stats_tic)

                 if req.finished():
                     time_e2e_requests.append(now - req.created_time)
...
@@ -869,7 +870,8 @@ class Scheduler:
                     finished_reason_requests.append(
                         req.finished_reason.to_json()
                         if req.finished_reason is not None
-                        else None)
+                        else None
+                    )

         return Stats(
             new_seq=new_seq,
...
@@ -893,7 +895,7 @@ class Scheduler:
             max_running_requests=self.max_running_requests,
         )

     def log_stats(self, stats: Stats):
         self.metrics_collector.log_stats(stats)

     def process_batch_result(self, batch: ScheduleBatch, result):
...
@@ -1003,9 +1005,7 @@ class Scheduler:
             if req.is_retracted:
                 continue

-            if self.server_args.enable_overlap_schedule and (
-                    req.finished()):
+            if self.server_args.enable_overlap_schedule and (
+                req.finished()
+            ):
                 self.token_to_kv_pool.free(batch.out_cache_loc[i : i + 1])
                 continue
...
@@ -1031,7 +1031,10 @@ class Scheduler:
         self.token_to_kv_pool.free_group_end()

         self.forward_ct_decode = (self.forward_ct_decode + 1) % (1 << 30)
-        if self.tp_rank == 0 and self.forward_ct_decode % self.server_args.decode_log_interval == 0:
+        if (
+            self.tp_rank == 0
+            and self.forward_ct_decode % self.server_args.decode_log_interval == 0
+        ):
             self.print_decode_stats()

     def add_logprob_return_values(
...
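Most of these scheduler hunks are pure re-wrapping, but the get_stats hunk is a convenient place to restate the throughput arithmetic: generation throughput is the tokens produced this iteration divided by the wall-clock time since the last stats tick. A toy calculation with made-up numbers:

import time

last_stats_tic = time.time()
time.sleep(0.05)                  # stand-in for one decode iteration
num_generation_tokens_iter = 128  # len(batch.output_ids) in the scheduler
now = time.time()

gen_throughput = round(num_generation_tokens_iter / (now - last_stats_tic), 2)
print(f"{gen_throughput} tok/s")  # roughly 2560 tok/s for these numbers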
python/sglang/srt/managers/tokenizer_manager.py
View file @
c77c1e05
...
@@ -215,7 +215,7 @@ class TokenizerManager:
                 logprob_start_len,
                 top_logprobs_num,
                 obj.stream,
-                obj.lora_path
+                obj.lora_path,
             )
         elif isinstance(obj, EmbeddingReqInput):
             tokenized_obj = TokenizedEmbeddingReqInput(
...
@@ -290,7 +290,9 @@ class TokenizerManager:
         # Tokenize all requests
         objs = [obj[i] for i in range(batch_size)]
-        tokenized_objs = await asyncio.gather(*(self._tokenize_one_request(obj) for obj in objs))
+        tokenized_objs = await asyncio.gather(
+            *(self._tokenize_one_request(obj) for obj in objs)
+        )

         # Cache the common prefix for parallel sampling
         for i in range(batch_size):
...
@@ -322,7 +324,9 @@ class TokenizerManager:
         rid_to_index = {rid: i for i, rid in enumerate(rids)}
         task_map = {asyncio.create_task(gen.__anext__()): gen for gen in generators}
         while task_map:
-            done, _ = await asyncio.wait(task_map.keys(), return_when=asyncio.FIRST_COMPLETED)
+            done, _ = await asyncio.wait(
+                task_map.keys(), return_when=asyncio.FIRST_COMPLETED
+            )

             for task in done:
                 gen = task_map.pop(task)
...
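The task_map hunk reformatted above implements a common pattern: drain several async generators concurrently, handling whichever yields first. A self-contained sketch of the same pattern, with toy generators standing in for the sglang ones:

import asyncio


async def gen(name: str, delay: float):
    for i in range(3):
        await asyncio.sleep(delay)
        yield f"{name}:{i}"


async def main():
    generators = [gen("a", 0.03), gen("b", 0.01)]
    # One pending __anext__ task per generator, mapped back to its source.
    task_map = {asyncio.create_task(g.__anext__()): g for g in generators}
    while task_map:
        done, _ = await asyncio.wait(task_map.keys(), return_when=asyncio.FIRST_COMPLETED)
        for task in done:
            g = task_map.pop(task)
            try:
                print(task.result())
            except StopAsyncIteration:
                continue  # this generator is exhausted
            task_map[asyncio.create_task(g.__anext__())] = g  # schedule its next item


asyncio.run(main())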
python/sglang/srt/metrics/metrics_collector.py
View file @
c77c1e05
...
@@ -130,27 +130,65 @@ class Metrics:
         self.counter_prompt_tokens = Counter(
             name="sglang:prompt_tokens_total",
             documentation="Number of prefill tokens processed.",
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )

         self.counter_generation_tokens = Counter(
             name="sglang:generation_tokens_total",
             documentation="Number of generation tokens processed.",
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )

         self.histogram_time_to_first_token = Histogram(
             name="sglang:time_to_first_token_seconds",
             documentation="Histogram of time to first token in seconds.",
             labelnames=labelnames,
             buckets=[
-                0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
-                0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 15.0, 20.0, 25.0, 30.0
-            ])
+                0.001,
+                0.005,
+                0.01,
+                0.02,
+                0.04,
+                0.06,
+                0.08,
+                0.1,
+                0.25,
+                0.5,
+                0.75,
+                1.0,
+                2.5,
+                5.0,
+                7.5,
+                10.0,
+                15.0,
+                20.0,
+                25.0,
+                30.0,
+            ],
+        )

         self.histogram_time_per_output_token = Histogram(
             name="sglang:time_per_output_token_seconds",
             documentation="Histogram of time per output token in seconds.",
             labelnames=labelnames,
             buckets=[
-                0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.04, 0.05, 0.075, 0.1,
-                0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5
-            ])
+                0.005,
+                0.01,
+                0.015,
+                0.02,
+                0.025,
+                0.03,
+                0.04,
+                0.05,
+                0.075,
+                0.1,
+                0.15,
+                0.2,
+                0.3,
+                0.4,
+                0.5,
+                0.75,
+                1.0,
+                2.5,
+            ],
+        )

         # Request Stats
         #   Metadata
...
@@ -245,14 +283,19 @@ class PrometheusMetricsCollector(MetricsCollector):
             stats.num_generation_tokens_requests,
         )

-        self._log_counter(self.metrics.counter_prompt_tokens,
-                          stats.num_prompt_tokens_iter)
-        self._log_counter(self.metrics.counter_generation_tokens,
-                          stats.num_generation_tokens_iter)
-        self._log_histogram(self.metrics.histogram_time_to_first_token,
-                            stats.time_to_first_tokens_iter)
-        self._log_histogram(self.metrics.histogram_time_per_output_token,
-                            stats.time_per_output_tokens_iter)
+        self._log_counter(
+            self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter
+        )
+        self._log_counter(
+            self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter
+        )
+        self._log_histogram(
+            self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter
+        )
+        self._log_histogram(
+            self.metrics.histogram_time_per_output_token,
+            stats.time_per_output_tokens_iter,
+        )
         # self._log_gauge(self.metrics.gpu_cache_usage_sys, stats.gpu_cache_usage_sys)
         self._log_gauge(self.metrics.num_running_sys, stats.num_running_req)
...
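For context on how metrics like these are fed, here is a minimal prometheus_client sketch (the metric names and label values below are made up, not sglang's): counters accumulate per-iteration token counts via inc(), and histograms record individual latency samples via observe().

from prometheus_client import Counter, Histogram, start_http_server

counter_prompt_tokens = Counter(
    name="demo:prompt_tokens_total",
    documentation="Number of prefill tokens processed.",
    labelnames=["model_name"],
)
histogram_ttft = Histogram(
    name="demo:time_to_first_token_seconds",
    documentation="Histogram of time to first token in seconds.",
    labelnames=["model_name"],
    buckets=[0.001, 0.01, 0.1, 1.0, 10.0],
)

counter_prompt_tokens.labels(model_name="llama").inc(512)  # one iteration's prefill tokens
histogram_ttft.labels(model_name="llama").observe(0.042)   # one request's TTFT sample

start_http_server(9091)  # metrics then scrapeable at http://localhost:9091/metrics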
python/sglang/srt/models/gpt2.py
View file @
c77c1e05
...
@@ -28,7 +28,7 @@ from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

-#from sglang.srt.layers.activation import get_act_fn
+# from sglang.srt.layers.activation import get_act_fn
 from sglang.srt.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
...
@@ -47,15 +47,14 @@ class GPT2Attention(nn.Module):
         self,
         layer_id: int,
         config: GPT2Config,
         cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ):
         super().__init__()
         self.hidden_size = config.hidden_size
         total_num_heads = config.num_attention_heads
-        tensor_model_parallel_world_size = (
-            get_tensor_model_parallel_world_size())
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
         assert total_num_heads % tensor_model_parallel_world_size == 0
         self.num_heads = total_num_heads // tensor_model_parallel_world_size
         self.head_dim = self.hidden_size // total_num_heads
...
@@ -76,11 +75,13 @@ class GPT2Attention(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.c_proj",
         )
-        self.attn = RadixAttention(self.num_heads,
-                                   self.head_dim,
-                                   scaling=self.scale,
-                                   num_kv_heads=total_num_heads,
-                                   layer_id=layer_id)
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            scaling=self.scale,
+            num_kv_heads=total_num_heads,
+            layer_id=layer_id,
+        )

     def forward(
         self,
...
@@ -119,10 +120,14 @@ class GPT2MLP(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.c_proj",
         )
-        self.act = get_act_fn(config.activation_function, quant_config,
-                              intermediate_size)
+        self.act = get_act_fn(
+            config.activation_function, quant_config, intermediate_size
+        )

-    def forward(self, hidden_states: torch.Tensor,) -> torch.Tensor:
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
         hidden_states, _ = self.c_fc(hidden_states)
         hidden_states = self.act(hidden_states)
         hidden_states, _ = self.c_proj(hidden_states)
...
@@ -135,27 +140,20 @@ class GPT2Block(nn.Module):
         self,
         layer_id: int,
         config: GPT2Config,
         cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ):
         super().__init__()
         hidden_size = config.hidden_size
-        inner_dim = (config.n_inner if config.n_inner is not None else 4 *
-                     hidden_size)
+        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size

         self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.attn = GPT2Attention(layer_id,
-                                  config,
-                                  cache_config,
-                                  quant_config,
-                                  prefix=f"{prefix}.attn")
+        self.attn = GPT2Attention(
+            layer_id, config, cache_config, quant_config, prefix=f"{prefix}.attn"
+        )
         self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.mlp = GPT2MLP(inner_dim,
-                           config,
-                           quant_config,
-                           prefix=f"{prefix}.mlp")
+        self.mlp = GPT2MLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp")

     def forward(
         self,
...
@@ -179,13 +177,12 @@ class GPT2Block(nn.Module):
         return hidden_states


 class GPT2Model(nn.Module):
-
     def __init__(
         self,
         config: GPT2Config,
         cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ):
...
@@ -229,16 +226,15 @@ class GPT2LMHeadModel(nn.Module):
     def __init__(
         self,
         config: GPT2Config,
         cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
         self.config = config
         self.quant_config = quant_config
-        self.transformer = GPT2Model(config,
-                                     cache_config,
-                                     quant_config,
-                                     prefix="transformer")
+        self.transformer = GPT2Model(
+            config, cache_config, quant_config, prefix="transformer"
+        )
         self.lm_head = self.transformer.wte
         self.logits_processor = LogitsProcessor(config)
...
@@ -254,8 +250,6 @@ class GPT2LMHeadModel(nn.Module):
             input_ids, hidden_states, self.lm_head.weight, forward_batch
         )

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         for name, loaded_weight in weights:
...
@@ -280,8 +274,8 @@ class GPT2LMHeadModel(nn.Module):
                 if not name.endswith(".weight"):
                     continue
                 loaded_weight = loaded_weight.t()
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)


 EntryClass = GPT2LMHeadModel
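A quick worked example of the head-splitting arithmetic in GPT2Attention above (the model numbers are GPT-2 small's; the TP degree is hypothetical): attention heads must divide evenly across tensor-parallel ranks, while the per-head dimension is independent of TP.

hidden_size = 768
total_num_heads = 12
tensor_model_parallel_world_size = 4  # hypothetical TP degree

assert total_num_heads % tensor_model_parallel_world_size == 0
num_heads = total_num_heads // tensor_model_parallel_world_size  # 3 heads per rank
head_dim = hidden_size // total_num_heads                        # 64, same on every rank
print(num_heads, head_dim)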
python/sglang/srt/server.py
View file @
c77c1e05
...
@@ -419,6 +419,7 @@ def launch_engine(
     for i in range(len(scheduler_pipe_readers)):
         scheduler_pipe_readers[i].recv()

+
 def add_prometheus_middleware(app: FastAPI):
     # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.1/vllm/entrypoints/openai/api_server.py#L216
     from prometheus_client import CollectorRegistry, make_asgi_app, multiprocess
...
@@ -490,6 +491,7 @@ def launch_server(
     finally:
         t.join()

+
 def _set_prometheus_env():
     # Set prometheus multiprocess directory
     # sglang uses prometheus multiprocess mode
...
@@ -506,6 +508,7 @@ def _set_prometheus_env():
     os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name
     logger.debug(f"PROMETHEUS_MULTIPROC_DIR: {os.environ['PROMETHEUS_MULTIPROC_DIR']}")

+
 def _set_envs_and_config(server_args: ServerArgs):
     # Set global environments
     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
...
@@ -763,8 +766,8 @@ class Engine:
         # runtime server default log level is log
         # offline engine works in scripts, so we set it to error
-        if 'log_level' not in kwargs:
-            kwargs['log_level'] = 'error'
+        if "log_level" not in kwargs:
+            kwargs["log_level"] = "error"

         server_args = ServerArgs(*args, **kwargs)
         launch_engine(server_args=server_args)
...
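The server hunks touch the multiprocess Prometheus setup. For reference, the standard prometheus_client wiring that add_prometheus_middleware builds on looks roughly like this (the FastAPI app here is illustrative, not sglang's exact code): the environment variable must be set before worker processes create any metrics, and a MultiProcessCollector aggregates the per-process state at scrape time.

import os
import tempfile

os.environ["PROMETHEUS_MULTIPROC_DIR"] = tempfile.mkdtemp()  # set before metrics exist

from fastapi import FastAPI
from prometheus_client import CollectorRegistry, make_asgi_app, multiprocess

app = FastAPI()
registry = CollectorRegistry()
multiprocess.MultiProcessCollector(registry)  # aggregates per-process .db files
app.mount("/metrics", make_asgi_app(registry=registry))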