change / sglang · Commits · c77c1e05

Commit c77c1e05 (unverified)
fix black in pre-commit (#1940)

Authored Nov 07, 2024 by Chayenne; committed by GitHub on Nov 08, 2024.
Parent: dca87ec3
Changes: 29 files in total; page 1 of 2 shows 20 changed files with 617 additions and 494 deletions (+617 -494).
.pre-commit-config.yaml  +3 -3
docs/backend/native_api.ipynb  +61 -64
docs/backend/offline_engine_api.ipynb  +24 -24
docs/backend/openai_api_completions.ipynb  +56 -53
docs/backend/openai_api_embeddings.ipynb  +24 -24
docs/backend/openai_api_vision.ipynb  +24 -24
docs/conf.py  +5 -3
docs/start/send_request.ipynb  +35 -37
examples/frontend_language/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb  +26 -19
examples/runtime/engine/input_ids.py  +1 -0
python/sglang/srt/configs/model_config.py  +4 -2
python/sglang/srt/layers/quantization/base_config.py  +4 -6
python/sglang/srt/layers/vocab_parallel_embedding.py  +214 -148
python/sglang/srt/managers/io_struct.py  +4 -2
python/sglang/srt/managers/schedule_batch.py  +2 -2
python/sglang/srt/managers/scheduler.py  +25 -22
python/sglang/srt/managers/tokenizer_manager.py  +11 -7
python/sglang/srt/metrics/metrics_collector.py  +59 -16
python/sglang/srt/models/gpt2.py  +30 -36
python/sglang/srt/server.py  +5 -2
.pre-commit-config.yaml  +3 -3

@@ -30,6 +30,6 @@ repos:
     rev: 24.10.0
     hooks:
       - id: black
-        additional_dependencies: ['.[jupyter]']
-        types: [python, jupyter]
-        types_or: [python, jupyter]
+        types: [python]
+      - id: black-jupyter
+        types: [jupyter]
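
The old configuration ran the plain black hook over both Python files and notebooks (via `additional_dependencies: ['.[jupyter]']` plus `types`/`types_or`); the fix gives notebooks their own `black-jupyter` hook. Black's reformatting is also what drives most of the churn in the files below. As a quick illustration (not part of the commit), black's real `format_str`/`Mode` API shows the kind of rewrite involved; the snippet itself is ours:

import black

# A short dict literal spread over four lines, as in the notebooks below.
src = '''data = {
    "model": "Skywork/Skywork-Reward-Llama-3.1-8B-v0.2",
    "text": prompts
}
'''
# black collapses it onto one line; format_str formats source text without
# executing it, so the undefined name `prompts` is fine here.
print(black.format_str(src, mode=black.Mode()))
# -> data = {"model": "Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", "text": prompts}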

docs/backend/native_api.ipynb  +61 -64

Every hunk except one refreshes the notebook's recorded execution timestamps: the "iopub.execute_input", "iopub.status.busy", "iopub.status.idle", and "shell.execute_reply" metadata fields move from the 2024-11-05 run (05:08-05:09 UTC) to the 2024-11-07 run (18:44-18:46 UTC). The first hunk is representative; the same substitution repeats at @@ -73, -101, -132, -164, -183, -211, -241, -271, -296, -335, -360, -392, -419, and -460.

@@ -34,10 +34,10 @@
    "execution_count": null,
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2024-11-05T05:08:08.536886Z",
-     "iopub.status.busy": "2024-11-05T05:08:08.536763Z",
-     "iopub.status.idle": "2024-11-05T05:08:34.725831Z",
-     "shell.execute_reply": "2024-11-05T05:08:34.725316Z"
+     "iopub.execute_input": "2024-11-07T18:44:42.063503Z",
+     "iopub.status.busy": "2024-11-07T18:44:42.063379Z",
+     "iopub.status.idle": "2024-11-07T18:45:07.255300Z",
+     "shell.execute_reply": "2024-11-07T18:45:07.254547Z"
     }
    },
    "outputs": [],

The one substantive change collapses a short dict literal in the reward-model example:

@@ -445,10 +445,7 @@
 "prompts = tokenizer.apply_chat_template(CONVS, tokenize=False)\n",
 "\n",
 "url = \"http://localhost:30030/classify\"\n",
-"data = {\n",
-"    \"model\": \"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\", \n",
-"    \"text\": prompts\n",
-"}\n",
+"data = {\"model\": \"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\", \"text\": prompts}\n",
 "\n",
 "responses = requests.post(url, json=data).json()\n",
 "for response in responses:\n",
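
For context, the cell behind the @@ -445 hunk sends a reward-model classification request to a running SGLang server. A hedged, self-contained sketch (the URL and model name come from the hunk; the prompt list stands in for the notebook's chat-template output):

import requests

# Stand-in for the notebook's tokenizer.apply_chat_template(CONVS, tokenize=False).
prompts = ["<conversation rendered with the reward model's chat template>"]

url = "http://localhost:30030/classify"
data = {"model": "Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", "text": prompts}

responses = requests.post(url, json=data).json()
for response in responses:
    print(response)  # one scoring entry per input conversation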

docs/backend/offline_engine_api.ipynb  +24 -24

Every hunk (@@ -35, -64, -99, -137, -179, and -216) refreshes execution timestamps only, from the 2024-11-05 run (05:21-05:22 UTC) to the 2024-11-07 run (18:46 UTC), e.g.:

@@ -35,10 +35,10 @@
-     "iopub.execute_input": "2024-11-05T05:21:27.503026Z",
-     "iopub.status.busy": "2024-11-05T05:21:27.502741Z",
-     "iopub.status.idle": "2024-11-05T05:21:49.554631Z",
-     "shell.execute_reply": "2024-11-05T05:21:49.553690Z"
+     "iopub.execute_input": "2024-11-07T18:46:04.789536Z",
+     "iopub.status.busy": "2024-11-07T18:46:04.789418Z",
+     "iopub.status.idle": "2024-11-07T18:46:27.038169Z",
+     "shell.execute_reply": "2024-11-07T18:46:27.037540Z"

docs/backend/openai_api_completions.ipynb  +56 -53

Timestamp-only hunks at @@ -39, -79, -119, -165, -198, -234, -273, -322, -365, -427, -482, -565, and -660 move the execution metadata from the 2024-11-05 run (05:09-05:11 UTC) to the 2024-11-07 run (18:46-18:48 UTC), e.g.:

@@ -39,10 +39,10 @@
-     "iopub.execute_input": "2024-11-05T05:09:30.637832Z",
-     "iopub.status.busy": "2024-11-05T05:09:30.637709Z",
-     "iopub.status.idle": "2024-11-05T05:09:58.830158Z",
-     "shell.execute_reply": "2024-11-05T05:09:58.829395Z"
+     "iopub.execute_input": "2024-11-07T18:46:54.813876Z",
+     "iopub.status.busy": "2024-11-07T18:46:54.813741Z",
+     "iopub.status.idle": "2024-11-07T18:47:24.015527Z",
+     "shell.execute_reply": "2024-11-07T18:47:24.014987Z"

The one code change splits an over-long message dict in the JSON-mode example:

@@ -297,7 +297,10 @@
 "response = client.chat.completions.create(\n",
 "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
 "    messages=[\n",
-"        {\"role\": \"user\", \"content\": \"Give me the information of the capital of France in the JSON format.\"},\n",
+"        {\n",
+"            \"role\": \"user\",\n",
+"            \"content\": \"Give me the information of the capital of France in the JSON format.\",\n",
+"        },\n",
 "    ],\n",
 "    temperature=0,\n",
 "    max_tokens=128,\n",
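
The reformatted request above is runnable as-is against a local SGLang server. A hedged end-to-end version (the base URL and api_key are assumptions for a locally launched server):

import openai

client = openai.Client(base_url="http://localhost:30000/v1", api_key="None")

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    messages=[
        {
            "role": "user",
            "content": "Give me the information of the capital of France in the JSON format.",
        },
    ],
    temperature=0,
    max_tokens=128,
)
print(response.choices[0].message.content)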

docs/backend/openai_api_embeddings.ipynb  +24 -24

Every hunk (@@ -35, -72, -106, -140, -176, and -208) refreshes execution timestamps only, from the 2024-11-05 run (05:22 UTC) to the 2024-11-07 run (18:48 UTC), e.g.:

@@ -35,10 +35,10 @@
-     "iopub.execute_input": "2024-11-05T05:22:17.227174Z",
-     "iopub.status.busy": "2024-11-05T05:22:17.226952Z",
-     "iopub.status.idle": "2024-11-05T05:22:42.445791Z",
-     "shell.execute_reply": "2024-11-05T05:22:42.444980Z"
+     "iopub.execute_input": "2024-11-07T18:48:21.128020Z",
+     "iopub.status.busy": "2024-11-07T18:48:21.127898Z",
+     "iopub.status.idle": "2024-11-07T18:48:45.310371Z",
+     "shell.execute_reply": "2024-11-07T18:48:45.309469Z"

docs/backend/openai_api_vision.ipynb  +24 -24

Every hunk (@@ -39, -78, -129, -176, -227, and -276) refreshes execution timestamps only, from the 2024-11-05 run (05:22-05:23 UTC) to the 2024-11-07 run (18:43-18:44 UTC), e.g.:

@@ -39,10 +39,10 @@
-     "iopub.execute_input": "2024-11-05T05:22:49.320999Z",
-     "iopub.status.busy": "2024-11-05T05:22:49.320880Z",
-     "iopub.status.idle": "2024-11-05T05:23:21.537478Z",
-     "shell.execute_reply": "2024-11-05T05:23:21.536956Z"
+     "iopub.execute_input": "2024-11-07T18:43:47.311708Z",
+     "iopub.status.busy": "2024-11-07T18:43:47.311517Z",
+     "iopub.status.idle": "2024-11-07T18:44:18.512576Z",
+     "shell.execute_reply": "2024-11-07T18:44:18.511909Z"

docs/conf.py  +5 -3

@@ -31,7 +31,7 @@ extensions = [
 ]

 nbsphinx_allow_errors = True
-nbsphinx_execute = 'never'
+nbsphinx_execute = "never"
 autosectionlabel_prefix_document = True
 nbsphinx_allow_directives = True

@@ -49,7 +49,7 @@ myst_enable_extensions = [
 myst_heading_anchors = 3

-nbsphinx_kernel_name = 'python3'
+nbsphinx_kernel_name = "python3"
 nbsphinx_execute_arguments = [
     "--InlineBackend.figure_formats={'svg', 'pdf'}",
     "--InlineBackend.rc={'figure.dpi': 96}",

@@ -130,8 +130,10 @@ html_context = {
 html_static_path = ["_static"]
 html_css_files = ["css/custom_log.css"]

+
 def setup(app):
-    app.add_css_file('css/custom_log.css')
+    app.add_css_file("css/custom_log.css")
+

 myst_enable_extensions = [
     "dollarmath",

docs/start/send_request.ipynb  +35 -37

Timestamp-only hunks at @@ -33, -70, -101, -136, -171, -214, -250, and -290 move the execution metadata from the 2024-11-05 run (05:11 UTC) to the 2024-11-07 run (18:48-18:49 UTC), e.g.:

@@ -33,10 +33,10 @@
-     "iopub.execute_input": "2024-11-05T05:11:10.680191Z",
-     "iopub.status.busy": "2024-11-05T05:11:10.679710Z",
-     "iopub.status.idle": "2024-11-05T05:11:39.882385Z",
-     "shell.execute_reply": "2024-11-05T05:11:39.881827Z"
+     "iopub.execute_input": "2024-11-07T18:48:52.032229Z",
+     "iopub.status.busy": "2024-11-07T18:48:52.032105Z",
+     "iopub.status.idle": "2024-11-07T18:49:20.226042Z",
+     "shell.execute_reply": "2024-11-07T18:49:20.225562Z"

Three hunks reformat code cells:

@@ -49,7 +49,7 @@
 ")\n",
 "\n",
 "server_process = execute_shell_command(\n",
-"\"\"\"\n",
+"    \"\"\"\n",
 "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
 "--port 30000 --host 0.0.0.0\n",
 "\"\"\"\n",

@@ -115,9 +115,7 @@
 "\n",
 "data = {\n",
 "    \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
-"    \"messages\": [\n",
-"        {\"role\": \"user\", \"content\": \"What is the capital of France?\"}\n",
-"    ]\n",
+"    \"messages\": [{\"role\": \"user\", \"content\": \"What is the capital of France?\"}],\n",
 "}\n",
 "\n",
 "response = requests.post(url, json=data)\n",

@@ -197,7 +195,7 @@
 "# Handle the streaming output\n",
 "for chunk in response:\n",
 "    if chunk.choices[0].delta.content:\n",
-"        print(chunk.choices[0].delta.content, end='', flush=True)"
+"        print(chunk.choices[0].delta.content, end=\"\", flush=True)"
 ]
},
{
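
The streaming cell touched by @@ -197 consumes an OpenAI-style stream. A hedged end-to-end sketch, with the same local-server assumptions as the earlier example:

import openai

client = openai.Client(base_url="http://localhost:30000/v1", api_key="None")

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    stream=True,
)

# Handle the streaming output; delta.content is None for some chunks
# (role-only or final chunks), hence the guard.
for chunk in response:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)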

examples/frontend_language/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb  +26 -19

All of these hunks are black-style fixes. (The hunk at @@ -71,7 +71,7 changes the `"from typing import List\n"` line in whitespace only.)

@@ -80,7 +80,7 @@
 "if not os.path.exists(path_qca):\n",
 "    !wget https://virattt.github.io/datasets/abnb-2023-10k.json -O airbnb-2023-10k-qca.json\n",
 "\n",
-"with open(path_qca, 'r') as f:\n",
+"with open(path_qca, \"r\") as f:\n",
 "    question_context_answers = json.load(f)\n",
 "\n",
 "chroma_client = chromadb.PersistentClient()\n",

@@ -88,7 +88,7 @@
 "if collection.count() == 0:\n",
 "    collection.add(\n",
 "        documents=[qca[\"context\"] for qca in question_context_answers],\n",
-"        ids=[str(i) for i in range(len(question_context_answers))]\n",
+"        ids=[str(i) for i in range(len(question_context_answers))],\n",
 "    )"
 ],
 "metadata": {

@@ -123,7 +123,7 @@
 "\n",
 "load_dotenv()\n",
 "\n",
-"os.environ['TOKENIZERS_PARALLELISM'] = \"false\"\n",
+"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
 "\n",
 "p = Parea(api_key=os.getenv(\"PAREA_API_KEY\"), project_name=\"rag_sglang\")\n",
 "p.integrate_with_sglang()\n",

@@ -150,10 +150,7 @@ (the same collapse recurs at @@ -280,10 +284,7)
 "source": [
 "@trace\n",
 "def retrieval(question: str) -> List[str]:\n",
-"    return collection.query(\n",
-"        query_texts=[question],\n",
-"        n_results=1\n",
-"    )['documents'][0]"
+"    return collection.query(query_texts=[question], n_results=1)[\"documents\"][0]"
 ],
 "metadata": {
 "collapsed": false

@@ -176,7 +173,9 @@ (the same split recurs inside @@ -310,10 +311,13, which also adds a blank line before @function)
 "@function\n",
 "def generation_sglang(s, question: str, *context: str):\n",
 "    context = \"\\n\".join(context)\n",
-"    s += user(f'Given this question:\\n{question}\\n\\nAnd this context:\\n{context}\\n\\nAnswer the question.')\n",
+"    s += user(\n",
+"        f\"Given this question:\\n{question}\\n\\nAnd this context:\\n{context}\\n\\nAnswer the question.\"\n",
+"    )\n",
 "    s += assistant(gen(\"answer\"))\n",
 "\n",
 "\n",

@@ -223,7 +222,9 @@ (the same split recurs at @@ -357,7 +361,9)
 "    return generation(question, *contexts)\n",
 "\n",
 "\n",
-"rag_pipeline(\"When did the World Health Organization formally declare an end to the COVID-19 global health emergency?\")"
+"rag_pipeline(\n",
+"    \"When did the World Health Organization formally declare an end to the COVID-19 global health emergency?\"\n",
+")"
 ]
},
{

@@ -271,7 +272,10 @@
 "execution_count": null,
 "outputs": [],
 "source": [
-"from parea.evals.rag import context_query_relevancy_factory, percent_target_supported_by_context_factory\n",
+"from parea.evals.rag import (\n",
+"    context_query_relevancy_factory,\n",
+"    percent_target_supported_by_context_factory,\n",
+")\n",
 "\n",
 "\n",
 "context_relevancy_eval = context_query_relevancy_factory()\n",

@@ -402,6 +408,7 @@
 "source": [
 "!pip install nest-asyncio\n",
 "import nest_asyncio\n",
+"\n",
 "nest_asyncio.apply()"
 ],
 "metadata": {

@@ -461,7 +468,7 @@
 ],
 "source": [
 "e = p.experiment(\n",
-"    'RAG',\n",
+"    \"RAG\",\n",
 "    data=[\n",
 "        {\n",
 "            \"question\": qca[\"question\"],\n",

@@ -469,7 +476,7 @@
 "        }\n",
 "        for qca in question_context_answers\n",
 "    ],\n",
-"    func=rag_pipeline\n",
+"    func=rag_pipeline,\n",
 ").run()"
 ],
 "metadata": {
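
The retrieval function black collapses above is a chroma query. A standalone sketch of that call shape (the collection name and corpus are placeholders, not notebook data):

from typing import List

import chromadb

chroma_client = chromadb.PersistentClient()
collection = chroma_client.get_or_create_collection(name="rag_demo")  # placeholder name
if collection.count() == 0:
    collection.add(
        documents=["Paris is the capital of France."],  # placeholder corpus
        ids=["0"],
    )

def retrieval(question: str) -> List[str]:
    # query() returns a dict; "documents" holds one list of matches per query text
    return collection.query(query_texts=[question], n_results=1)["documents"][0]

print(retrieval("What is the capital of France?"))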

examples/runtime/engine/input_ids.py  +1 -0

@@ -7,6 +7,7 @@ from sglang.srt.hf_transformers_utils import get_tokenizer
 MODEL_PATH = "meta-llama/Llama-3.1-8B-Instruct"

+
 def main():
     # Sample prompts.
     prompts = [

python/sglang/srt/configs/model_config.py  +4 -2

@@ -39,7 +39,7 @@ class ModelConfig:
         revision: Optional[str] = None,
         context_length: Optional[int] = None,
         model_override_args: Optional[dict] = None,
-        is_embedding: Optional[bool] = None
+        is_embedding: Optional[bool] = None,
     ) -> None:
         # Parse args
         self.model_override_args = json.loads(model_override_args)

@@ -52,7 +52,9 @@ class ModelConfig:
         self.hf_text_config = get_hf_text_config(self.hf_config)

         # Check model type
-        self.is_generation = is_generation_model(self.hf_config.architectures, is_embedding)
+        self.is_generation = is_generation_model(
+            self.hf_config.architectures, is_embedding
+        )
         self.is_multimodal = is_multimodal_model(self.hf_config.architectures)
         self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)

python/sglang/srt/layers/quantization/base_config.py  +4 -6

@@ -122,16 +122,14 @@ class QuantizationConfig(ABC):
         """
         raise NotImplementedError


-def method_has_implemented_embedding(
-        method_class: Type[QuantizeMethodBase]) -> bool:
+def method_has_implemented_embedding(method_class: Type[QuantizeMethodBase]) -> bool:
     """
     Not all quant methods have embedding implemented, so we need to check that
     it exists for our given method. We check this by making sure the function
     has been changed from the base implementation.
     """
-    base_embedding = inspect.getattr_static(QuantizeMethodBase, "embedding",
-                                            None)
+    base_embedding = inspect.getattr_static(QuantizeMethodBase, "embedding", None)
     class_embedding = inspect.getattr_static(method_class, "embedding", None)

-    return (class_embedding is not None
-            and class_embedding is not base_embedding)
+    return class_embedding is not None and class_embedding is not base_embedding
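
Beyond the rewrap, the function above uses an idiom worth spelling out: inspect.getattr_static fetches an attribute without triggering descriptors or __getattr__, so comparing a subclass's attribute with the base class's reveals whether the method was actually overridden. A standalone illustration (toy classes, not sglang code):

import inspect

class Base:
    def embedding(self):
        raise NotImplementedError

class WithEmbedding(Base):
    def embedding(self):
        return "implemented"

class WithoutEmbedding(Base):
    pass

def has_overridden_embedding(cls) -> bool:
    # getattr_static avoids running property getters or metaclass hooks.
    base = inspect.getattr_static(Base, "embedding", None)
    impl = inspect.getattr_static(cls, "embedding", None)
    return impl is not None and impl is not base

print(has_overridden_embedding(WithEmbedding))    # True
print(has_overridden_embedding(WithoutEmbedding))  # False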

python/sglang/srt/layers/vocab_parallel_embedding.py  +214 -148

(Diff collapsed in this view.)

python/sglang/srt/managers/io_struct.py  +4 -2

@@ -86,8 +86,10 @@ class GenerateReqInput:
             self.parallel_sample_num = self.sampling_params.get("n", 1)
         else:  # isinstance(self.sampling_params, list):
             self.parallel_sample_num = self.sampling_params[0].get("n", 1)
-            assert all(self.parallel_sample_num == sampling_params.get("n", 1) for sampling_params in self.sampling_params), (
-                "The parallel_sample_num should be the same for all samples in sample params.")
+            assert all(
+                self.parallel_sample_num == sampling_params.get("n", 1)
+                for sampling_params in self.sampling_params
+            ), "The parallel_sample_num should be the same for all samples in sample params."

         if self.parallel_sample_num > 1 and self.is_single:
             self.is_single = False
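
Black drops the now-redundant parentheses around the assert message. That cleanup is worth a caution: parenthesizing the condition and message together turns the assert into a check of a non-empty tuple, which always passes. A toy demonstration (not from the diff):

n_values = [2, 2]

# BUG: the parentheses make this a 2-tuple, which is truthy, so the assert
# can never fire even when the values differ (CPython emits a SyntaxWarning).
assert (
    all(n == n_values[0] for n in n_values),
    "The parallel_sample_num should be the same for all samples in sample params.",
)

# Correct: condition first, message second, no enclosing parentheses.
assert all(
    n == n_values[0] for n in n_values
), "The parallel_sample_num should be the same for all samples in sample params."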

python/sglang/srt/managers/schedule_batch.py  +2 -2

@@ -911,8 +911,7 @@ class ScheduleBatch:
         keep_indices = [
             i
             for i in range(len(self.reqs))
-            if not self.reqs[i].finished()
-            and self.reqs[i] is not being_chunked_req
+            if not self.reqs[i].finished() and self.reqs[i] is not being_chunked_req
         ]

         if keep_indices is None or len(keep_indices) == 0:

@@ -1043,6 +1042,7 @@
         for req in self.reqs:
             req.started_time = time.time()

+
 @dataclasses.dataclass
 class ModelWorkerBatch:
     # The batch id

python/sglang/srt/managers/scheduler.py  +25 -22

@@ -224,8 +224,8 @@ class Scheduler:
         self.forward_ct = 0
         self.forward_ct_decode = 0
         self.num_generated_tokens = 0
-        self.last_stats_tic = time.time() # time of last stats for every iter
-        self.last_log_tic = time.time() # time of last log for print decode log
+        self.last_stats_tic = time.time()  # time of last stats for every iter
+        self.last_log_tic = time.time()  # time of last log for print decode log
         self.stream_interval = server_args.stream_interval

         # Init chunked prefill

@@ -566,9 +566,7 @@
             and not self.last_batch.is_empty()
         ):
             if self.being_chunked_req:
-                self.last_batch.filter_batch(
-                    being_chunked_req=self.being_chunked_req
-                )
+                self.last_batch.filter_batch(being_chunked_req=self.being_chunked_req)
                 self.tree_cache.cache_unfinished_req(self.being_chunked_req)
                 # Inflight request keeps its rid but will get a new req_pool_idx.
                 self.req_to_token_pool.free(self.being_chunked_req.req_pool_idx)

@@ -628,9 +626,7 @@
         has_inflight = self.being_chunked_req is not None
         if has_inflight:
             self.being_chunked_req.init_next_round_input()
-            self.being_chunked_req = adder.add_inflight_req(
-                self.being_chunked_req
-            )
+            self.being_chunked_req = adder.add_inflight_req(self.being_chunked_req)

         if self.lora_paths:
             lora_set = (

The hunks at @@ -813,7 +809,8 and @@ -829,8 +826,8 rewrap `def get_stats(self, batch: ScheduleBatch):` and the `cache_hit_rate` / `token_usage` assignments in whitespace only.

@@ -851,15 +848,19 @@
         # _, next_token_ids, _ = result
         if batch is not None:
             num_generation_tokens_iter = len(batch.output_ids)
-            gen_throughput = round(num_generation_tokens_iter / (now - self.last_stats_tic), 2)
+            gen_throughput = round(
+                num_generation_tokens_iter / (now - self.last_stats_tic), 2
+            )

             for i, req in enumerate(batch.reqs):
                 # NOTE: Batch forward mode is extend befor start decode,
                 if batch.forward_mode.is_extend():
-                    num_prompt_tokens_iter = len(batch.input_ids) + sum(batch.prefix_lens)
+                    num_prompt_tokens_iter = len(batch.input_ids) + sum(
+                        batch.prefix_lens
+                    )
                     time_to_first_tokens_iter.append(now - req.started_time)
                 else:
                     time_per_output_tokens_iter.append(now - self.last_stats_tic)

                 if req.finished():
                     time_e2e_requests.append(now - req.created_time)

@@ -867,9 +868,10 @@
                     num_prompt_tokens_requests.append(len(req.origin_input_ids))
                     num_generation_tokens_requests.append(len(req.output_ids))
-                    finished_reason_requests.append(req.finished_reason.to_json() if req.finished_reason is not None else None)
+                    finished_reason_requests.append(
+                        req.finished_reason.to_json()
+                        if req.finished_reason is not None
+                        else None
+                    )

         return Stats(
             new_seq=new_seq,

The hunk at @@ -893,7 +895,7 is a whitespace-only rewrap of `def log_stats(self, stats: Stats):` above `self.metrics_collector.log_stats(stats)`.

@@ -1003,9 +1005,7 @@
                 if req.is_retracted:
                     continue

-            if self.server_args.enable_overlap_schedule and (
-                req.finished()
-            ):
+            if self.server_args.enable_overlap_schedule and (req.finished()):
                 self.token_to_kv_pool.free(batch.out_cache_loc[i : i + 1])
                 continue

@@ -1031,7 +1031,10 @@
             self.token_to_kv_pool.free_group_end()

         self.forward_ct_decode = (self.forward_ct_decode + 1) % (1 << 30)
-        if self.tp_rank == 0 and self.forward_ct_decode % self.server_args.decode_log_interval == 0:
+        if (
+            self.tp_rank == 0
+            and self.forward_ct_decode % self.server_args.decode_log_interval == 0
+        ):
             self.print_decode_stats()
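
The re-wrapped condition in the last hunk gates periodic logging on a wrap-around counter. A toy sketch of the pattern (the interval value is an assumption; this is not sglang code):

DECODE_LOG_INTERVAL = 40  # assumed interval, for illustration only

forward_ct_decode = 0
for _ in range(120):
    # (1 << 30) keeps the counter bounded while preserving the modulo cadence.
    forward_ct_decode = (forward_ct_decode + 1) % (1 << 30)
    if forward_ct_decode % DECODE_LOG_INTERVAL == 0:
        print(f"step {forward_ct_decode}: decode stats would be printed here")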

python/sglang/srt/managers/tokenizer_manager.py  +11 -7

@@ -215,7 +215,7 @@ class TokenizerManager:
                 logprob_start_len,
                 top_logprobs_num,
                 obj.stream,
-                obj.lora_path
+                obj.lora_path,
             )
         elif isinstance(obj, EmbeddingReqInput):
             tokenized_obj = TokenizedEmbeddingReqInput(

@@ -290,7 +290,9 @@
             # Tokenize all requests
             objs = [obj[i] for i in range(batch_size)]
-            tokenized_objs = await asyncio.gather(*(self._tokenize_one_request(obj) for obj in objs))
+            tokenized_objs = await asyncio.gather(
+                *(self._tokenize_one_request(obj) for obj in objs)
+            )

             # Cache the common prefix for parallel sampling
             for i in range(batch_size):

@@ -322,7 +324,9 @@
             rid_to_index = {rid: i for i, rid in enumerate(rids)}
             task_map = {asyncio.create_task(gen.__anext__()): gen for gen in generators}
             while task_map:
-                done, _ = await asyncio.wait(task_map.keys(), return_when=asyncio.FIRST_COMPLETED)
+                done, _ = await asyncio.wait(
+                    task_map.keys(), return_when=asyncio.FIRST_COMPLETED
+                )

                 for task in done:
                     gen = task_map.pop(task)

@@ -367,7 +371,7 @@
         if self.server_args.dp_size == 1:
             res = await self.mem_pool_size
             return res.size
-        else: # self.server_args.dp_size > 1
+        else:  # self.server_args.dp_size > 1
             self.mem_pool_size_tmp = []
             res = await self.mem_pool_size
             ret = [r.size for r in res]

@@ -399,7 +403,7 @@
             self.server_args.load_format = obj.load_format
             self.model_path = obj.model_path
             return result.success, result.message
-        else: # self.server_args.dp_size > 1
+        else:  # self.server_args.dp_size > 1
             self.model_update_tmp = []
             result = await self.model_update_result

@@ -470,7 +474,7 @@
         if isinstance(recv_obj, UpdateWeightReqOutput):
             if self.server_args.dp_size == 1:
                 self.model_update_result.set_result(recv_obj)
-            else: # self.server_args.dp_size > 1
+            else:  # self.server_args.dp_size > 1
                 self.model_update_tmp.append(recv_obj)
                 # set future if the all results are recevied
                 if len(self.model_update_tmp) == self.server_args.dp_size:

@@ -479,7 +483,7 @@
         elif isinstance(recv_obj, GetMemPoolSizeReqOutput):
             if self.server_args.dp_size == 1:
                 self.mem_pool_size.set_result(recv_obj)
-            else: # self.sever_args.dp_size > 1
+            else:  # self.sever_args.dp_size > 1
                 self.mem_pool_size_tmp.append(recv_obj)
                 # set future if the all results are received
                 if len(self.mem_pool_size_tmp) == self.server_args.dp_size:
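
The task_map loop black re-wraps above is a fan-in over several async generators: race one __anext__() task per generator and consume whichever finishes first. A self-contained sketch of the pattern (toy generators, not sglang code):

import asyncio

async def numbers(name: str, delay: float):
    for i in range(3):
        await asyncio.sleep(delay)
        yield f"{name}:{i}"

async def main():
    generators = [numbers("fast", 0.01), numbers("slow", 0.03)]
    task_map = {asyncio.create_task(gen.__anext__()): gen for gen in generators}
    while task_map:
        done, _ = await asyncio.wait(
            task_map.keys(), return_when=asyncio.FIRST_COMPLETED
        )
        for task in done:
            gen = task_map.pop(task)
            try:
                print(task.result())
            except StopAsyncIteration:
                continue  # this generator is exhausted
            # Re-arm the generator so it stays in the race.
            task_map[asyncio.create_task(gen.__anext__())] = gen

asyncio.run(main())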

python/sglang/srt/metrics/metrics_collector.py  +59 -16

@@ -130,27 +130,65 @@ class Metrics:
         self.counter_prompt_tokens = Counter(
             name="sglang:prompt_tokens_total",
             documentation="Number of prefill tokens processed.",
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )

         self.counter_generation_tokens = Counter(
             name="sglang:generation_tokens_total",
             documentation="Number of generation tokens processed.",
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )

         self.histogram_time_to_first_token = Histogram(
             name="sglang:time_to_first_token_seconds",
             documentation="Histogram of time to first token in seconds.",
             labelnames=labelnames,
             buckets=[
-                0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
-                0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 15.0, 20.0, 25.0, 30.0])
+                0.001,
+                0.005,
+                0.01,
+                0.02,
+                0.04,
+                0.06,
+                0.08,
+                0.1,
+                0.25,
+                0.5,
+                0.75,
+                1.0,
+                2.5,
+                5.0,
+                7.5,
+                10.0,
+                15.0,
+                20.0,
+                25.0,
+                30.0,
+            ],
+        )

         self.histogram_time_per_output_token = Histogram(
             name="sglang:time_per_output_token_seconds",
             documentation="Histogram of time per output token in seconds.",
             labelnames=labelnames,
             buckets=[
-                0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.04, 0.05, 0.075,
-                0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5])
+                0.005,
+                0.01,
+                0.015,
+                0.02,
+                0.025,
+                0.03,
+                0.04,
+                0.05,
+                0.075,
+                0.1,
+                0.15,
+                0.2,
+                0.3,
+                0.4,
+                0.5,
+                0.75,
+                1.0,
+                2.5,
+            ],
+        )

         # Request Stats
         # Metadata

@@ -245,14 +283,19 @@ class PrometheusMetricsCollector(MetricsCollector):
             stats.num_generation_tokens_requests,
         )

-        self._log_counter(self.metrics.counter_prompt_tokens,
-                          stats.num_prompt_tokens_iter)
-        self._log_counter(self.metrics.counter_generation_tokens,
-                          stats.num_generation_tokens_iter)
-        self._log_histogram(self.metrics.histogram_time_to_first_token,
-                            stats.time_to_first_tokens_iter)
-        self._log_histogram(self.metrics.histogram_time_per_output_token,
-                            stats.time_per_output_tokens_iter)
+        self._log_counter(
+            self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter
+        )
+        self._log_counter(
+            self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter
+        )
+        self._log_histogram(
+            self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter
+        )
+        self._log_histogram(
+            self.metrics.histogram_time_per_output_token,
+            stats.time_per_output_tokens_iter,
+        )

         # self._log_gauge(self.metrics.gpu_cache_usage_sys, stats.gpu_cache_usage_sys)
         self._log_gauge(self.metrics.num_running_sys, stats.num_running_req)
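
For readers unfamiliar with the objects being re-indented above, here is a minimal standalone prometheus_client sketch with the same metric names and TTFT buckets (the label name and recorded values are assumptions for illustration):

from prometheus_client import Counter, Histogram

labelnames = ["model_name"]  # assumed label set

counter_prompt_tokens = Counter(
    name="sglang:prompt_tokens_total",
    documentation="Number of prefill tokens processed.",
    labelnames=labelnames,
)
histogram_time_to_first_token = Histogram(
    name="sglang:time_to_first_token_seconds",
    documentation="Histogram of time to first token in seconds.",
    labelnames=labelnames,
    buckets=[
        0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
        0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 15.0, 20.0, 25.0, 30.0,
    ],
)

# Record some sample observations.
counter_prompt_tokens.labels(model_name="demo-model").inc(128)
histogram_time_to_first_token.labels(model_name="demo-model").observe(0.042)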

python/sglang/srt/models/gpt2.py  +30 -36

@@ -28,7 +28,7 @@ from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

-#from sglang.srt.layers.activation import get_act_fn
+# from sglang.srt.layers.activation import get_act_fn
 from sglang.srt.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,

@@ -47,15 +47,14 @@ class GPT2Attention(nn.Module):
         self,
         layer_id: int,
         config: GPT2Config,
-        cache_config = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ):
         super().__init__()
         self.hidden_size = config.hidden_size
         total_num_heads = config.num_attention_heads
-        tensor_model_parallel_world_size = (
-            get_tensor_model_parallel_world_size())
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
         assert total_num_heads % tensor_model_parallel_world_size == 0
         self.num_heads = total_num_heads // tensor_model_parallel_world_size
         self.head_dim = self.hidden_size // total_num_heads

@@ -76,11 +75,13 @@ class GPT2Attention(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.c_proj",
         )
-        self.attn = RadixAttention(self.num_heads,
-                                   self.head_dim,
-                                   scaling=self.scale,
-                                   num_kv_heads=total_num_heads,
-                                   layer_id=layer_id)
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            scaling=self.scale,
+            num_kv_heads=total_num_heads,
+            layer_id=layer_id,
+        )

     def forward(
         self,

@@ -119,10 +120,14 @@ class GPT2MLP(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.c_proj",
         )
-        self.act = get_act_fn(config.activation_function, quant_config,
-                              intermediate_size)
+        self.act = get_act_fn(
+            config.activation_function, quant_config, intermediate_size
+        )

-    def forward(self, hidden_states: torch.Tensor,) -> torch.Tensor:
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
         hidden_states, _ = self.c_fc(hidden_states)
         hidden_states = self.act(hidden_states)
         hidden_states, _ = self.c_proj(hidden_states)

@@ -135,27 +140,20 @@ class GPT2Block(nn.Module):
         self,
         layer_id: int,
         config: GPT2Config,
-        cache_config = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ):
         super().__init__()
         hidden_size = config.hidden_size
-        inner_dim = (config.n_inner if config.n_inner is not None
-                     else 4 * hidden_size)
+        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size

         self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.attn = GPT2Attention(layer_id,
-                                  config,
-                                  cache_config,
-                                  quant_config,
-                                  prefix=f"{prefix}.attn")
+        self.attn = GPT2Attention(
+            layer_id, config, cache_config, quant_config, prefix=f"{prefix}.attn"
+        )
         self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.mlp = GPT2MLP(inner_dim,
-                           config,
-                           quant_config,
-                           prefix=f"{prefix}.mlp")
+        self.mlp = GPT2MLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp")

     def forward(
         self,

@@ -179,13 +177,12 @@ class GPT2Block(nn.Module):
         return hidden_states


 class GPT2Model(nn.Module):
-
     def __init__(
         self,
         config: GPT2Config,
-        cache_config = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ):

@@ -229,16 +226,15 @@ class GPT2LMHeadModel(nn.Module):
     def __init__(
         self,
         config: GPT2Config,
-        cache_config = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
         self.config = config
         self.quant_config = quant_config
-        self.transformer = GPT2Model(config,
-                                     cache_config,
-                                     quant_config,
-                                     prefix="transformer")
+        self.transformer = GPT2Model(
+            config, cache_config, quant_config, prefix="transformer"
+        )
         self.lm_head = self.transformer.wte
         self.logits_processor = LogitsProcessor(config)

@@ -254,8 +250,6 @@ class GPT2LMHeadModel(nn.Module):
             input_ids, hidden_states, self.lm_head.weight, forward_batch
         )

-    def load_weights(self,
-                     weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         for name, loaded_weight in weights:

@@ -280,8 +274,8 @@ class GPT2LMHeadModel(nn.Module):
                 if not name.endswith(".weight"):
                     continue
                 loaded_weight = loaded_weight.t()
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)

+
 EntryClass = GPT2LMHeadModel

python/sglang/srt/server.py  +5 -2

Three hunks add a blank line before a top-level def (black requires two):

@@ -419,6 +419,7 @@ def launch_engine(
     for i in range(len(scheduler_pipe_readers)):
         scheduler_pipe_readers[i].recv()

+
 def add_prometheus_middleware(app: FastAPI):
     # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.1/vllm/entrypoints/openai/api_server.py#L216
     from prometheus_client import CollectorRegistry, make_asgi_app, multiprocess

@@ -490,6 +491,7 @@ def launch_server(
     finally:
         t.join()

+
 def _set_prometheus_env():
     # Set prometheus multiprocess directory
     # sglang uses prometheus multiprocess mode

@@ -506,6 +508,7 @@ def _set_prometheus_env():
     os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name
     logger.debug(f"PROMETHEUS_MULTIPROC_DIR: {os.environ['PROMETHEUS_MULTIPROC_DIR']}")

+
 def _set_envs_and_config(server_args: ServerArgs):
     # Set global environments
     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

The last hunk normalizes quotes:

@@ -763,8 +766,8 @@ class Engine:
         # runtime server default log level is log
         # offline engine works in scripts, so we set it to error
-        if 'log_level' not in kwargs:
-            kwargs['log_level'] = 'error'
+        if "log_level" not in kwargs:
+            kwargs["log_level"] = "error"

         server_args = ServerArgs(*args, **kwargs)
         launch_engine(server_args=server_args)
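
One detail behind `_set_prometheus_env` above: prometheus_client's multiprocess mode discovers its shared directory through the PROMETHEUS_MULTIPROC_DIR environment variable, so the variable must be set before worker processes create metrics. A hedged standalone sketch built from the same imports the file uses:

import os
import tempfile

# Must happen before metrics are created in any process that should share state.
prometheus_multiproc_dir = tempfile.TemporaryDirectory()
os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name

from prometheus_client import CollectorRegistry, make_asgi_app, multiprocess

registry = CollectorRegistry()
multiprocess.MultiProcessCollector(registry)  # aggregates per-process metric files
metrics_app = make_asgi_app(registry=registry)  # mountable on a FastAPI app at /metrics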