{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Embedding Model\n", "\n", "SGLang supports embedding models in the same way as completion models. Here are some example models:\n", "\n", "- [intfloat/e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct)\n", "- [Alibaba-NLP/gte-Qwen2-7B-instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Launch A Server\n", "\n", "The following code is equivalent to running this in the shell:\n", "\n", "```bash\n", "python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n", " --port 30010 --host 0.0.0.0 --is-embedding\n", "```\n", "\n", "Remember to add `--is-embedding` to the command." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:47:32.337369Z", "iopub.status.busy": "2024-11-01T02:47:32.337032Z", "iopub.status.idle": "2024-11-01T02:47:59.540926Z", "shell.execute_reply": "2024-11-01T02:47:59.539861Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", " warnings.warn(\n", "[2024-10-31 22:40:37] server_args=ServerArgs(model_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='Alibaba-NLP/gte-Qwen2-7B-instruct', chat_template=None, is_embedding=True, host='0.0.0.0', port=30010, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=309155486, constrained_json_whitespace_pattern=None, decode_log_interval=40, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n", "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. 
Use `HF_HOME` instead.\n", " warnings.warn(\n", "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", " warnings.warn(\n", "[2024-10-31 22:40:42 TP0] Init torch distributed begin.\n", "[2024-10-31 22:40:43 TP0] Load weight begin. avail mem=47.27 GB\n", "[2024-10-31 22:40:43 TP0] lm_eval is not installed, GPTQ may not be usable\n", "INFO 10-31 22:40:44 weight_utils.py:243] Using model weights format ['*.safetensors']\n", "Loading safetensors checkpoint shards: 0% Completed | 0/7 [00:00
\n" ] }, { "data": { "text/html": [ "NOTE: Typically, the server runs in a separate terminal.<br>In this notebook, we run the server and notebook code together, so their outputs are combined.<br>To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
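{ "cell_type": "markdown", "metadata": {}, "source": [ "As an optional sanity check, the next cell is a minimal sketch that assumes the server also exposes the OpenAI-compatible `/v1/models` route, which lists the served model name. `wait_for_server` above already confirms the server is reachable.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import requests\n", "\n", "# Assumes the server launched above is still running on port 30010.\n", "models = requests.get(\"http://localhost:30010/v1/models\").json()\n", "print_highlight(f\"Served models: {models}\")" ] },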
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from sglang.utils import (\n", " execute_shell_command,\n", " wait_for_server,\n", " terminate_process,\n", " print_highlight,\n", ")\n", "\n", "embedding_process = execute_shell_command(\n", " \"\"\"\n", "python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n", " --port 30010 --host 0.0.0.0 --is-embedding\n", "\"\"\"\n", ")\n", "\n", "wait_for_server(\"http://localhost:30010\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Use Curl" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:47:59.543958Z", "iopub.status.busy": "2024-11-01T02:47:59.543670Z", "iopub.status.idle": "2024-11-01T02:47:59.591699Z", "shell.execute_reply": "2024-11-01T02:47:59.590809Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2024-10-31 22:40:57 TP0] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", "[2024-10-31 22:40:57] INFO: 127.0.0.1:51746 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n" ] }, { "data": { "text/html": [ "Text embedding (first 10): [0.0083160400390625, 0.0006804466247558594, -0.00809478759765625, -0.0006995201110839844, 0.0143890380859375, -0.0090179443359375, 0.01238250732421875, 0.00209808349609375, 0.0062103271484375, -0.003047943115234375]" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import subprocess, json\n", "\n", "text = \"Once upon a time\"\n", "\n", "curl_text = f\"\"\"curl -s http://localhost:30010/v1/embeddings \\\n", " -H \"Content-Type: application/json\" \\\n", " -H \"Authorization: Bearer None\" \\\n", " -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": \"{text}\"}}'\"\"\"\n", "\n", "text_embedding = json.loads(subprocess.check_output(curl_text, shell=True))[\"data\"][0][\n", " \"embedding\"\n", "]\n", "\n", "print_highlight(f\"Text embedding (first 10): {text_embedding[:10]}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using OpenAI Compatible API" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:47:59.594229Z", "iopub.status.busy": "2024-11-01T02:47:59.594049Z", "iopub.status.idle": "2024-11-01T02:48:00.006233Z", "shell.execute_reply": "2024-11-01T02:48:00.005255Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2024-10-31 22:40:58 TP0] Prefill batch. 
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Using the OpenAI-Compatible API" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:47:59.594229Z", "iopub.status.busy": "2024-11-01T02:47:59.594049Z", "iopub.status.idle": "2024-11-01T02:48:00.006233Z", "shell.execute_reply": "2024-11-01T02:48:00.005255Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2024-10-31 22:40:58 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 21.43%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", "[2024-10-31 22:40:58] INFO: 127.0.0.1:51750 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n" ] }, { "data": { "text/html": [ "Text embedding (first 10): [0.00829315185546875, 0.0007004737854003906, -0.00809478759765625, -0.0006799697875976562, 0.01438140869140625, -0.00897979736328125, 0.0123748779296875, 0.0020923614501953125, 0.006195068359375, -0.0030498504638671875]" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import openai\n", "\n", "client = openai.Client(base_url=\"http://127.0.0.1:30010/v1\", api_key=\"None\")\n", "\n", "# Text embedding example\n", "response = client.embeddings.create(\n", "    model=\"Alibaba-NLP/gte-Qwen2-7B-instruct\",\n", "    input=text,\n", ")\n", "\n", "embedding = response.data[0].embedding[:10]\n", "print_highlight(f\"Text embedding (first 10): {embedding}\")" ] },
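{ "cell_type": "markdown", "metadata": {}, "source": [ "Embeddings are usually consumed through a similarity measure. As an illustration, the sketch below reuses the `response` object and `client` from the previous cell and compares the embedding of `text` against that of a second, purely illustrative string via cosine similarity.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import math\n", "\n", "\n", "def cosine_similarity(a, b):\n", "    # cos(a, b) = dot(a, b) / (||a|| * ||b||)\n", "    dot = sum(x * y for x, y in zip(a, b))\n", "    return dot / (math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(x * x for x in b)))\n", "\n", "\n", "# `response` and `client` come from the OpenAI-compatible example above;\n", "# the second string is only an illustrative comparison text.\n", "emb_a = response.data[0].embedding\n", "emb_b = client.embeddings.create(\n", "    model=\"Alibaba-NLP/gte-Qwen2-7B-instruct\", input=\"A long time ago\"\n", ").data[0].embedding\n", "\n", "print_highlight(f\"Cosine similarity: {cosine_similarity(emb_a, emb_b):.4f}\")" ] },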
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Using Input IDs\n", "\n", "SGLang also accepts pre-tokenized input: pass a list of token IDs (`input_ids`) instead of a string to get the embedding." ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:48:00.008858Z", "iopub.status.busy": "2024-11-01T02:48:00.008689Z", "iopub.status.idle": "2024-11-01T02:48:01.872542Z", "shell.execute_reply": "2024-11-01T02:48:01.871573Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", "  warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[2024-10-31 22:41:00 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 33.33%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", "[2024-10-31 22:41:00] INFO: 127.0.0.1:51762 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n" ] }, { "data": { "text/html": [ "Input IDs embedding (first 10): [0.00829315185546875, 0.0007004737854003906, -0.00809478759765625, -0.0006799697875976562, 0.01438140869140625, -0.00897979736328125, 0.0123748779296875, 0.0020923614501953125, 0.006195068359375, -0.0030498504638671875]" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import json\n", "import os\n", "import subprocess\n", "\n", "from transformers import AutoTokenizer\n", "\n", "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", "\n", "# Tokenize the text locally and send the token IDs as the embedding input.\n", "tokenizer = AutoTokenizer.from_pretrained(\"Alibaba-NLP/gte-Qwen2-7B-instruct\")\n", "input_ids = tokenizer.encode(text)\n", "\n", "curl_ids = f\"\"\"curl -s http://localhost:30010/v1/embeddings \\\n", "  -H \"Content-Type: application/json\" \\\n", "  -H \"Authorization: Bearer None\" \\\n", "  -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": {json.dumps(input_ids)}}}'\"\"\"\n", "\n", "input_ids_embedding = json.loads(subprocess.check_output(curl_ids, shell=True))[\"data\"][\n", "    0\n", "][\"embedding\"]\n", "\n", "print_highlight(f\"Input IDs embedding (first 10): {input_ids_embedding[:10]}\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:48:01.875204Z", "iopub.status.busy": "2024-11-01T02:48:01.874915Z", "iopub.status.idle": "2024-11-01T02:48:02.193734Z", "shell.execute_reply": "2024-11-01T02:48:02.192158Z" } }, "outputs": [], "source": [ "terminate_process(embedding_process)" ] } ], "metadata": { "kernelspec": { "display_name": "AlphaMeemory", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.7" } }, "nbformat": 4, "nbformat_minor": 2 }