#!/bin/bash
# FIXME: Convert this script to README steps

export VLLM_ATTENTION_BACKEND=FLASHINFER
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export VLLM_TORCH_HOST=localhost
export VLLM_TORCH_PORT=36183
export VLLM_BASELINE_WORKERS=0
export VLLM_CONTEXT_WORKERS=1
export VLLM_GENERATE_WORKERS=1
export VLLM_BASELINE_TP_SIZE=1
export VLLM_CONTEXT_TP_SIZE=1
export VLLM_GENERATE_TP_SIZE=1
export VLLM_LOGGING_LEVEL=INFO
export VLLM_DATA_PLANE_BACKEND=nccl
export PYTHONUNBUFFERED=1

export NATS_HOST=localhost
export NATS_PORT=4223
export NATS_STORE="$(mktemp -d)"

export API_SERVER_HOST=localhost
export API_SERVER_PORT=8005

# Start NATS Server
echo "Flushing NATS store: ${NATS_STORE}..."
rm -rf "${NATS_STORE}"

echo "Starting NATS Server..."
nats-server -p "${NATS_PORT}" --jetstream --store_dir "${NATS_STORE}" &

# Start API Server
echo "Starting LLM API Server..."
python3 -m llm.api_server \
    --tokenizer neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
    --request-plane-uri "${NATS_HOST}:${NATS_PORT}" \
    --api-server-host "${API_SERVER_HOST}" \
    --model-name llama \
    --api-server-port "${API_SERVER_PORT}" &

# Start vLLM Worker 0 (context worker, GPU 0)
echo "Starting vLLM context workers..."
CUDA_VISIBLE_DEVICES=0 \
VLLM_WORKER_ID=0 \
python3 -m llm.vllm.deploy \
    --context-worker-count "${VLLM_CONTEXT_WORKERS}" \
    --request-plane-uri "${NATS_HOST}:${NATS_PORT}" \
    --model-name neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
    --kv-cache-dtype fp8 \
    --dtype auto \
    --worker-name llama \
    --disable-async-output-proc \
    --disable-log-stats \
    --max-model-len 1000 \
    --max-batch-size 10000 \
    --gpu-memory-utilization 0.9 \
    --context-tp-size "${VLLM_CONTEXT_TP_SIZE}" \
    --generate-tp-size "${VLLM_GENERATE_TP_SIZE}" \
    --log-dir "/tmp/vllm_logs" &

# Start vLLM Worker 1 (generate worker, GPU 1)
echo "Starting vLLM generate workers..."
CUDA_VISIBLE_DEVICES=1 \
VLLM_WORKER_ID=1 \
python3 -m llm.vllm.deploy \
    --generate-worker-count "${VLLM_GENERATE_WORKERS}" \
    --request-plane-uri "${NATS_HOST}:${NATS_PORT}" \
    --model-name neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
    --kv-cache-dtype fp8 \
    --dtype auto \
    --worker-name llama \
    --disable-async-output-proc \
    --disable-log-stats \
    --max-model-len 1000 \
    --max-batch-size 10000 \
    --gpu-memory-utilization 0.9 \
    --context-tp-size "${VLLM_CONTEXT_TP_SIZE}" \
    --generate-tp-size "${VLLM_GENERATE_TP_SIZE}" \
    --log-dir "/tmp/vllm_logs" &

# NOTE: It may take more than a minute for the vLLM workers to start up
# if the model weights aren't cached and need to be downloaded.
echo "Waiting for deployment to finish startup..."
echo "Once you see all ranks connected to the server, it should be ready..."
echo "Example output:"
echo -e "\tRank 0 connected to the server"
echo -e "\t..."
echo -e "\tRank 1 connected to the server"
sleep 120

# Make a Chat Completion Request
echo "Sending chat completions request..."
curl "${API_SERVER_HOST}:${API_SERVER_PORT}/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d '{
        "model": "llama",
        "messages": [
            {"role": "user", "content": "What is the capital of France?"}
        ],
        "temperature": 0,
        "top_p": 0.95,
        "max_tokens": 25,
        "stream": true,
        "n": 1,
        "frequency_penalty": 0.0,
        "stop": []
    }'
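
# A non-streaming variant of the same request, which prints the full JSON
# response in one piece instead of SSE chunks. This is a sketch: it assumes
# the API server follows the OpenAI chat-completions schema, as the
# streaming call above does.
echo ""
echo "Sending non-streaming chat completions request..."
curl -s "${API_SERVER_HOST}:${API_SERVER_PORT}/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d '{
        "model": "llama",
        "messages": [
            {"role": "user", "content": "What is the capital of France?"}
        ],
        "temperature": 0,
        "max_tokens": 25,
        "stream": false
    }'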
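
# NOTE: The fixed `sleep 120` above is a guess at startup time. A more
# robust alternative (a sketch: it only confirms the HTTP server accepts
# connections; the workers may still be loading weights) is to poll with
# curl until the port answers, e.g.:
#
#   until curl -s -o /dev/null "${API_SERVER_HOST}:${API_SERVER_PORT}/v1/chat/completions"; do
#       echo "Waiting for API server..."
#       sleep 5
#   done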
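
# Optional cleanup, commented out so the deployment keeps running after the
# request. A sketch, assuming every service above was started as a background
# job of this shell, so `jobs -p` lists their PIDs:
#
#   kill $(jobs -p) 2>/dev/null
#   rm -rf "${NATS_STORE}"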