Unverified Commit 784da90e authored by ZichengMa's avatar ZichengMa Committed by GitHub
Browse files

feat: LMCache integration in newest ux (#2079)


Signed-off-by: ZichengMa <zichengma1225@gmail.com>
Co-authored-by: Ziqi Fan <ziqif@nvidia.com>
parent 63fbf498
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# LMCache Dynamo One-Click Test Script
# Optional positional arguments:
#   $1 - model handed to the deploy scripts and MMLU drivers (default: Qwen/Qwen3-0.6B)
#   $2 - number of MMLU subjects to evaluate (default: 15)
MODEL_URL="${1:-Qwen/Qwen3-0.6B}"
NUM_SUBJECTS="${2:-15}"

# Print the run banner followed by one blank line.
cat <<EOF
🧪 LMCache Dynamo Complete Test
===============================
Model: $MODEL_URL
Number of subjects: $NUM_SUBJECTS

EOF
# cleanup: best-effort teardown of everything the test may have started.
# Every command is allowed to fail, so calling it more than once is harmless.
cleanup() {
    echo "🧹 Cleaning up running processes..."

    # Terminate leftover dynamo processes, matched against their full command line.
    local pattern
    for pattern in "dynamo-run" "components/main.py"; do
        pkill -f "$pattern" || true
    done

    # Bring down the services defined in the metrics docker compose file.
    docker compose -f ../../deploy/metrics/docker-compose.yml down 2>/dev/null || true

    # Short pause so the teardown can settle.
    sleep 2
}

# Run cleanup whenever the script exits.
trap cleanup EXIT
# Make sure the MMLU dataset is present locally, downloading it on first use.
if [ -d "data/test" ] && [ -d "data/dev" ]; then
    echo "✅ MMLU dataset already exists"
else
    echo "📚 MMLU dataset not found, starting download..."

    # Install the downloader's dependencies only when they cannot be imported.
    if ! python3 -c "import datasets, pandas" 2>/dev/null; then
        echo "📦 Installing Python dependencies..."
        # "python3 -m pip" installs into the same interpreter that runs the
        # scripts below; a bare "pip" may belong to a different Python.
        python3 -m pip install datasets pandas
    fi

    # Abort the whole run if the dataset cannot be fetched.
    if ! python3 download_mmlu.py; then
        echo "❌ Data download failed, exiting"
        exit 1
    fi
fi
echo ""
echo "🔬 Step 1: Baseline Test (LMCache disabled)"
echo "==========================================="

# Start the baseline deployment in the background; `timeout` bounds its
# lifetime to 600 seconds.
echo "🚀 Starting baseline dynamo..."
timeout 600 ./deploy-1-dynamo.sh "$MODEL_URL" &
DEPLOY_PID=$!

# Initial grace period before the first readiness probe.
echo "⏳ Waiting for server to be ready..."
sleep 30

# Probe the models endpoint every 10 seconds until it answers successfully.
# --fail treats HTTP error statuses as "not ready", and --max-time keeps a
# hung connection from stalling the loop beyond its attempt budget.
max_attempts=30
attempt=0
until curl -sf --max-time 10 http://localhost:8080/v1/models > /dev/null 2>&1; do
    attempt=$((attempt + 1))
    if [ "$attempt" -gt "$max_attempts" ]; then
        echo "❌ Server failed to start within timeout"
        kill "$DEPLOY_PID" 2>/dev/null || true
        exit 1
    fi
    echo "⏳ Waiting for server... (attempt $attempt/$max_attempts)"
    sleep 10
done

# Run the baseline benchmark; stop the deployment and abort on failure.
echo "📊 Running baseline MMLU test..."
if ! python3 1-mmlu-dynamo.py --model "$MODEL_URL" --number-of-subjects "$NUM_SUBJECTS"; then
    echo "❌ Baseline test failed"
    kill "$DEPLOY_PID" 2>/dev/null || true
    exit 1
fi

echo "🛑 Stopping baseline services..."
kill "$DEPLOY_PID" 2>/dev/null || true
cleanup
# Extra pause before the next deployment is started.
sleep 5
echo ""
echo "🔬 Step 2: LMCache Test (LMCache enabled)"
echo "========================================="

# Start the LMCache-enabled deployment in the background; `timeout` bounds
# its lifetime to 600 seconds.
echo "🚀 Starting LMCache dynamo..."
timeout 600 ./deploy-2-dynamo.sh "$MODEL_URL" &
DEPLOY_PID=$!

# Initial grace period before the first readiness probe.
echo "⏳ Waiting for server to be ready..."
sleep 30

# Probe the models endpoint every 10 seconds until it answers successfully.
# --fail treats HTTP error statuses as "not ready", and --max-time keeps a
# hung connection from stalling the loop beyond its attempt budget.
# Keep any attempt budget that is already set; otherwise fall back to 30.
max_attempts=${max_attempts:-30}
attempt=0
until curl -sf --max-time 10 http://localhost:8080/v1/models > /dev/null 2>&1; do
    attempt=$((attempt + 1))
    if [ "$attempt" -gt "$max_attempts" ]; then
        echo "❌ Server failed to start within timeout"
        kill "$DEPLOY_PID" 2>/dev/null || true
        exit 1
    fi
    echo "⏳ Waiting for server... (attempt $attempt/$max_attempts)"
    sleep 10
done

# Run the LMCache benchmark; stop the deployment and abort on failure.
echo "📊 Running LMCache MMLU test..."
if ! python3 2-mmlu-dynamo.py --model "$MODEL_URL" --number-of-subjects "$NUM_SUBJECTS"; then
    echo "❌ LMCache test failed"
    kill "$DEPLOY_PID" 2>/dev/null || true
    exit 1
fi

echo "🛑 Stopping LMCache services..."
kill "$DEPLOY_PID" 2>/dev/null || true
cleanup
echo ""
echo "📈 Step 3: Result Analysis"
echo "========================="

# Compare the two result sets. A failure of the analysis script is reported
# instead of being silently ignored, but the final summary is still printed.
if ! python3 summarize_scores_dynamo.py; then
    echo "⚠️ Result analysis script failed"
fi

echo ""
echo "🎉 Test Complete!"
echo "================"

# first_match: print the first argument if it names an existing path.
# Called with an unquoted glob, so an unmatched pattern (left as literal text
# by the shell) yields no output.
first_match() {
    local candidate
    for candidate in "$@"; do
        if [ -e "$candidate" ]; then
            printf '%s\n' "$candidate"
        fi
        return 0
    done
}

# Locate one result file of each kind via glob expansion rather than by
# parsing `ls` output.
baseline_file=$(first_match dynamo-baseline-*.jsonl)
lmcache_file=$(first_match dynamo-lmcache-*.jsonl)

if [ -n "$baseline_file" ] && [ -n "$lmcache_file" ]; then
    echo "✅ Generated result files:"
    echo " - Baseline test: $baseline_file"
    echo " - LMCache test: $lmcache_file"
    echo ""
    echo "💡 If accuracy difference < 1%, LMCache functionality is correct"
else
    echo "⚠️ Complete result files not found, please check if there were errors during testing"
fi

echo ""
echo "🔧 To re-run:"
echo " ./run_test.sh \"$MODEL_URL\" $NUM_SUBJECTS"
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Summarize and compare MMLU scores between dynamo baseline and LMCache tests.
# Reference: https://github.com/LMCache/LMCache/blob/dev/.buildkite/correctness/summarize_scores.py
import glob
import json
from typing import Dict, List, Tuple
def load_jsonl_results(filename: str) -> Dict:
    """Load and merge the JSON objects stored one per line in a JSONL file.

    Every non-blank line is parsed as JSON and merged into a single
    dictionary; keys on later lines override keys from earlier lines.
    Blank lines (for example a trailing empty line) are skipped instead of
    invalidating the whole file.

    Args:
        filename: Path of the JSONL file to read.

    Returns:
        The merged dictionary. An empty dictionary is returned, after
        printing a message, when the file does not exist, cannot be read,
        or contains a line that cannot be parsed and merged.
    """
    results: Dict = {}
    try:
        with open(filename, "r", encoding="utf-8") as f:
            for line in f:
                record = line.strip()
                if not record:
                    continue
                results.update(json.loads(record))
    except FileNotFoundError:
        print(f"⚠️ File not found: {filename}")
        return {}
    except (OSError, ValueError, TypeError) as e:
        # ValueError covers json.JSONDecodeError and decoding errors;
        # TypeError/ValueError also cover lines that are valid JSON but not
        # an object that dict.update() can merge.
        print(f"❌ Error loading {filename}: {e}")
        return {}
    return results
def find_result_files() -> Tuple[List[str], List[str]]:
    """Collect result files from the current working directory.

    Returns:
        A ``(baseline_files, lmcache_files)`` pair of sorted filename lists
        matching ``dynamo-baseline-*.jsonl`` and ``dynamo-lmcache-*.jsonl``
        respectively.
    """
    baseline_matches, lmcache_matches = (
        sorted(glob.glob(pattern))
        for pattern in ("dynamo-baseline-*.jsonl", "dynamo-lmcache-*.jsonl")
    )
    return baseline_matches, lmcache_matches
def extract_model_name(filename: str) -> str:
    """Return the model part of a result filename.

    For names starting with ``dynamo-baseline-`` or ``dynamo-lmcache-`` the
    prefix and the last six characters (the ``.jsonl`` extension) are
    removed. Any other name is returned unchanged.
    """
    extension_length = len(".jsonl")
    for prefix in ("dynamo-baseline-", "dynamo-lmcache-"):
        if filename.startswith(prefix):
            return filename[len(prefix):-extension_length]
    return filename
def compare_results(baseline_results: Dict, lmcache_results: Dict, model_name: str):
    """Print an accuracy comparison between baseline and LMCache results.

    The report contains the overall accuracy (only when both result sets
    have a non-empty ``"total"`` entry), a per-subject comparison for the
    subjects present in both sets, and a list of subjects whose accuracy
    differs by 0.05 or more. Nothing is returned; all output goes to stdout.

    Args:
        baseline_results: Mapping of subject name (plus ``"total"``) to a
            dict with an ``"accuracy"`` value, from the baseline run.
        lmcache_results: Same structure, from the LMCache run.
        model_name: Name shown in the report header.
    """
    print(f"\n🔍 Model Comparison: {model_name}")
    print("=" * 80)

    # Without both result sets there is nothing to compare.
    if not baseline_results:
        print("❌ Missing baseline test results")
        return
    if not lmcache_results:
        print("❌ Missing LMCache test results")
        return

    # Overall accuracy, reported only when both sides provide a "total" entry.
    baseline_total = baseline_results.get("total", {})
    lmcache_total = lmcache_results.get("total", {})
    if baseline_total and lmcache_total:
        baseline_acc = baseline_total.get("accuracy", 0)
        lmcache_acc = lmcache_total.get("accuracy", 0)
        diff = abs(baseline_acc - lmcache_acc)
        print("📊 Overall Accuracy:")
        print(f" Baseline (no LMCache): {baseline_acc:.4f}")
        print(f" LMCache: {lmcache_acc:.4f}")
        print(f" Difference: {diff:.4f}")
        if diff < 0.01:  # 1% threshold
            print(" ✅ Results consistent (difference < 1%)")
        else:
            print(" ⚠️ Large difference (difference >= 1%)")

    # Per-subject comparison; "total" is not a subject.
    print("\n📚 Subject-wise Comparison:")
    subjects_baseline = set(baseline_results.keys()) - {"total"}
    subjects_lmcache = set(lmcache_results.keys()) - {"total"}
    common_subjects = subjects_baseline & subjects_lmcache
    missing_in_baseline = subjects_lmcache - subjects_baseline
    missing_in_lmcache = subjects_baseline - subjects_lmcache
    if missing_in_baseline:
        print(f"⚠️ Subjects missing in baseline test: {missing_in_baseline}")
    if missing_in_lmcache:
        print(f"⚠️ Subjects missing in LMCache test: {missing_in_lmcache}")

    # Detailed comparison for subjects present in both runs.
    large_diff_subjects = []
    for subject in sorted(common_subjects):
        baseline_acc = baseline_results[subject].get("accuracy", 0)
        lmcache_acc = lmcache_results[subject].get("accuracy", 0)
        diff = abs(baseline_acc - lmcache_acc)
        is_large = diff >= 0.05  # 5% threshold for individual subjects
        status = "⚠️" if is_large else "✅"
        if is_large:
            large_diff_subjects.append((subject, baseline_acc, lmcache_acc, diff))
        print(
            f" {status} {subject:25s}: baseline={baseline_acc:.3f}, LMCache={lmcache_acc:.3f}, diff={diff:.3f}"
        )

    # Repeat the flagged subjects; the heading states the same ">=" bound
    # that the check above applies.
    if large_diff_subjects:
        print("\n⚠️ Subjects with large differences (>= 5%):")
        for subject, baseline_acc, lmcache_acc, diff in large_diff_subjects:
            print(
                f" {subject}: baseline={baseline_acc:.3f}, LMCache={lmcache_acc:.3f}, diff={diff:.3f}"
            )
def main():
    """Compare every baseline/LMCache result pair found in the working directory.

    Result files are paired by the model name embedded in their filenames.
    Each pair is loaded and reported through ``compare_results``. A model
    counts as inconsistent when one of its two files is missing, when either
    file yields no overall accuracy (empty, unreadable, or lacking
    ``total.accuracy``), or when the overall accuracies differ by 0.01 or
    more. A final summary and recommendations are printed at the end.
    """
    print("🧮 Dynamo LMCache MMLU Result Comparison Tool")
    print("=" * 80)

    # Find all result files
    baseline_files, lmcache_files = find_result_files()
    if not baseline_files and not lmcache_files:
        print("❌ No result files found")
        print("Please ensure you have run the test scripts and generated result files:")
        print(" - dynamo-baseline-*.jsonl")
        print(" - dynamo-lmcache-*.jsonl")
        return

    print("📁 Files found:")
    print(f" Baseline test results: {len(baseline_files)} files")
    for f in baseline_files:
        print(f" - {f}")
    print(f" LMCache test results: {len(lmcache_files)} files")
    for f in lmcache_files:
        print(f" - {f}")

    # Group files by model
    baseline_by_model = {extract_model_name(f): f for f in baseline_files}
    lmcache_by_model = {extract_model_name(f): f for f in lmcache_files}
    all_models = set(baseline_by_model.keys()) | set(lmcache_by_model.keys())
    if not all_models:
        print("❌ No valid model results found")
        return

    # Compare results for each model
    overall_consistent = True
    for model in sorted(all_models):
        baseline_file = baseline_by_model.get(model)
        lmcache_file = lmcache_by_model.get(model)
        if not baseline_file:
            print(f"\n⚠️ Model {model} missing baseline test results")
            overall_consistent = False
            continue
        if not lmcache_file:
            print(f"\n⚠️ Model {model} missing LMCache test results")
            overall_consistent = False
            continue

        # Load and compare results
        baseline_results = load_jsonl_results(baseline_file)
        lmcache_results = load_jsonl_results(lmcache_file)
        compare_results(baseline_results, lmcache_results, model)

        # A missing overall accuracy must not be treated as 0 on both sides:
        # that would give a difference of 0 and report a false "consistent".
        baseline_total = baseline_results.get("total", {}).get("accuracy")
        lmcache_total = lmcache_results.get("total", {}).get("accuracy")
        if baseline_total is None or lmcache_total is None:
            print(f"\n⚠️ Model {model} has no overall accuracy to compare")
            overall_consistent = False
            continue

        # Check if this model passed consistency check
        diff = abs(baseline_total - lmcache_total)
        if diff >= 0.01:  # 1% threshold
            overall_consistent = False

    # Final summary
    print("\n" + "=" * 80)
    print("📋 Final Summary:")
    if overall_consistent:
        print("✅ LMCache functionality is correct for all tested models")
        print(" Baseline and LMCache test results are consistent (difference < 1%)")
    else:
        print("❌ LMCache functionality issues detected")
        print(
            " Some models show inconsistent results between baseline and LMCache tests"
        )
        print(" Please check the specific differences above")

    print("\n💡 Recommendations:")
    print(" - If difference is small (<1%), it's usually acceptable")
    print(
        " - If difference is large (>5%), check LMCache configuration or implementation"
    )
    print(" - Re-run tests to verify result reproducibility")


if __name__ == "__main__":
    main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment