/*
 * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gpu

import (
	"context"
	"errors"
	"fmt"
	"net/http"
	"net/http/httptest"
	"strings"
	"testing"

	nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
	dto "github.com/prometheus/client_model/go"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

// newFakeClient returns a fake controller-runtime client, exposed as a
// client.Reader, that is pre-populated with objs.
//
// Only the core/v1 types are registered in the scheme. The helper has no
// *testing.T, so a registration failure panics instead of handing back a client
// whose scheme is silently incomplete.
func newFakeClient(objs ...client.Object) client.Reader {
	scheme := runtime.NewScheme()
	if err := corev1.AddToScheme(scheme); err != nil {
		panic(fmt.Sprintf("registering core/v1 types in test scheme: %v", err))
	}
	return fake.NewClientBuilder().
		WithScheme(scheme).
		WithObjects(objs...).
		Build()
}

func TestDiscoverGPUs_SingleNode(t *testing.T) {
	ctx := context.Background()

	node := &corev1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Name: "gpu-node-1",
			Labels: map[string]string{
				LabelGPUCount:   "8",
				LabelGPUProduct: "H100-SXM5-80GB",
				LabelGPUMemory:  "81920",
			},
		},
	}

	k8sClient := newFakeClient(node)

	gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
	require.NoError(t, err)
	require.NotNil(t, gpuInfo)

	assert.Equal(t, 8, gpuInfo.GPUsPerNode)
	assert.Equal(t, "H100-SXM5-80GB", gpuInfo.Model)
	assert.Equal(t, 81920, gpuInfo.VRAMPerGPU)
	assert.Equal(t, "h100_sxm", string(gpuInfo.System))
}

// TestDiscoverGPUs_MultipleNodesHomogeneous registers two nodes that carry
// identical GPU labels and expects DiscoverGPUs to report that shared
// configuration.
func TestDiscoverGPUs_MultipleNodesHomogeneous(t *testing.T) {
	// h100Node builds a node with the common GPU labels; every call allocates
	// its own label map so the two nodes do not alias each other.
	h100Node := func(name string) *corev1.Node {
		return &corev1.Node{
			ObjectMeta: metav1.ObjectMeta{
				Name: name,
				Labels: map[string]string{
					LabelGPUCount:   "8",
					LabelGPUProduct: "H100-SXM5-80GB",
					LabelGPUMemory:  "81920",
				},
			},
		}
	}

	reader := newFakeClient(h100Node("gpu-node-1"), h100Node("gpu-node-2"))

	discovered, err := DiscoverGPUs(context.Background(), reader)
	require.NoError(t, err)
	require.NotNil(t, discovered)

	assert.Equal(t, 8, discovered.GPUsPerNode)
	assert.Equal(t, "H100-SXM5-80GB", discovered.Model)
	assert.Equal(t, 81920, discovered.VRAMPerGPU)
}

// TestDiscoverGPUs_MultipleNodesHeterogeneous_HigherGPUCountWins registers a
// 4-GPU A100 node and an 8-GPU H100 node and expects DiscoverGPUs to report
// the values of the 8-GPU node.
func TestDiscoverGPUs_MultipleNodesHeterogeneous_HigherGPUCountWins(t *testing.T) {
	labelledNode := func(name, count, product, memory string) *corev1.Node {
		return &corev1.Node{
			ObjectMeta: metav1.ObjectMeta{
				Name: name,
				Labels: map[string]string{
					LabelGPUCount:   count,
					LabelGPUProduct: product,
					LabelGPUMemory:  memory,
				},
			},
		}
	}

	fewerGPUs := labelledNode("gpu-node-1", "4", "A100-SXM4-40GB", "40960")
	moreGPUs := labelledNode("gpu-node-2", "8", "H100-SXM5-80GB", "81920") // expected winner

	best, err := DiscoverGPUs(context.Background(), newFakeClient(fewerGPUs, moreGPUs))
	require.NoError(t, err)
	require.NotNil(t, best)

	// The 8-GPU node must be preferred over the 4-GPU node.
	assert.Equal(t, 8, best.GPUsPerNode)
	assert.Equal(t, "H100-SXM5-80GB", best.Model)
	assert.Equal(t, 81920, best.VRAMPerGPU)
}

// TestDiscoverGPUs_MultipleNodesHeterogeneous_HigherVRAMWins registers two
// 8-GPU nodes that differ in product and memory labels and expects
// DiscoverGPUs to report the node with the larger memory value.
func TestDiscoverGPUs_MultipleNodesHeterogeneous_HigherVRAMWins(t *testing.T) {
	eightGPUNode := func(name, product, memory string) *corev1.Node {
		return &corev1.Node{
			ObjectMeta: metav1.ObjectMeta{
				Name: name,
				Labels: map[string]string{
					LabelGPUCount:   "8",
					LabelGPUProduct: product,
					LabelGPUMemory:  memory,
				},
			},
		}
	}

	lessVRAM := eightGPUNode("gpu-node-1", "A100-SXM4-40GB", "40960")
	moreVRAM := eightGPUNode("gpu-node-2", "H100-SXM5-80GB", "81920") // expected winner

	best, err := DiscoverGPUs(context.Background(), newFakeClient(lessVRAM, moreVRAM))
	require.NoError(t, err)
	require.NotNil(t, best)

	// With equal GPU counts the node with more VRAM must be preferred.
	assert.Equal(t, 8, best.GPUsPerNode)
	assert.Equal(t, "H100-SXM5-80GB", best.Model)
	assert.Equal(t, 81920, best.VRAMPerGPU)
}

// TestDiscoverGPUs_MixedNodesWithAndWithoutGPUs mixes a node with an empty
// label set and a node with GPU labels, and expects DiscoverGPUs to succeed
// with the GPU node's values.
func TestDiscoverGPUs_MixedNodesWithAndWithoutGPUs(t *testing.T) {
	// Node without any GPU labels.
	unlabelled := &corev1.Node{
		ObjectMeta: metav1.ObjectMeta{Name: "cpu-node-1", Labels: map[string]string{}},
	}
	// Node carrying the GPU labels.
	labelled := &corev1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Name: "gpu-node-1",
			Labels: map[string]string{
				LabelGPUCount:   "8",
				LabelGPUProduct: "H100-SXM5-80GB",
				LabelGPUMemory:  "81920",
			},
		},
	}

	discovered, err := DiscoverGPUs(context.Background(), newFakeClient(unlabelled, labelled))
	require.NoError(t, err)
	require.NotNil(t, discovered)

	// The result must come from the GPU node; the unlabelled node is ignored.
	assert.Equal(t, 8, discovered.GPUsPerNode)
	assert.Equal(t, "H100-SXM5-80GB", discovered.Model)
}

// TestDiscoverGPUs_NoNodes expects DiscoverGPUs to fail with a "no nodes found"
// error and a nil result when the cluster contains no Node objects.
func TestDiscoverGPUs_NoNodes(t *testing.T) {
	ctx := context.Background()
	k8sClient := newFakeClient() // Empty cluster

	gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
	// require, not assert: err.Error() below would panic on a nil error and
	// hide the real failure behind a nil-pointer dereference.
	require.Error(t, err)
	assert.Nil(t, gpuInfo)
	assert.Contains(t, err.Error(), "no nodes found")
}

// TestDiscoverGPUs_NoGPUNodes expects DiscoverGPUs to fail with a nil result
// when nodes exist but none of them carries the GPU labels.
func TestDiscoverGPUs_NoGPUNodes(t *testing.T) {
	ctx := context.Background()

	// Only CPU nodes
	cpuNode1 := &corev1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Name:   "cpu-node-1",
			Labels: map[string]string{},
		},
	}
	cpuNode2 := &corev1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Name: "cpu-node-2",
			Labels: map[string]string{
				"node-type": "cpu-only",
			},
		},
	}

	k8sClient := newFakeClient(cpuNode1, cpuNode2)

	gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
	// require, not assert: err.Error() below would panic on a nil error and
	// hide the real failure behind a nil-pointer dereference.
	require.Error(t, err)
	assert.Nil(t, gpuInfo)
	assert.Contains(t, err.Error(), "no nodes with NVIDIA GPU Feature Discovery labels found")
}

// TestExtractGPUInfoFromNode_MissingLabels feeds extractGPUInfoFromNode nodes
// whose GPU labels are incomplete or non-numeric and checks that each case
// fails with an error mentioning the offending label or problem.
func TestExtractGPUInfoFromNode_MissingLabels(t *testing.T) {
	tests := []struct {
		name        string
		labels      map[string]string
		expectError bool
		errorMsg    string // substring expected in the error text; "" skips the check
	}{
		{
			name:        "missing GPU count",
			labels:      map[string]string{LabelGPUProduct: "H100", LabelGPUMemory: "80000"},
			expectError: true,
			errorMsg:    LabelGPUCount,
		},
		{
			name:        "missing GPU product",
			labels:      map[string]string{LabelGPUCount: "8", LabelGPUMemory: "80000"},
			expectError: true,
			errorMsg:    LabelGPUProduct,
		},
		{
			name:        "missing GPU memory",
			labels:      map[string]string{LabelGPUCount: "8", LabelGPUProduct: "H100"},
			expectError: true,
			errorMsg:    LabelGPUMemory,
		},
		{
			name:        "invalid GPU count",
			labels:      map[string]string{LabelGPUCount: "invalid", LabelGPUProduct: "H100", LabelGPUMemory: "80000"},
			expectError: true,
			errorMsg:    "invalid GPU count",
		},
		{
			name:        "invalid GPU memory",
			labels:      map[string]string{LabelGPUCount: "8", LabelGPUProduct: "H100", LabelGPUMemory: "invalid"},
			expectError: true,
			errorMsg:    "invalid GPU memory",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			node := &corev1.Node{
				ObjectMeta: metav1.ObjectMeta{
					Name:   "test-node",
					Labels: tt.labels,
				},
			}

			gpuInfo, err := extractGPUInfoFromNode(node)
			if !tt.expectError {
				assert.NoError(t, err)
				assert.NotNil(t, gpuInfo)
				return
			}

			// require, not assert: err.Error() below would panic on a nil
			// error instead of reporting a clean test failure.
			require.Error(t, err)
			assert.Nil(t, gpuInfo)
			if tt.errorMsg != "" {
				assert.Contains(t, err.Error(), tt.errorMsg)
			}
		})
	}
}

// TestInferHardwareSystem is a table-driven check of the mapping from a GPU
// product string to the GPUSKUType returned by InferHardwareSystem. Each row is
// run as its own subtest.
func TestInferHardwareSystem(t *testing.T) {
	cases := []struct {
		name    string
		product string
		want    nvidiacomv1beta1.GPUSKUType
	}{
		// --- Empty / unknown ---
		{name: "empty input", product: "", want: ""},
		{name: "unknown gpu", product: "random-gpu", want: ""},

		// --- Blackwell ---
		{name: "GB200 SXM", product: "GB200-SXM", want: nvidiacomv1beta1.GPUSKUTypeGB200SXM},
		{name: "GB200 HGX (implies SXM)", product: "HGX GB200", want: nvidiacomv1beta1.GPUSKUTypeGB200SXM},
		{name: "B200 SXM", product: "B200 SXM", want: nvidiacomv1beta1.GPUSKUTypeB200SXM},

		// --- Hopper ---
		{name: "H100 SXM", product: "H100 SXM", want: nvidiacomv1beta1.GPUSKUTypeH100SXM},
		{name: "H100 PCIe explicit", product: "H100 PCIe", want: nvidiacomv1beta1.GPUSKUTypeH100PCIe},
		{name: "H100 default PCIe", product: "H100", want: nvidiacomv1beta1.GPUSKUTypeH100PCIe},
		{name: "H200 SXM", product: "H200 SXM", want: nvidiacomv1beta1.GPUSKUTypeH200SXM},

		// --- Ampere ---
		{name: "A100 SXM", product: "A100-SXM", want: nvidiacomv1beta1.GPUSKUTypeA100SXM},
		{name: "A100 PCIe", product: "A100 PCIe", want: nvidiacomv1beta1.GPUSKUTypeA100PCIe},
		{name: "A100 default PCIe", product: "A100", want: nvidiacomv1beta1.GPUSKUTypeA100PCIe},

		// --- Ada ---
		{name: "L40S", product: "L40S", want: nvidiacomv1beta1.GPUSKUTypeL40S},
		{name: "L40S should not match L40", product: "L40S", want: nvidiacomv1beta1.GPUSKUTypeL40S},
		{name: "L40", product: "L40", want: nvidiacomv1beta1.GPUSKUTypeL40},
		{name: "L4", product: "L4", want: nvidiacomv1beta1.GPUSKUTypeL4},

		// --- Volta / Turing ---
		{name: "V100 SXM", product: "V100 SXM", want: nvidiacomv1beta1.GPUSKUTypeV100SXM},
		{name: "V100 PCIe", product: "V100 PCIe", want: nvidiacomv1beta1.GPUSKUTypeV100PCIe},
		{name: "T4", product: "T4", want: nvidiacomv1beta1.GPUSKUTypeT4},

		// --- AMD ---
		{name: "MI300", product: "MI300", want: nvidiacomv1beta1.GPUSKUTypeMI300},
		{name: "MI250", product: "MI250", want: nvidiacomv1beta1.GPUSKUTypeMI200},
		{name: "MI200", product: "MI200", want: nvidiacomv1beta1.GPUSKUTypeMI200},

		// --- Bare DCGM model names (no form factor suffix) ---
		// DCGM often reports "NVIDIA H200" / "NVIDIA B200" with system="" because
		// there is no SXM/HGX/DGX token in the string. GPUs that have no PCIe
		// variant must still resolve to their SXM SKU.
		{name: "NVIDIA H200 bare (DCGM format, no SXM suffix)", product: "NVIDIA H200", want: nvidiacomv1beta1.GPUSKUTypeH200SXM},
		{name: "NVIDIA B200 bare (DCGM format, no SXM suffix)", product: "NVIDIA B200", want: nvidiacomv1beta1.GPUSKUTypeB200SXM},
		{name: "NVIDIA GB200 bare (DCGM format, no SXM suffix)", product: "NVIDIA GB200", want: nvidiacomv1beta1.GPUSKUTypeGB200SXM},
		{name: "H200 bare without vendor prefix", product: "H200", want: nvidiacomv1beta1.GPUSKUTypeH200SXM},
		// H100/A100 still default to PCIe when no form factor indicator is present,
		// because those GPUs have a real PCIe variant.
		{name: "H100 bare still defaults to PCIe (has PCIe variant)", product: "H100", want: nvidiacomv1beta1.GPUSKUTypeH100PCIe},
		{name: "A100 bare still defaults to PCIe (has PCIe variant)", product: "A100", want: nvidiacomv1beta1.GPUSKUTypeA100PCIe},

		// --- Normalization tests ---
		{name: "lowercase + spaces", product: "h100 sxm", want: nvidiacomv1beta1.GPUSKUTypeH100SXM},
		{name: "mixed case + dash", product: "A100-sXm", want: nvidiacomv1beta1.GPUSKUTypeA100SXM},
		{name: "with extra spaces", product: "  H100   PCIe ", want: nvidiacomv1beta1.GPUSKUTypeH100PCIe},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := InferHardwareSystem(tc.product); got != tc.want {
				t.Errorf("InferHardwareSystem(%q) = %v, want %v",
					tc.product, got, tc.want)
			}
		})
	}
}

// TestInferHardwareSystem_CaseInsensitive expects every casing of the same
// H100 SXM5 product string to resolve to "h100_sxm".
func TestInferHardwareSystem_CaseInsensitive(t *testing.T) {
	for _, product := range []string{
		"h100-sxm5-80gb",
		"H100-SXM5-80GB",
		"H100-sxm5-80GB",
		"h100-SXM5-80gb",
	} {
		got := string(InferHardwareSystem(product))
		assert.Equal(t, "h100_sxm", got, "Should handle case variations: %s", product)
	}
}

// TestInferHardwareSystem_SpacesAndDashes expects the same H100 SXM5 product
// string to resolve to "h100_sxm" whether its parts are separated by dashes,
// spaces, a mix of both, or nothing at all.
func TestInferHardwareSystem_SpacesAndDashes(t *testing.T) {
	for _, product := range []string{
		"H100-SXM5-80GB",
		"H100 SXM5 80GB",
		"H100SXM580GB",
		"H100-SXM5 80GB",
	} {
		got := string(InferHardwareSystem(product))
		assert.Equal(t, "h100_sxm", got, "Should normalize spaces/dashes: %s", product)
	}
}

// TestNormalize pins the output of normalize for a set of inputs. The expected
// values are upper-case and contain neither spaces nor dashes.
func TestNormalize(t *testing.T) {
	cases := []struct {
		name string
		raw  string
		want string
	}{
		{name: "basic lowercase", raw: "h100", want: "H100"},
		{name: "spaces removed", raw: "H100 SXM", want: "H100SXM"},
		{name: "dashes replaced and removed", raw: "H100-SXM", want: "H100SXM"},
		{name: "mixed spaces and dashes", raw: "A100 - SXM", want: "A100SXM"},
		{name: "extra whitespace", raw: "  H100   PCIe ", want: "H100PCIE"},
		{name: "complex string", raw: "h100-sxm5-80gb", want: "H100SXM580GB"},
		{name: "empty string", raw: "", want: ""},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := normalize(tc.raw); got != tc.want {
				t.Errorf("normalize(%q) = %q, want %q",
					tc.raw, got, tc.want)
			}
		})
	}
}

// TestDetectFormFactor pins the form factor returned by detectFormFactor for
// already-normalized product strings: SXM, HGX and DGX tokens are expected to
// give formFactorSXM, everything else formFactorPCIe.
func TestDetectFormFactor(t *testing.T) {
	cases := []struct {
		name       string
		normalized string // input is passed as-is, i.e. already normalized
		want       string
	}{
		{name: "detect SXM explicitly", normalized: "H100SXM", want: formFactorSXM},
		{name: "detect HGX implies SXM", normalized: "HGXH100", want: formFactorSXM},
		{name: "detect DGX implies SXM", normalized: "DGXH100", want: formFactorSXM},
		{name: "detect PCIe explicitly", normalized: "H100PCIE", want: formFactorPCIe},
		{name: "default to PCIe when unknown", normalized: "H100", want: formFactorPCIe},
		{name: "SXM wins over PCIe if both present", normalized: "H100SXMPCIE", want: formFactorSXM},
		{name: "random string defaults to PCIe", normalized: "RANDOMGPU", want: formFactorPCIe},
		{name: "empty string defaults to PCIe", normalized: "", want: formFactorPCIe},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := detectFormFactor(tc.normalized); got != tc.want {
				t.Errorf("detectFormFactor(%q) = %v, want %v",
					tc.normalized, got, tc.want)
			}
		})
	}
}

// TestParseMetrics feeds parseMetrics hand-built DCGM metric families for one
// host with two GPUs and checks the resulting node name, GPU count, model,
// VRAM and MIG fields.
func TestParseMetrics(t *testing.T) {
	// Small builders keep the fixture below readable.
	label := func(name, value string) *dto.LabelPair {
		return &dto.LabelPair{Name: strPtr(name), Value: strPtr(value)}
	}
	// identity is a sample carrying the GPU index, model and hostname labels.
	identity := func(gpu string) *dto.Metric {
		return &dto.Metric{
			Label: []*dto.LabelPair{
				label("gpu", gpu),
				label("modelName", "H100-SXM5-80GB"),
				label("Hostname", "node1"),
			},
		}
	}
	// gauge is a sample carrying only the GPU index label plus a gauge value.
	gauge := func(gpu string, value float64) *dto.Metric {
		return &dto.Metric{
			Label: []*dto.LabelPair{label("gpu", gpu)},
			Gauge: &dto.Gauge{Value: float64Ptr(value)},
		}
	}
	family := func(samples ...*dto.Metric) *dto.MetricFamily {
		return &dto.MetricFamily{Metric: samples}
	}

	// Fake DCGM metrics for a node with 2 GPUs
	metricFamilies := map[string]*dto.MetricFamily{
		"DCGM_FI_DEV_GPU_TEMP":    family(identity("0"), identity("1")),
		"DCGM_FI_DEV_FB_FREE":     family(gauge("0", 10000), gauge("1", 12000)),
		"DCGM_FI_DEV_FB_USED":     family(gauge("0", 5000), gauge("1", 6000)),
		"DCGM_FI_DEV_FB_RESERVED": family(gauge("0", 0), gauge("1", 0)),
	}

	parsed, err := parseMetrics(context.Background(), metricFamilies)
	require.NoError(t, err)

	assert.Equal(t, "node1", parsed.NodeName)
	assert.Equal(t, 2, parsed.GPUsPerNode)
	assert.Equal(t, "H100-SXM5-80GB", parsed.Model)
	// maxVRAM: 12000 + 6000 + 0 = 18000 (free + used + reserved of GPU 1)
	assert.Equal(t, 18000, parsed.VRAMPerGPU)
	assert.False(t, parsed.MIGEnabled)
	assert.Empty(t, parsed.MIGProfiles)
}

func TestScrapeMetricsEndpoint(t *testing.T) {
	ctx := context.TODO()

	// Prepare a fake HTTP server to simulate Prometheus metrics
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		_, err := fmt.Fprintln(w, `# HELP DCGM_FI_DEV_GPU_TEMP GPU temperature`)
		require.NoError(t, err)
		_, err = fmt.Fprintln(w, `# TYPE DCGM_FI_DEV_GPU_TEMP gauge`)
		require.NoError(t, err)
		_, err = fmt.Fprintln(w, `DCGM_FI_DEV_GPU_TEMP{gpu="0",modelName="NVIDIA A100",Hostname="test-node"} 50`)
		require.NoError(t, err)

		_, err = fmt.Fprintln(w, `# HELP DCGM_FI_DEV_FB_FREE Framebuffer free`)
		require.NoError(t, err)
		_, err = fmt.Fprintln(w, `# TYPE DCGM_FI_DEV_FB_FREE gauge`)
		require.NoError(t, err)
		_, err = fmt.Fprintln(w, `DCGM_FI_DEV_FB_FREE{gpu="0",Hostname="test-node"} 10000`)
		require.NoError(t, err)

		_, err = fmt.Fprintln(w, `# HELP DCGM_FI_DEV_FB_USED Framebuffer used`)
		require.NoError(t, err)
		_, err = fmt.Fprintln(w, `# TYPE DCGM_FI_DEV_FB_USED gauge`)
		require.NoError(t, err)
		_, err = fmt.Fprintln(w, `DCGM_FI_DEV_FB_USED{gpu="0",Hostname="test-node"} 2000`)
		require.NoError(t, err)

		_, err = fmt.Fprintln(w, `# HELP DCGM_FI_DEV_FB_RESERVED Framebuffer reserved`)
		require.NoError(t, err)
		_, err = fmt.Fprintln(w, `# TYPE DCGM_FI_DEV_FB_RESERVED gauge`)
		require.NoError(t, err)
		_, err = fmt.Fprintln(w, `DCGM_FI_DEV_FB_RESERVED{gpu="0",Hostname="test-node"} 500`)
		require.NoError(t, err)
	}))
	defer server.Close()

	t.Run("successful scrape", func(t *testing.T) {
		info, err := ScrapeMetricsEndpoint(ctx, server.URL)
		if err != nil {
			t.Fatalf("expected no error, got %v", err)
		}
		if info == nil {
			t.Fatal("expected non-nil GPUInfo")
		}
	})

	t.Run("404 response", func(t *testing.T) {
		badServer := httptest.NewServer(http.NotFoundHandler())
		defer badServer.Close()

		_, err := ScrapeMetricsEndpoint(ctx, badServer.URL)
		expectedErr := fmt.Sprintf("metrics endpoint %s returned status 404", badServer.URL)
		if err == nil || err.Error() != expectedErr {
			t.Fatalf("expected %q, got %v", expectedErr, err)
		}
	})

	t.Run("invalid metrics", func(t *testing.T) {
		invalidServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			_, err := fmt.Fprintln(w, `not a prometheus format`)
			require.NoError(t, err)
		}))
		defer invalidServer.Close()

		_, err := ScrapeMetricsEndpoint(ctx, invalidServer.URL)
		if err == nil {
			t.Fatal("expected parse error, got nil")
		}
	})
}

// TestDiscoverGPUsFromDCGM_CacheHit calls DiscoverGPUsFromDCGM twice with the
// same cache and expects the scraper to be invoked only on the first call,
// with both calls returning equal results.
func TestDiscoverGPUsFromDCGM_CacheHit(t *testing.T) {
	ctx := context.Background()

	// A single running DCGM exporter pod for discovery to find.
	exporterPod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "dcgm-pod",
			Namespace: "default",
			Labels:    map[string]string{LabelApp: LabelValueNvidiaDCGMExporter},
		},
		Status: corev1.PodStatus{Phase: corev1.PodRunning, PodIP: "10.0.0.1"},
	}

	scheme := runtime.NewScheme()
	require.NoError(t, corev1.AddToScheme(scheme))
	k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(exporterPod).Build()

	// The scraper stub counts its invocations and returns a fresh, fixed
	// GPUInfo each time.
	scrapes := 0
	countingScraper := func(ctx context.Context, endpoint string) (*GPUInfo, error) {
		scrapes++
		return &GPUInfo{
			NodeName:    "node-a",
			GPUsPerNode: 4,
			Model:       "A100",
			VRAMPerGPU:  40960,
			MIGEnabled:  false,
			MIGProfiles: map[string]int{},
			System:      "DGX",
		}, nil
	}

	discovery := NewGPUDiscovery(countingScraper)
	cache := NewGPUDiscoveryCache()

	// First call → should scrape
	first, err := discovery.DiscoverGPUsFromDCGM(ctx, k8sClient, cache)
	require.NoError(t, err)
	require.NotNil(t, first)
	require.Equal(t, 1, scrapes)

	// Second call → should hit cache, so the scrape count must not move
	second, err := discovery.DiscoverGPUsFromDCGM(ctx, k8sClient, cache)
	require.NoError(t, err)
	require.NotNil(t, second)
	require.Equal(t, 1, scrapes)

	require.Equal(t, first, second)
}

func TestDiscoverGPUsFromDCGMFiltered_MixedSKU(t *testing.T) {
	ctx := context.Background()

	// Two DCGM pods, one per node
	pods := []client.Object{
		&corev1.Pod{
			ObjectMeta: metav1.ObjectMeta{Name: "dcgm-h100", Namespace: "gpu-operator",
				Labels: map[string]string{LabelApp: LabelValueNvidiaDCGMExporter}},
			Status: corev1.PodStatus{Phase: corev1.PodRunning, PodIP: "10.0.0.1"},
		},
		&corev1.Pod{
			ObjectMeta: metav1.ObjectMeta{Name: "dcgm-a100", Namespace: "gpu-operator",
				Labels: map[string]string{LabelApp: LabelValueNvidiaDCGMExporter}},
			Status: corev1.PodStatus{Phase: corev1.PodRunning, PodIP: "10.0.0.2"},
		},
	}

	scheme := runtime.NewScheme()
	require.NoError(t, corev1.AddToScheme(scheme))
	k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(pods...).Build()

	// Return different GPU models per pod IP. H100 has more VRAM to win tie-breaking.
	mockScraper := func(ctx context.Context, endpoint string) (*GPUInfo, error) {
		if strings.Contains(endpoint, "10.0.0.1") {
			return &GPUInfo{NodeName: "node-h100", GPUsPerNode: 8, Model: "H100-SXM5-80GB", VRAMPerGPU: 81920}, nil
		}
		return &GPUInfo{NodeName: "node-a100", GPUsPerNode: 8, Model: "A100-SXM4-80GB", VRAMPerGPU: 40960}, nil
	}

	discovery := NewGPUDiscovery(mockScraper)

	t.Run("unfiltered selects best and counts only matching SKU", func(t *testing.T) {
		info, err := discovery.DiscoverGPUsFromDCGMFiltered(ctx, k8sClient, nil, "")
		require.NoError(t, err)
		assert.Equal(t, "h100_sxm", string(info.System))
		assert.Equal(t, 1, info.NodesWithGPUs, "should count only H100 nodes")
	})

	t.Run("filter by a100_sxm", func(t *testing.T) {
		info, err := discovery.DiscoverGPUsFromDCGMFiltered(ctx, k8sClient, nil, "a100_sxm")
		require.NoError(t, err)
		assert.Equal(t, "a100_sxm", string(info.System))
		assert.Equal(t, 1, info.NodesWithGPUs)
		assert.Equal(t, "A100-SXM4-80GB", info.Model)
	})

	t.Run("filter by nonexistent SKU", func(t *testing.T) {
		_, err := discovery.DiscoverGPUsFromDCGMFiltered(ctx, k8sClient, nil, "l40s")
		require.Error(t, err)
		assert.Contains(t, err.Error(), "no GPU nodes matching SKU")
	})

	t.Run("cache is per SKU", func(t *testing.T) {
		cache := NewGPUDiscoveryCache()
		info1, err := discovery.DiscoverGPUsFromDCGMFiltered(ctx, k8sClient, cache, "")
		require.NoError(t, err)
		info2, err := discovery.DiscoverGPUsFromDCGMFiltered(ctx, k8sClient, cache, "a100_sxm")
		require.NoError(t, err)
		assert.NotEqual(t, info1.System, info2.System, "different SKU filters should return different results")
	})
}

// TestDiscoverGPUsFromDCGM_GPUOperatorInstalled_DCgmNotEnabled sets up a
// cluster that has a running GPU Operator pod but no DCGM exporter pod and
// expects DiscoverGPUsFromDCGM to return a nil result with a
// "DCGM is not enabled in the GPU Operator" error.
func TestDiscoverGPUsFromDCGM_GPUOperatorInstalled_DCgmNotEnabled(t *testing.T) {
	operatorPod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "gpu-operator-abc",
			Namespace: "gpu-operator",
			Labels:    map[string]string{LabelApp: LabelValueGPUOperator},
		},
		Status: corev1.PodStatus{Phase: corev1.PodRunning},
	}

	scheme := runtime.NewScheme()
	require.NoError(t, corev1.AddToScheme(scheme))
	k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(operatorPod).Build()

	// Scraper stub that always fails; the error text flags an unexpected call.
	failingScraper := func(ctx context.Context, endpoint string) (*GPUInfo, error) {
		return nil, fmt.Errorf("should not be called")
	}

	discovery := NewGPUDiscovery(failingScraper)
	got, err := discovery.DiscoverGPUsFromDCGM(context.Background(), k8sClient, NewGPUDiscoveryCache())

	require.Nil(t, got)
	require.Error(t, err)
	require.Contains(t, err.Error(), "DCGM is not enabled in the GPU Operator")
}

// TestDiscoverGPUsFromDCGM_NoGPUOperator_NoDCGM runs DiscoverGPUsFromDCGM
// against an empty cluster and expects a nil result with a
// "gpu operator is not installed" error.
func TestDiscoverGPUsFromDCGM_NoGPUOperator_NoDCGM(t *testing.T) {
	ctx := context.Background()

	scheme := runtime.NewScheme()
	require.NoError(t, corev1.AddToScheme(scheme))

	k8sClient := fake.NewClientBuilder().
		WithScheme(scheme).
		Build()

	cache := NewGPUDiscoveryCache()

	// Scraper stub that always fails; the error text flags an unexpected call.
	dummyScraper := func(ctx context.Context, endpoint string) (*GPUInfo, error) {
		return nil, fmt.Errorf("should not be called")
	}

	discovery := NewGPUDiscovery(dummyScraper)

	info, err := discovery.DiscoverGPUsFromDCGM(ctx, k8sClient, cache)

	require.Nil(t, info)
	require.Error(t, err)
	// require.Contains prints the actual error text on failure, unlike
	// require.True(strings.Contains(...)) which only reports "should be true".
	require.Contains(t, err.Error(), "gpu operator is not installed")
}

// TestListDCGMExporterPods is a table-driven test of listDCGMExporterPods. It
// covers pods matched through different labels, a pod matching more than one
// label (expected once in the result), an empty cluster and a client whose
// List call fails.
func TestListDCGMExporterPods(t *testing.T) {
	scheme := runtime.NewScheme()
	// Fail fast on a broken scheme instead of producing confusing lookup
	// errors further down.
	require.NoError(t, corev1.AddToScheme(scheme))

	ctx := context.Background()

	tests := []struct {
		name        string
		objects     []client.Object
		expectCount int
		expectErr   bool
		errorClient bool // use errorListClient instead of the fake client
	}{
		{
			name: "pods found via different selectors",
			objects: []client.Object{
				&corev1.Pod{
					ObjectMeta: metav1.ObjectMeta{
						Name:      "pod1",
						Namespace: "ns1",
						Labels: map[string]string{
							LabelApp: LabelValueNvidiaDCGMExporter,
						},
					},
				},
				&corev1.Pod{
					ObjectMeta: metav1.ObjectMeta{
						Name:      "pod2",
						Namespace: "ns1",
						Labels: map[string]string{
							LabelAppKubernetesName: LabelValueDCGMExporter,
						},
					},
				},
			},
			expectCount: 2,
			expectErr:   false,
		},
		{
			name: "duplicate pods across selectors should dedupe",
			objects: []client.Object{
				&corev1.Pod{
					ObjectMeta: metav1.ObjectMeta{
						Name:      "pod1",
						Namespace: "ns1",
						Labels: map[string]string{
							LabelApp:               LabelValueDCGMExporter,
							LabelAppKubernetesName: LabelValueDCGMExporter,
						},
					},
				},
			},
			expectCount: 1,
			expectErr:   false,
		},
		{
			name:        "no pods found",
			objects:     []client.Object{},
			expectCount: 0,
			expectErr:   true,
		},
		{
			name:        "client list error",
			objects:     []client.Object{},
			expectCount: 0,
			expectErr:   true,
			errorClient: true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			var k8sClient client.Reader

			if tt.errorClient {
				k8sClient = &errorListClient{}
			} else {
				k8sClient = fake.NewClientBuilder().
					WithScheme(scheme).
					WithObjects(tt.objects...).
					Build()
			}

			pods, err := listDCGMExporterPods(ctx, k8sClient)

			if tt.expectErr && err == nil {
				t.Fatalf("expected error but got nil")
			}
			if !tt.expectErr && err != nil {
				t.Fatalf("unexpected error: %v", err)
			}
			if len(pods) != tt.expectCount {
				t.Fatalf("expected %d pods, got %d", tt.expectCount, len(pods))
			}
		})
	}
}

//
// ---- Fake client that forces List error ----
//

// errorListClient is a client.Reader stub whose List always fails. Tests build
// it with the embedded Reader left nil, so only List may be called on it.
type errorListClient struct {
	client.Reader
}

// List ignores all of its arguments and returns a fixed error.
func (*errorListClient) List(context.Context, client.ObjectList, ...client.ListOption) error {
	return errors.New("forced list error")
}

// --- Helper functions ---

// strPtr returns a pointer to a fresh copy of s.
func strPtr(s string) *string {
	v := s
	return &v
}

// float64Ptr returns a pointer to a fresh copy of f.
func float64Ptr(f float64) *float64 {
	v := f
	return &v
}

// TestGetCloudProviderInfo is a table-driven test of GetCloudProviderInfo. Each
// case builds a fake client that holds at most one node and checks the
// returned provider string and whether an error is reported.
func TestGetCloudProviderInfo(t *testing.T) {
	scheme := runtime.NewScheme()
	require.NoError(t, corev1.AddToScheme(scheme))

	tests := []struct {
		name    string
		node    *corev1.Node // nil: the cluster contains no nodes
		want    string
		wantErr bool
	}{
		{
			name: "AKS via providerID",
			node: &corev1.Node{Spec: corev1.NodeSpec{
				ProviderID: "azure:///subscriptions/xxx/resourceGroups/rg/providers/Microsoft.Compute/virtualMachines/vm1",
			}},
			want: "aks",
		},
		{
			name: "AWS via providerID",
			node: &corev1.Node{Spec: corev1.NodeSpec{
				ProviderID: "aws:///us-west-2/i-0123456789abcdef0",
			}},
			want: "aws",
		},
		{
			name: "GCP via providerID",
			node: &corev1.Node{Spec: corev1.NodeSpec{
				ProviderID: "gce://project/zone/instance",
			}},
			want: "gcp",
		},
		{
			name: "AKS via label",
			node: &corev1.Node{ObjectMeta: metav1.ObjectMeta{
				Labels: map[string]string{"kubernetes.azure.com/cluster": "mycluster"},
			}},
			want: "aks",
		},
		{
			name: "AWS via label",
			node: &corev1.Node{ObjectMeta: metav1.ObjectMeta{
				Labels: map[string]string{"eks.amazonaws.com/nodegroup": "ng-1"},
			}},
			want: "aws",
		},
		{
			name: "GCP via label",
			node: &corev1.Node{ObjectMeta: metav1.ObjectMeta{
				Labels: map[string]string{"cloud.google.com/gke-nodepool": "np-1"},
			}},
			want: "gcp",
		},
		{
			name: "Other node",
			node: &corev1.Node{ObjectMeta: metav1.ObjectMeta{
				Labels: map[string]string{"custom-label": "foo"},
			}},
			want: "other",
		},
		{
			// node is nil: the fake client is built without any objects.
			name:    "No nodes",
			want:    "unknown",
			wantErr: true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			ctx := context.Background()

			// The fixture is selected by the node field, not by the case name,
			// so renaming a case cannot silently change what it tests.
			builder := fake.NewClientBuilder().WithScheme(scheme)
			if tt.node != nil {
				builder = builder.WithObjects(tt.node)
			}
			var k8sClient client.Reader = builder.Build()

			got, err := GetCloudProviderInfo(ctx, k8sClient)
			if (err != nil) != tt.wantErr {
				t.Errorf("GetCloudProviderInfo() error = %v, wantErr %v", err, tt.wantErr)
			}
			if got != tt.want {
				t.Errorf("got %q, want %q", got, tt.want)
			}
		})
	}
}

// TestDetectRDMAFromNode is a table-driven test of detectRDMAFromNode. Each
// case optionally registers one node with the fake client, looks a node up by
// name and checks the returned flag and type string.
func TestDetectRDMAFromNode(t *testing.T) {
	scheme := runtime.NewScheme()
	// Fail fast on a broken scheme instead of producing confusing lookup
	// errors further down.
	require.NoError(t, corev1.AddToScheme(scheme))

	tests := []struct {
		name        string
		node        *corev1.Node // nil: nothing is registered with the client
		nodeName    string       // name passed to detectRDMAFromNode
		expectedOK  bool
		expectedTyp string
	}{
		{
			name:        "node not found",
			node:        nil,
			nodeName:    "missing-node",
			expectedOK:  false,
			expectedTyp: strNone,
		},
		{
			name: "rdma detected",
			node: &corev1.Node{
				ObjectMeta: metav1.ObjectMeta{
					Name: "node-rdma",
					Labels: map[string]string{
						"nvidia.com/rdma.present": "true",
					},
				},
			},
			nodeName:    "node-rdma",
			expectedOK:  true,
			expectedTyp: "rdma",
		},
		{
			name: "sriov detected",
			node: &corev1.Node{
				ObjectMeta: metav1.ObjectMeta{
					Name: "node-sriov",
					Labels: map[string]string{
						"feature.node.kubernetes.io/network-sriov.capable": "true",
					},
				},
			},
			nodeName:    "node-sriov",
			expectedOK:  true,
			expectedTyp: "sriov",
		},
		{
			name: "both rdma and sriov - rdma takes precedence",
			node: &corev1.Node{
				ObjectMeta: metav1.ObjectMeta{
					Name: "node-both",
					Labels: map[string]string{
						"nvidia.com/rdma.present":                          "true",
						"feature.node.kubernetes.io/network-sriov.capable": "true",
					},
				},
			},
			nodeName:    "node-both",
			expectedOK:  true,
			expectedTyp: "rdma",
		},
		{
			name: "no relevant labels",
			node: &corev1.Node{
				ObjectMeta: metav1.ObjectMeta{
					Name:   "node-none",
					Labels: map[string]string{},
				},
			},
			nodeName:    "node-none",
			expectedOK:  false,
			expectedTyp: strNone,
		},
		{
			name: "labels present but false",
			node: &corev1.Node{
				ObjectMeta: metav1.ObjectMeta{
					Name: "node-false",
					Labels: map[string]string{
						"nvidia.com/rdma.present":                          "false",
						"feature.node.kubernetes.io/network-sriov.capable": "false",
					},
				},
			},
			nodeName:    "node-false",
			expectedOK:  false,
			expectedTyp: strNone,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			var objs []runtime.Object
			if tt.node != nil {
				objs = append(objs, tt.node)
			}

			fakeClient := fake.NewClientBuilder().
				WithScheme(scheme).
				WithRuntimeObjects(objs...).
				Build()

			ok, typ := detectRDMAFromNode(context.TODO(), fakeClient, tt.nodeName)

			if ok != tt.expectedOK {
				t.Errorf("expected ok=%v, got %v", tt.expectedOK, ok)
			}
			if typ != tt.expectedTyp {
				t.Errorf("expected type=%s, got %s", tt.expectedTyp, typ)
			}
		})
	}
}