"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "2b465570e6dd327e8422ef9c87e9b2b1454ceaed"
Unverified Commit d07288c5 authored by Dr. Stefan Schimanski, committed by GitHub
Browse files

fix(operator): run GPU discovery when any hardware field is missing (#8267)


Signed-off-by: Dr. Stefan Schimanski <sschimanski@nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent bae41d44
......@@ -203,21 +203,23 @@ class FeaturesSpec(BaseModel):
class HardwareSpec(BaseModel):
"""HardwareSpec describes the hardware resources available for profiling and deployment. These fields are typically auto-filled by the operator from cluster discovery."""
"""HardwareSpec describes the GPU hardware for profiling and deployment. All fields are auto-detected from cluster GPU nodes when omitted (requires cluster-wide mode with GPU discovery enabled). gpuSku is a selector (restricts which nodes are considered); the other fields are pure overrides passed to the profiler. If all four fields are set, discovery is skipped."""
gpuSku: Optional[GPUSKUType] = Field(
default=None,
description="GPUSKU is the AIC hardware system identifier for the GPU. When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.",
description="GPUSKU selects the GPU type to target. When omitted, auto-detected by selecting the GPU with the highest node count, then highest VRAM. In mixed-GPU clusters, set this to choose which GPU type to use. Discovery and totalGpus are then restricted to nodes matching this SKU.",
)
vramMb: Optional[float] = Field(
default=None, description="VRAMMB is the VRAM per GPU in MiB."
default=None,
description="VRAMMB is the VRAM per GPU in MiB. When omitted, auto-detected from cluster GPU nodes.",
)
totalGpus: Optional[int] = Field(
default=None,
description="TotalGPUs is the total number of GPUs available in the cluster.",
description="TotalGPUs is the GPU budget for profiling and deployment. The profiler uses this to determine parallelism and replica count. When omitted, computed by counting GPUs on discovered nodes (filtered by gpuSku when set), temporarily capped at 32 to limit profiler search space. This cap may be removed in a future release. Set this field explicitly to override.",
)
numGpusPerNode: Optional[int] = Field(
default=None, description="NumGPUsPerNode is the number of GPUs per node."
default=None,
description="NumGPUsPerNode is the number of GPUs per node. When omitted, auto-detected from cluster GPU nodes.",
)
interconnect: Optional[str] = Field(
default=None,
......
......@@ -610,8 +610,11 @@ spec:
- mi200
- mi300
description: |-
GPUSKU is the AIC hardware system identifier for the GPU.
When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
GPUSKU selects the GPU type to target.
When omitted, auto-detected by selecting the GPU with the highest
node count, then highest VRAM. In mixed-GPU clusters, set this to
choose which GPU type to use. Discovery and totalGpus are then
restricted to nodes matching this SKU.
type: string
interconnect:
description: |-
......@@ -635,7 +638,9 @@ spec:
Example values: "pcie", "nvlink". Other values may be accepted but may not be auto-detected.
type: string
numGpusPerNode:
description: NumGPUsPerNode is the number of GPUs per node.
description: |-
NumGPUsPerNode is the number of GPUs per node.
When omitted, auto-detected from cluster GPU nodes.
format: int32
type: integer
rdma:
......@@ -660,11 +665,19 @@ spec:
conservative defaults / fallback transports.
type: boolean
totalGpus:
description: TotalGPUs is the total number of GPUs available in the cluster.
description: |-
TotalGPUs is the GPU budget for profiling and deployment.
The profiler uses this to determine parallelism and replica count.
When omitted, computed by counting GPUs on discovered nodes
(filtered by gpuSku when set), temporarily capped at 32 to
limit profiler search space. This cap may be removed in a future
release. Set this field explicitly to override.
format: int32
type: integer
vramMb:
description: VRAMMB is the VRAM per GPU in MiB.
description: |-
VRAMMB is the VRAM per GPU in MiB.
When omitted, auto-detected from cluster GPU nodes.
type: number
type: object
image:
......
......@@ -333,24 +333,38 @@ type FeaturesSpec struct {
Mocker *MockerSpec `json:"mocker,omitempty"`
}
// HardwareSpec describes the hardware resources available for profiling and deployment.
// These fields are typically auto-filled by the operator from cluster discovery.
// HardwareSpec describes the GPU hardware for profiling and deployment.
// All fields are auto-detected from cluster GPU nodes when omitted
// (requires cluster-wide mode with GPU discovery enabled).
// gpuSku is a selector (restricts which nodes are considered);
// the other fields are pure overrides passed to the profiler.
// If all four fields are set, discovery is skipped.
type HardwareSpec struct {
// GPUSKU is the AIC hardware system identifier for the GPU.
// When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
// GPUSKU selects the GPU type to target.
// When omitted, auto-detected from the node with the highest per-node
// GPU count, then highest VRAM per GPU. In mixed-GPU clusters, set this
// to choose which GPU type to use. When set, discovery and totalGpus are
// restricted to nodes matching this SKU.
// +optional
// +kubebuilder:validation:Enum=gb200_sxm;b200_sxm;h200_sxm;h100_sxm;h100_pcie;a100_sxm;a100_pcie;l40s;l40;l4;v100_sxm;v100_pcie;t4;mi200;mi300
GPUSKU GPUSKUType `json:"gpuSku,omitempty"`
// VRAMMB is the VRAM per GPU in MiB.
// When omitted, auto-detected from cluster GPU nodes.
// +optional
VRAMMB *float64 `json:"vramMb,omitempty"`
// TotalGPUs is the total number of GPUs available in the cluster.
// TotalGPUs is the GPU budget for profiling and deployment.
// The profiler uses this to determine parallelism and replica count.
// When omitted, computed by counting GPUs on discovered nodes
// (filtered by gpuSku when set), temporarily capped at 32 to
// limit profiler search space. This cap may be removed in a future
// release. Set this field explicitly to override.
// +optional
TotalGPUs *int32 `json:"totalGpus,omitempty"`
// NumGPUsPerNode is the number of GPUs per node.
// When omitted, auto-detected from cluster GPU nodes.
// +optional
NumGPUsPerNode *int32 `json:"numGpusPerNode,omitempty"`
// Interconnect describes the primary GPU-to-GPU interconnect *within a node*.
......
......@@ -610,8 +610,11 @@ spec:
- mi200
- mi300
description: |-
GPUSKU is the AIC hardware system identifier for the GPU.
When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
GPUSKU selects the GPU type to target.
When omitted, auto-detected by selecting the GPU with the highest
node count, then highest VRAM. In mixed-GPU clusters, set this to
choose which GPU type to use. Discovery and totalGpus are then
restricted to nodes matching this SKU.
type: string
interconnect:
description: |-
......@@ -635,7 +638,9 @@ spec:
Example values: "pcie", "nvlink". Other values may be accepted but may not be auto-detected.
type: string
numGpusPerNode:
description: NumGPUsPerNode is the number of GPUs per node.
description: |-
NumGPUsPerNode is the number of GPUs per node.
When omitted, auto-detected from cluster GPU nodes.
format: int32
type: integer
rdma:
......@@ -660,11 +665,19 @@ spec:
conservative defaults / fallback transports.
type: boolean
totalGpus:
description: TotalGPUs is the total number of GPUs available in the cluster.
description: |-
TotalGPUs is the GPU budget for profiling and deployment.
The profiler uses this to determine parallelism and replica count.
When omitted, computed by counting GPUs on discovered nodes
(filtered by gpuSku when set), temporarily capped at 32 to
limit profiler search space. This cap may be removed in a future
release. Set this field explicitly to override.
format: int32
type: integer
vramMb:
description: VRAMMB is the VRAM per GPU in MiB.
description: |-
VRAMMB is the VRAM per GPU in MiB.
When omitted, auto-detected from cluster GPU nodes.
type: number
type: object
image:
......
......@@ -1046,13 +1046,9 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Contex
func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) error {
logger := log.FromContext(ctx)
// Check if user provided hardware info in the typed spec
hasManualConfig := dgdr.Spec.Hardware != nil && (dgdr.Spec.Hardware.GPUSKU != "" ||
dgdr.Spec.Hardware.VRAMMB != nil ||
dgdr.Spec.Hardware.NumGPUsPerNode != nil)
// If manual config is provided, validation passes
if hasManualConfig {
// All four hardware fields set — discovery not needed.
hw := dgdr.Spec.Hardware
if hw != nil && hw.GPUSKU != "" && hw.VRAMMB != nil && hw.NumGPUsPerNode != nil && hw.TotalGPUs != nil {
return nil
}
......@@ -1064,9 +1060,10 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx con
"\n\n1. Re-enable GPU discovery (if it was disabled during Helm install):" +
"\n helm upgrade ... --set dynamo-operator.gpuDiscovery.enabled=true" +
"\n\n2. Add hardware config to spec.hardware:" +
"\n gpuSku: \"h100_sxm\"" +
"\n vramMb: 81920" +
"\n numGpusPerNode: 8" +
"\n gpuSku: \"H100-SXM5-80GB\"" +
"\n vramMb: 81920")
"\n totalGpus: 8")
}
_, err := r.GPUDiscovery.DiscoverGPUsFromDCGM(ctx, r.APIReader, r.GPUDiscoveryCache)
......@@ -1077,7 +1074,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx con
// Refine the logger message
reason := GetGPUDiscoveryFailureReason(err)
logger.Info("GPU discovery not available", "reason", reason, "error", err.Error())
return fmt.Errorf("GPU hardware info required but auto-discovery failed. Add spec.hardware.gpuSku, spec.hardware.vramMb, spec.hardware.numGpusPerNode")
return fmt.Errorf("GPU hardware info required but auto-discovery failed. Add spec.hardware.gpuSku, spec.hardware.vramMb, spec.hardware.numGpusPerNode, spec.hardware.totalGpus")
}
// GetGPUDiscoveryFailureReason classifies a GPU discovery error and
......@@ -1166,9 +1163,9 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
}
// Enrich hardware from GPU discovery before marshalling the spec.
// This fills in gpuSku, vramMb, numGpusPerNode if the user didn't set them.
// This fills in any missing hardware fields (gpuSku, vramMb, numGpusPerNode, totalGpus).
if err := r.enrichHardwareFromDiscovery(ctx, dgdr); err != nil {
logger.Info("GPU discovery not available, proceeding without enrichment", "reason", err.Error())
return err
}
// Use SyncResource to create/update the job
......@@ -1443,39 +1440,30 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx
}
hw := dgdr.Spec.Hardware
if hw.GPUSKU != "" && hw.VRAMMB != nil && hw.NumGPUsPerNode != nil {
return nil // all fields already set by user; TotalGPUs is filled below when discovery runs
// All fields already provided — nothing to discover.
if hw.GPUSKU != "" && hw.VRAMMB != nil && hw.NumGPUsPerNode != nil && hw.TotalGPUs != nil {
return nil
}
var gpuInfo *gpu.GPUInfo
// Run discovery to fill in any fields the user didn't set.
logger := log.FromContext(ctx)
// Check if user provided hardware info in the typed spec
hasManualConfig := dgdr.Spec.Hardware != nil && (dgdr.Spec.Hardware.GPUSKU != "" ||
dgdr.Spec.Hardware.VRAMMB != nil ||
dgdr.Spec.Hardware.NumGPUsPerNode != nil)
if !hasManualConfig {
logger.Info("Attempting GPU discovery for profiling job")
discoveredInfo, err := r.GPUDiscovery.DiscoverGPUsFromDCGM(ctx, r.APIReader, r.GPUDiscoveryCache)
if err != nil {
// This path is expected for namespace-restricted operators without node read permissions
// Refine the logger message
reason := GetGPUDiscoveryFailureReason(err)
logger.Info("GPU discovery not available, using manual hardware configuration from profiling config",
"reason", reason, "error", err.Error())
return err
} else {
gpuInfo = discoveredInfo
logger.Info("GPU discovery completed successfully",
"gpusPerNode", gpuInfo.GPUsPerNode,
"nodesWithGPUs", gpuInfo.NodesWithGPUs,
"totalGpus", gpuInfo.GPUsPerNode*gpuInfo.NodesWithGPUs,
"model", gpuInfo.Model,
"vramMiB", gpuInfo.VRAMPerGPU,
"system", gpuInfo.System,
"cloudprovider", gpuInfo.CloudProvider)
}
logger.Info("Attempting GPU discovery for profiling job")
gpuInfo, err := r.GPUDiscovery.DiscoverGPUsFromDCGMFiltered(ctx, r.APIReader, r.GPUDiscoveryCache, hw.GPUSKU)
if err != nil {
reason := GetGPUDiscoveryFailureReason(err)
logger.Info("GPU discovery not available", "reason", reason, "error", err.Error())
return fmt.Errorf("GPU hardware info required but auto-discovery failed. Add spec.hardware.gpuSku, spec.hardware.vramMb, spec.hardware.numGpusPerNode, spec.hardware.totalGpus")
}
logger.Info("GPU discovery completed successfully",
"gpusPerNode", gpuInfo.GPUsPerNode,
"nodesWithGPUs", gpuInfo.NodesWithGPUs,
"totalGpus", gpuInfo.GPUsPerNode*gpuInfo.NodesWithGPUs,
"model", gpuInfo.Model,
"vramMiB", gpuInfo.VRAMPerGPU,
"system", gpuInfo.System,
"cloudprovider", gpuInfo.CloudProvider)
if hw.GPUSKU == "" {
if gpuInfo.System != "" {
hw.GPUSKU = gpuInfo.System
......
......@@ -1436,7 +1436,7 @@ spec:
NodesWithGPUs: 1,
}
cache := gpu.NewGPUDiscoveryCache()
cache.Set(mockGPU, 10*time.Minute)
cache.Set("", mockGPU, 10*time.Minute)
reconciler.GPUDiscoveryCache = cache
reconciler.GPUDiscovery = gpu.NewGPUDiscovery(nil)
reconciler.APIReader = k8sClient
......@@ -1489,6 +1489,7 @@ spec:
NumGPUsPerNode: ptr.To[int32](4),
GPUSKU: nvidiacomv1beta1.GPUSKUTypeA100SXM,
VRAMMB: ptr.To(40960.0),
TotalGPUs: ptr.To[int32](4),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
......@@ -1561,7 +1562,7 @@ spec:
NodesWithGPUs: 1,
}
cache := gpu.NewGPUDiscoveryCache()
cache.Set(mockGPU, 10*time.Minute)
cache.Set("", mockGPU, 10*time.Minute)
reconciler.GPUDiscoveryCache = cache
reconciler.GPUDiscovery = gpu.NewGPUDiscovery(nil)
reconciler.APIReader = k8sClient
......@@ -1598,7 +1599,10 @@ spec:
Backend: "vllm",
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0),
NumGPUsPerNode: ptr.To[int32](8),
TotalGPUs: ptr.To[int32](8),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
......@@ -1685,7 +1689,7 @@ spec:
NodesWithGPUs: 1,
}
cache := gpu.NewGPUDiscoveryCache()
cache.Set(mockGPU, 10*time.Minute)
cache.Set("", mockGPU, 10*time.Minute)
reconciler.GPUDiscoveryCache = cache
reconciler.GPUDiscovery = gpu.NewGPUDiscovery(nil)
reconciler.APIReader = k8sClient
......@@ -2307,7 +2311,7 @@ spec:
}
})
It("Should validate typed hardware fields without blob parsing", func() {
It("Should fail validation with partial hardware when discovery is unavailable", func() {
ctx := context.Background()
dgdrName := "test-dgdr-typed-hw"
namespace := defaultNamespace
......@@ -2334,7 +2338,58 @@ spec:
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Reconcile — partial hardware (GPUSKU only) should pass validation
// Reconcile — partial hardware without discovery should fail validation
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
})
Expect(err).NotTo(HaveOccurred())
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseFailed))
})
It("Should pass validation with partial hardware when discovery is available", func() {
ctx := context.Background()
dgdrName := "test-dgdr-partial-hw-discovery"
namespace := defaultNamespace
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
GPUSKU: nvidiacomv1beta1.GPUSKUTypeA100SXM,
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
},
}
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Mock GPU discovery so validation and enrichment succeed.
mockGPU := &gpu.GPUInfo{
GPUsPerNode: 8,
VRAMPerGPU: 81920,
System: "a100_sxm",
NodesWithGPUs: 1,
}
cache := gpu.NewGPUDiscoveryCache()
cache.Set("", mockGPU, 10*time.Minute)
cache.Set("a100_sxm", mockGPU, 10*time.Minute)
reconciler.GPUDiscoveryCache = cache
reconciler.GPUDiscovery = gpu.NewGPUDiscovery(nil)
reconciler.APIReader = k8sClient
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
})
......
......@@ -19,7 +19,6 @@ package controller
import (
"context"
"fmt"
"testing"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
......@@ -33,15 +32,12 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client/fake"
gpupkg "github.com/ai-dynamo/dynamo/deploy/operator/internal/gpu"
"k8s.io/utils/ptr"
)
func newFakeReconciler(nodes ...*corev1.Node) *DynamoGraphDeploymentRequestReconciler {
func newFakeReconciler(objs ...client.Object) *DynamoGraphDeploymentRequestReconciler {
scheme := runtime.NewScheme()
_ = corev1.AddToScheme(scheme)
objs := make([]client.Object, len(nodes))
for i, n := range nodes {
objs[i] = n
}
fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build()
return &DynamoGraphDeploymentRequestReconciler{
Client: fakeClient,
......@@ -50,90 +46,132 @@ func newFakeReconciler(nodes ...*corev1.Node) *DynamoGraphDeploymentRequestRecon
}
}
func gpuNode(name, product string, gpuCount int, vramMiB int) *corev1.Node {
return &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Labels: map[string]string{
gpupkg.LabelGPUCount: intStr(gpuCount),
gpupkg.LabelGPUProduct: product,
gpupkg.LabelGPUMemory: intStr(vramMiB),
},
},
func dcgmPod(name, ip string) *corev1.Pod {
return &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: "gpu-operator",
Labels: map[string]string{gpupkg.LabelApp: gpupkg.LabelValueNvidiaDCGMExporter}},
Status: corev1.PodStatus{Phase: corev1.PodRunning, PodIP: ip},
}
}
func intStr(n int) string {
return fmt.Sprintf("%d", n)
}
// TestEnrichHardwareFromDiscovery_UsesAICSystemIdentifier is the regression test for the
// bug where GPUSKU was set to the raw GFD product name (e.g. "NVIDIA-B200") instead of
// the AIC system identifier (e.g. "b200_sxm"), causing AIC support checks to always fail
// and forcing every model/backend to fall back to naive config generation.
func TestEnrichHardwareFromDiscovery_UsesAICSystemIdentifier(t *testing.T) {
func TestEnrichHardwareFromDiscovery(t *testing.T) {
tests := []struct {
name string
gfdProduct string // raw GFD label value
expectedGPUSKU nvidiacomv1beta1.GPUSKUType // what the profiler needs
name string
// Input hardware spec (nil fields = not set by user).
hardware *nvidiacomv1beta1.HardwareSpec
// Discovery mock: what DCGM returns. Nil means no discovery available.
discoveredGPU *gpupkg.GPUInfo
// Expected outcome.
wantErr string // non-empty means error expected, substring match
wantGPUSKU string
wantVRAM float64
wantGPUsNode int32
wantTotalGPUs int32
}{
{
name: "B200 GFD label maps to AIC system identifier",
gfdProduct: "NVIDIA-B200",
expectedGPUSKU: "b200_sxm",
name: "all four fields set, discovery skipped",
hardware: &nvidiacomv1beta1.HardwareSpec{
GPUSKU: "h100_sxm", VRAMMB: ptr.To(81920.0),
NumGPUsPerNode: ptr.To(int32(8)), TotalGPUs: ptr.To(int32(16)),
},
wantGPUSKU: "h100_sxm", wantVRAM: 81920, wantGPUsNode: 8, wantTotalGPUs: 16,
},
{
name: "nothing set, full discovery",
discoveredGPU: &gpupkg.GPUInfo{NodeName: "n1", GPUsPerNode: 8, Model: "H100-SXM5-80GB", VRAMPerGPU: 81920},
wantGPUSKU: "h100_sxm", wantVRAM: 81920, wantGPUsNode: 8, wantTotalGPUs: 8,
},
{
name: "nothing set, V100 discovered",
discoveredGPU: &gpupkg.GPUInfo{NodeName: "n1", GPUsPerNode: 8, Model: "Tesla-V100-SXM2-16GB", VRAMPerGPU: 16384},
wantGPUSKU: "v100_sxm", wantVRAM: 16384, wantGPUsNode: 8, wantTotalGPUs: 8,
},
{
name: "nothing set, unknown GPU falls back to model name",
discoveredGPU: &gpupkg.GPUInfo{NodeName: "n1", GPUsPerNode: 4, Model: "FutureGPU-X1000", VRAMPerGPU: 65536},
wantGPUSKU: "FutureGPU-X1000", wantVRAM: 65536, wantGPUsNode: 4, wantTotalGPUs: 4,
},
{
name: "only totalGpus missing, discovery fills it",
hardware: &nvidiacomv1beta1.HardwareSpec{
GPUSKU: "b200_sxm", VRAMMB: ptr.To(141312.0), NumGPUsPerNode: ptr.To(int32(8)),
},
discoveredGPU: &gpupkg.GPUInfo{NodeName: "n1", GPUsPerNode: 8, Model: "B200-SXM-180GB", VRAMPerGPU: 141312},
wantGPUSKU: "b200_sxm", wantVRAM: 141312, wantGPUsNode: 8, wantTotalGPUs: 8,
},
{
name: "only gpuSku missing, discovery fills it",
hardware: &nvidiacomv1beta1.HardwareSpec{
VRAMMB: ptr.To(81920.0), NumGPUsPerNode: ptr.To(int32(8)), TotalGPUs: ptr.To(int32(16)),
},
discoveredGPU: &gpupkg.GPUInfo{NodeName: "n1", GPUsPerNode: 8, Model: "H200-SXM5-141GB", VRAMPerGPU: 141312},
wantGPUSKU: "h200_sxm", wantVRAM: 81920, wantGPUsNode: 8, wantTotalGPUs: 16, // user overrides win
},
{
name: "vramMb and numGpusPerNode override discovery",
hardware: &nvidiacomv1beta1.HardwareSpec{
GPUSKU: "a100_sxm", VRAMMB: ptr.To(40960.0), NumGPUsPerNode: ptr.To(int32(4)),
},
discoveredGPU: &gpupkg.GPUInfo{NodeName: "n1", GPUsPerNode: 8, Model: "A100-SXM4-80GB", VRAMPerGPU: 81920},
wantGPUSKU: "a100_sxm", wantVRAM: 40960, wantGPUsNode: 4, wantTotalGPUs: 8,
},
{
name: "no fields set, discovery fails",
wantErr: "auto-discovery failed",
},
{
name: "H200 GFD label maps to AIC system identifier",
gfdProduct: "NVIDIA-H200-SXM5-141GB",
expectedGPUSKU: "h200_sxm",
name: "three fields set, discovery fails",
hardware: &nvidiacomv1beta1.HardwareSpec{
GPUSKU: "h100_sxm", VRAMMB: ptr.To(81920.0), NumGPUsPerNode: ptr.To(int32(8)),
},
wantErr: "auto-discovery failed",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
r := newFakeReconciler(gpuNode("gpu-node-1", tt.gfdProduct, 8, 141312))
vram := float64(141312)
gpus := int32(8)
var r *DynamoGraphDeploymentRequestReconciler
if tt.discoveredGPU != nil {
// Set up a DCGM pod and mock scraper so discovery works.
r = newFakeReconciler(dcgmPod("dcgm-exporter", "10.0.0.1"))
r.GPUDiscovery = gpupkg.NewGPUDiscovery(func(ctx context.Context, endpoint string) (*gpupkg.GPUInfo, error) {
return tt.discoveredGPU, nil
})
r.GPUDiscoveryCache = gpupkg.NewGPUDiscoveryCache()
} else if tt.wantErr != "" {
// No discovery — will fail if discovery is attempted.
r = newFakeReconciler()
} else {
// All fields set — discovery not needed, no mock required.
r = newFakeReconciler()
}
hw := tt.hardware
if hw == nil {
hw = &nvidiacomv1beta1.HardwareSpec{}
}
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Hardware: &nvidiacomv1beta1.HardwareSpec{
GPUSKU: tt.expectedGPUSKU,
VRAMMB: &vram,
NumGPUsPerNode: &gpus,
},
Hardware: hw,
},
}
err := r.enrichHardwareFromDiscovery(context.Background(), dgdr)
if tt.wantErr != "" {
require.Error(t, err)
assert.Contains(t, err.Error(), tt.wantErr)
return
}
require.NoError(t, err)
require.NotNil(t, dgdr.Spec.Hardware)
assert.Equal(t, string(tt.expectedGPUSKU), string(dgdr.Spec.Hardware.GPUSKU),
"GPUSKU should be the AIC system identifier, not the raw GFD product name %q", tt.gfdProduct)
assert.Equal(t, tt.wantGPUSKU, string(dgdr.Spec.Hardware.GPUSKU))
assert.Equal(t, tt.wantVRAM, *dgdr.Spec.Hardware.VRAMMB)
assert.Equal(t, tt.wantGPUsNode, *dgdr.Spec.Hardware.NumGPUsPerNode)
assert.Equal(t, tt.wantTotalGPUs, *dgdr.Spec.Hardware.TotalGPUs)
})
}
}
// TestEnrichHardwareFromDiscovery_FallsBackToModelForUnknownGPU verifies that for GPUs
// not in the AIC support matrix, the raw GFD product name is used as a fallback.
func TestEnrichHardwareFromDiscovery_FallsBackToModelForUnknownGPU(t *testing.T) {
r := newFakeReconciler(gpuNode("gpu-node-1", "Tesla-V100-SXM2-16GB", 8, 16384))
vram := float64(16384)
gpus := int32(8)
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Hardware: &nvidiacomv1beta1.HardwareSpec{
GPUSKU: "Tesla-V100-SXM2-16GB",
VRAMMB: &vram,
NumGPUsPerNode: &gpus,
},
},
}
err := r.enrichHardwareFromDiscovery(context.Background(), dgdr)
require.NoError(t, err)
require.NotNil(t, dgdr.Spec.Hardware)
assert.Equal(t, "Tesla-V100-SXM2-16GB", string(dgdr.Spec.Hardware.GPUSKU),
"Unknown GPU should fall back to raw model name")
}
......@@ -164,11 +164,19 @@ type GPUInfo struct {
}
type ScrapeMetricsFunc func(ctx context.Context, endpoint string) (*GPUInfo, error)
type GPUDiscoveryCache struct {
mu sync.RWMutex
type gpuCacheEntry struct {
value *GPUInfo
expiresAt time.Time
}
// GPUDiscoveryCache caches discovery results keyed by SKU filter.
// The key space is bounded by the GPUSKUType enum values plus the empty
// key used for unfiltered discovery.
type GPUDiscoveryCache struct {
mu sync.RWMutex
entries map[nvidiacomv1beta1.GPUSKUType]gpuCacheEntry
}
type GPUDiscovery struct {
Scraper ScrapeMetricsFunc
}
......@@ -181,24 +189,28 @@ func NewGPUDiscovery(scraper ScrapeMetricsFunc) *GPUDiscovery {
// NewGPUDiscoveryCache creates a new GPUDiscoveryCache instance.
//
// The cache stores a single discovered GPUInfo value with an expiration time.
// It is safe for concurrent use and is intended to reduce repeated DCGM
// scraping during reconciliation loops.
// The cache stores discovered GPUInfo values keyed by SKU filter with an
// expiration time. It is safe for concurrent use and is intended to reduce
// repeated DCGM scraping during reconciliation loops.
func NewGPUDiscoveryCache() *GPUDiscoveryCache {
return &GPUDiscoveryCache{}
return &GPUDiscoveryCache{
entries: make(map[nvidiacomv1beta1.GPUSKUType]gpuCacheEntry),
}
}
// Get returns the cached GPUInfo if it exists and has not expired.
// Get returns the cached GPUInfo for the given SKU filter if it exists and
// has not expired.
//
// The boolean return value indicates whether a valid cached value was found.
// If the cache is empty or expired, it returns (nil, false).
//
// This method is safe for concurrent use.
func (c *GPUDiscoveryCache) Get() (*GPUInfo, bool) {
func (c *GPUDiscoveryCache) Get(sku nvidiacomv1beta1.GPUSKUType) (*GPUInfo, bool) {
c.mu.RLock()
defer c.mu.RUnlock()
if time.Now().Before(c.expiresAt) && c.value != nil {
return c.value, true
e, ok := c.entries[sku]
c.mu.RUnlock()
if ok && time.Now().Before(e.expiresAt) && e.value != nil {
return e.value, true
}
return nil, false
}
......@@ -209,27 +221,38 @@ func (c *GPUDiscoveryCache) Get() (*GPUInfo, bool) {
// After expiration, Get will return (nil, false) until a new value is set.
//
// This method is safe for concurrent use.
func (c *GPUDiscoveryCache) Set(info *GPUInfo, ttl time.Duration) {
func (c *GPUDiscoveryCache) Set(sku nvidiacomv1beta1.GPUSKUType, info *GPUInfo, ttl time.Duration) {
c.mu.Lock()
defer c.mu.Unlock()
c.value = info
c.expiresAt = time.Now().Add(ttl)
c.entries[sku] = gpuCacheEntry{value: info, expiresAt: time.Now().Add(ttl)}
}
// DiscoverGPUsFromDCGM discovers GPU information by scraping metrics directly
// from DCGM exporter pods running in the cluster.
// DiscoverGPUsFromDCGM is a convenience wrapper that calls
// DiscoverGPUsFromDCGMFiltered with no SKU filter.
// See DiscoverGPUsFromDCGMFiltered for full documentation.
func (g *GPUDiscovery) DiscoverGPUsFromDCGM(ctx context.Context, k8sClient client.Reader, cache *GPUDiscoveryCache) (*GPUInfo, error) {
return g.DiscoverGPUsFromDCGMFiltered(ctx, k8sClient, cache, "")
}
// DiscoverGPUsFromDCGMFiltered discovers GPU information by scraping metrics
// directly from DCGM exporter pods running in the cluster.
//
// When filterSKU is non-empty, only nodes whose inferred SKU matches are
// considered. When empty, the best node is selected first (highest GPU count,
// then VRAM) and then only nodes with the same SKU are counted.
//
// The function performs the following:
//
// 1. Returns cached GPU information if still valid.
// 1. Returns cached GPU information if still valid (keyed by filterSKU).
// 2. Lists DCGM exporter pods across all namespaces using supported labels.
// 3. If no pods are found, checks whether the GPU operator is installed and DCGM is enabled via Helm.
// 4. Warns the user accordingly.
// 5. Scrapes each running pod's metrics endpoint (http://<podIP>:9400/metrics).
// 6. Selects the "best" GPU node based on:
// 6. Selects the "best" GPU node (filtered by SKU when set) based on:
// - Highest GPU count
// - Highest VRAM per GPU (tie-breaker)
// 7. Caches the result for a short duration to avoid repeated scraping.
// 7. Counts only nodes matching the selected SKU for NodesWithGPUs.
// 8. Caches the result per SKU for a short duration to avoid repeated scraping.
//
// Behavior Notes:
//
......@@ -238,27 +261,17 @@ func (c *GPUDiscoveryCache) Set(info *GPUInfo, ttl time.Duration) {
// - If at least one pod is successfully scraped, partial failures are tolerated.
// - If all pods fail to scrape, an aggregated error is returned.
// - Assumes DCGM exporter runs as a DaemonSet (one pod per GPU node).
// - Designed for homogeneous clusters; heterogeneous cluster aggregation
// is not yet implemented.
//
// Returns:
// - *GPUInfo for the selected node
// - error if no GPU data can be retrieved
//
// TODO: Current implementation selects a single "best" GPU node (highest GPU count,
// tie-broken by VRAM). This works for homogeneous clusters where all GPU
// nodes are identical.
// For Heterogeneous GPU Support (mixed GPU models or capacities), this logic
// does not represent full cluster GPU inventory. Future improvements should
// aggregate and return GPU information for all nodes instead of selecting
// only one.
func (g *GPUDiscovery) DiscoverGPUsFromDCGM(ctx context.Context, k8sClient client.Reader, cache *GPUDiscoveryCache) (*GPUInfo, error) {
func (g *GPUDiscovery) DiscoverGPUsFromDCGMFiltered(ctx context.Context, k8sClient client.Reader, cache *GPUDiscoveryCache, filterSKU nvidiacomv1beta1.GPUSKUType) (*GPUInfo, error) {
if cache != nil {
// Return cached result if still valid
if cached, ok := cache.Get(); ok {
if cached, ok := cache.Get(filterSKU); ok {
return cached, nil
}
}
// List DCGM exporter pods
dcgmPods, err := listDCGMExporterPods(ctx, k8sClient)
if err != nil && !strings.Contains(err.Error(), "no DCGM exporter pods found") {
......@@ -272,12 +285,16 @@ func (g *GPUDiscovery) DiscoverGPUsFromDCGM(ctx context.Context, k8sClient clien
}
return nil, err
}
// Scrape each running pod individually
var bestNode *GPUInfo
// Scrape each running pod and collect per-node GPU info.
type nodeInfo struct {
info *GPUInfo
sku nvidiacomv1beta1.GPUSKUType
nodeName string
}
allNodes := make([]nodeInfo, 0, len(dcgmPods))
var scrapeErrors []error
var rdmaDetected bool
var rdmaType string
nodesWithGPUs := 0
for _, pod := range dcgmPods {
if pod.Status.Phase != corev1.PodRunning || pod.Status.PodIP == "" {
continue
......@@ -288,46 +305,78 @@ func (g *GPUDiscovery) DiscoverGPUsFromDCGM(ctx context.Context, k8sClient clien
scrapeErrors = append(scrapeErrors, fmt.Errorf("pod %s (%s): %w", pod.Name, pod.Status.PodIP, err))
continue
}
// Detect RDMA on the node of this pod
rdma, rType := detectRDMAFromNode(ctx, k8sClient, pod.Spec.NodeName)
if rdma {
rdmaDetected = true
rdmaType = rType
allNodes = append(allNodes, nodeInfo{info: info, sku: InferHardwareSystem(info.Model), nodeName: pod.Spec.NodeName})
}
if len(allNodes) == 0 {
if len(scrapeErrors) > 0 {
return nil, fmt.Errorf("failed to scrape any DCGM exporter pod: %v", scrapeErrors)
}
return nil, fmt.Errorf("no GPU metrics could be parsed from any DCGM pod")
}
// Select best node (only from matching SKU when filtered).
var bestNode *GPUInfo
var bestSKU nvidiacomv1beta1.GPUSKUType
for _, n := range allNodes {
if filterSKU != "" && n.sku != filterSKU {
continue
}
// Increment NodesWithGPUs for every node that successfully reports GPU metrics
nodesWithGPUs++
// Select best node: highest GPU count, tie-breaker by VRAM
if bestNode == nil ||
info.GPUsPerNode > bestNode.GPUsPerNode ||
(info.GPUsPerNode == bestNode.GPUsPerNode &&
info.VRAMPerGPU > bestNode.VRAMPerGPU) {
bestNode = info
n.info.GPUsPerNode > bestNode.GPUsPerNode ||
(n.info.GPUsPerNode == bestNode.GPUsPerNode &&
n.info.VRAMPerGPU > bestNode.VRAMPerGPU) {
bestNode = n.info
bestSKU = n.sku
}
}
if bestNode == nil {
if len(scrapeErrors) > 0 {
return nil, fmt.Errorf("failed to scrape any DCGM exporter pod: %v", scrapeErrors)
if filterSKU != "" {
return nil, fmt.Errorf("no GPU nodes matching SKU %q found", filterSKU)
}
return nil, fmt.Errorf("no GPU metrics could be parsed from any DCGM pod")
}
// --- Detect RDMA and InfiniBand presence ---
// Count only nodes with the same SKU as the selected best node,
// and detect RDMA on matching nodes only.
nodesWithGPUs := 0
var rdmaDetected bool
var rdmaType string
for _, n := range allNodes {
if n.sku != bestSKU {
continue
}
nodesWithGPUs++
if !rdmaDetected {
rdma, rType := detectRDMAFromNode(ctx, k8sClient, n.nodeName)
if rdma {
rdmaDetected = true
rdmaType = rType
}
}
}
// Detect InfiniBand presence
ib := detectIBPods(ctx, k8sClient)
if ib {
rdmaType = "infiniband"
rdmaDetected = true
}
// Infer cloud provider for the best node
cloudProvider, err := GetCloudProviderInfo(ctx, k8sClient)
if err != nil {
cloudProvider = CloudProviderUnknown
}
bestNode.System = bestSKU
bestNode.CloudProvider = cloudProvider
bestNode.NodesWithGPUs = nodesWithGPUs
bestNode.RDMAEnabled = rdmaDetected
bestNode.RDMAType = rdmaType
if cache != nil {
// Cache result for 60 seconds
cache.Set(bestNode, 60*time.Second)
cache.Set(filterSKU, bestNode, 60*time.Second)
}
return bestNode, nil
}
......
......@@ -809,6 +809,68 @@ func TestDiscoverGPUsFromDCGM_CacheHit(t *testing.T) {
require.Equal(t, info1, info2)
}
func TestDiscoverGPUsFromDCGMFiltered_MixedSKU(t *testing.T) {
ctx := context.Background()
// Two DCGM pods, one per node
pods := []client.Object{
&corev1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: "dcgm-h100", Namespace: "gpu-operator",
Labels: map[string]string{LabelApp: LabelValueNvidiaDCGMExporter}},
Status: corev1.PodStatus{Phase: corev1.PodRunning, PodIP: "10.0.0.1"},
},
&corev1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: "dcgm-a100", Namespace: "gpu-operator",
Labels: map[string]string{LabelApp: LabelValueNvidiaDCGMExporter}},
Status: corev1.PodStatus{Phase: corev1.PodRunning, PodIP: "10.0.0.2"},
},
}
scheme := runtime.NewScheme()
require.NoError(t, corev1.AddToScheme(scheme))
k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(pods...).Build()
// Return different GPU models per pod IP. H100 has more VRAM to win tie-breaking.
mockScraper := func(ctx context.Context, endpoint string) (*GPUInfo, error) {
if strings.Contains(endpoint, "10.0.0.1") {
return &GPUInfo{NodeName: "node-h100", GPUsPerNode: 8, Model: "H100-SXM5-80GB", VRAMPerGPU: 81920}, nil
}
return &GPUInfo{NodeName: "node-a100", GPUsPerNode: 8, Model: "A100-SXM4-80GB", VRAMPerGPU: 40960}, nil
}
discovery := NewGPUDiscovery(mockScraper)
t.Run("unfiltered selects best and counts only matching SKU", func(t *testing.T) {
info, err := discovery.DiscoverGPUsFromDCGMFiltered(ctx, k8sClient, nil, "")
require.NoError(t, err)
assert.Equal(t, "h100_sxm", string(info.System))
assert.Equal(t, 1, info.NodesWithGPUs, "should count only H100 nodes")
})
t.Run("filter by a100_sxm", func(t *testing.T) {
info, err := discovery.DiscoverGPUsFromDCGMFiltered(ctx, k8sClient, nil, "a100_sxm")
require.NoError(t, err)
assert.Equal(t, "a100_sxm", string(info.System))
assert.Equal(t, 1, info.NodesWithGPUs)
assert.Equal(t, "A100-SXM4-80GB", info.Model)
})
t.Run("filter by nonexistent SKU", func(t *testing.T) {
_, err := discovery.DiscoverGPUsFromDCGMFiltered(ctx, k8sClient, nil, "l40s")
require.Error(t, err)
assert.Contains(t, err.Error(), "no GPU nodes matching SKU")
})
t.Run("cache is per SKU", func(t *testing.T) {
cache := NewGPUDiscoveryCache()
info1, err := discovery.DiscoverGPUsFromDCGMFiltered(ctx, k8sClient, cache, "")
require.NoError(t, err)
info2, err := discovery.DiscoverGPUsFromDCGMFiltered(ctx, k8sClient, cache, "a100_sxm")
require.NoError(t, err)
assert.NotEqual(t, info1.System, info2.System, "different SKU filters should return different results")
})
}
func TestDiscoverGPUsFromDCGM_GPUOperatorInstalled_DCgmNotEnabled(t *testing.T) {
ctx := context.Background()
......
......@@ -1570,8 +1570,12 @@ _Appears in:_
HardwareSpec describes the hardware resources available for profiling and deployment.
These fields are typically auto-filled by the operator from cluster discovery.
HardwareSpec describes the GPU hardware for profiling and deployment.
All fields are auto-detected from cluster GPU nodes when omitted
(requires cluster-wide mode with GPU discovery enabled).
gpuSku is a selector (restricts which nodes are considered);
the other fields are pure overrides passed to the profiler.
If all four fields are set, discovery is skipped.
......@@ -1580,10 +1584,10 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `gpuSku` _[GPUSKUType](#gpuskutype)_ | GPUSKU is the AIC hardware system identifier for the GPU.<br />When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels. | | Enum: [gb200_sxm b200_sxm h200_sxm h100_sxm h100_pcie a100_sxm a100_pcie l40s l40 l4 v100_sxm v100_pcie t4 mi200 mi300] <br />Optional: \{\} <br /> |
| `vramMb` _float_ | VRAMMB is the VRAM per GPU in MiB. | | Optional: \{\} <br /> |
| `totalGpus` _integer_ | TotalGPUs is the total number of GPUs available in the cluster. | | Optional: \{\} <br /> |
| `numGpusPerNode` _integer_ | NumGPUsPerNode is the number of GPUs per node. | | Optional: \{\} <br /> |
| `gpuSku` _[GPUSKUType](#gpuskutype)_ | GPUSKU selects the GPU type to target.<br />When omitted, auto-detected by selecting the GPU with the highest<br />node count, then highest VRAM. In mixed-GPU clusters, set this to<br />choose which GPU type to use. Discovery and totalGpus are then<br />restricted to nodes matching this SKU. | | Enum: [gb200_sxm b200_sxm h200_sxm h100_sxm h100_pcie a100_sxm a100_pcie l40s l40 l4 v100_sxm v100_pcie t4 mi200 mi300] <br />Optional: \{\} <br /> |
| `vramMb` _float_ | VRAMMB is the VRAM per GPU in MiB.<br />When omitted, auto-detected from cluster GPU nodes. | | Optional: \{\} <br /> |
| `totalGpus` _integer_ | TotalGPUs is the GPU budget for profiling and deployment.<br />The profiler uses this to determine parallelism and replica count.<br />When omitted, computed by counting GPUs on discovered nodes<br />(filtered by gpuSku when set), temporarily capped at 32 to<br />limit profiler search space. This cap may be removed in a future<br />release. Set this field explicitly to override. | | Optional: \{\} <br /> |
| `numGpusPerNode` _integer_ | NumGPUsPerNode is the number of GPUs per node.<br />When omitted, auto-detected from cluster GPU nodes. | | Optional: \{\} <br /> |
| `interconnect` _string_ | Interconnect describes the primary GPU-to-GPU interconnect *within a node*.<br />Semantics / usage:<br /> - This is capability metadata used for profiling, planning, and deployment decisions.<br /> - It does NOT configure or enable any GPU interconnect; it only describes what is available/assumed.<br /> - When omitted, the operator may attempt best-effort discovery (currently distinguishes "nvlink"<br /> vs "pcie" based on DCGM NVLink link count). If discovery is unavailable, it may remain empty.<br />Impact of wrong / missing values:<br /> - If set more optimistically than reality (e.g., "nvlink" when only PCIe is present), performance<br /> models may overestimate intra-node bandwidth and choose overly aggressive parallelism or layouts,<br /> resulting in degraded performance compared to expectations.<br /> - If set more pessimistically than reality (e.g., "pcie" when NVLink is present), the system may<br /> choose conservative plans and leave performance on the table.<br /> - If unset and undiscovered, consumers should treat the interconnect as unknown and fall back to<br /> conservative assumptions.<br />Example values: "pcie", "nvlink". Other values may be accepted but may not be auto-detected. | | Optional: \{\} <br /> |
| `rdma` _boolean_ | RDMA indicates whether the cluster has RDMA-capable networking available for Dynamo data movement.<br />Semantics / usage:<br /> - This is capability metadata used for profiling, planning, and deployment decisions.<br /> - It does NOT install, enable, or configure RDMA (e.g., drivers, SR-IOV, NVIDIA network operator,<br /> GPUDirect settings). It only expresses availability/intent.<br /> - When omitted, the operator may attempt best-effort discovery (e.g., via node labels indicating<br /> RDMA/SR-IOV capability and/or presence of NVIDIA network-operator RDMA components). If discovery<br /> is unavailable, it may remain unset.<br />Impact of wrong / missing values:<br /> - False positive (set true when RDMA is not actually usable end-to-end) may cause plans or<br /> deployments to assume RDMA is available; depending on the runtime transport selection and<br /> fallback behavior, this can lead to connection/setup failures or performance regressions.<br /> - False negative (set false when RDMA is available) will typically avoid RDMA-optimized paths and<br /> fall back to non-RDMA transports, usually remaining functional but potentially slower.<br /> - If unset and undiscovered, consumers should treat RDMA availability as unknown and use<br /> conservative defaults / fallback transports. | | Optional: \{\} <br /> |
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment