Commit 920a4b07 authored by Daniel Hiltgen

Merge remote-tracking branch 'upstream/main' into pr3702

parents c496967e ee49844d
...@@ -11,6 +11,8 @@ import ( ...@@ -11,6 +11,8 @@ import (
"slices" "slices"
"strconv" "strconv"
"strings" "strings"
"github.com/ollama/ollama/format"
) )
// Discovery logic for AMD/ROCm GPUs // Discovery logic for AMD/ROCm GPUs
...@@ -23,26 +25,20 @@ const ( ...@@ -23,26 +25,20 @@ const (
// Prefix with the node dir // Prefix with the node dir
GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
GPUUsedMemoryFileGlob = "mem_banks/*/used_memory" GPUUsedMemoryFileGlob = "mem_banks/*/used_memory"
RocmStandardLocation = "/opt/rocm/lib"
// TODO find a better way to detect iGPU instead of minimum memory
IGPUMemLimit = 1024 * 1024 * 1024 // 512M is what they typically report, so anything less than 1G must be iGPU
) )
var ( var (
// Used to validate if the given ROCm lib is usable // Used to validate if the given ROCm lib is usable
ROCmLibGlobs = []string{"libhipblas.so.2*", "rocblas"} // TODO - probably include more coverage of files here... ROCmLibGlobs = []string{"libhipblas.so.2*", "rocblas"} // TODO - probably include more coverage of files here...
RocmStandardLocations = []string{"/opt/rocm/lib", "/usr/lib64"}
) )
// Gather GPU information from the amdgpu driver if any supported GPUs are detected // Gather GPU information from the amdgpu driver if any supported GPUs are detected
// HIP_VISIBLE_DEVICES will be set if we detect a mix of unsupported and supported devices func AMDGetGPUInfo() []GpuInfo {
// and the user hasn't already set this variable resp := []GpuInfo{}
func AMDGetGPUInfo(resp *GpuInfo) {
// TODO - DRY this out with windows
if !AMDDetected() { if !AMDDetected() {
return return resp
} }
skip := map[int]interface{}{}
// Opportunistic logging of driver version to aid in troubleshooting // Opportunistic logging of driver version to aid in troubleshooting
ver, err := AMDDriverVersion() ver, err := AMDDriverVersion()
...@@ -50,160 +46,117 @@ func AMDGetGPUInfo(resp *GpuInfo) { ...@@ -50,160 +46,117 @@ func AMDGetGPUInfo(resp *GpuInfo) {
slog.Info("AMD Driver: " + ver) slog.Info("AMD Driver: " + ver)
} else { } else {
// TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU // TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU
slog.Warn(fmt.Sprintf("ollama recommends running the https://www.amd.com/en/support/linux-drivers: %s", err)) slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err)
} }
// If the user has specified exactly which GPUs to use, look up their memory // Determine if the user has already pre-selected which GPUs to look at, then ignore the others
visibleDevices := os.Getenv("HIP_VISIBLE_DEVICES") var visibleDevices []string
if visibleDevices != "" { hipVD := os.Getenv("HIP_VISIBLE_DEVICES") // zero based index only
ids := []int{} rocrVD := os.Getenv("ROCR_VISIBLE_DEVICES") // zero based index or UUID, but consumer cards seem to not support UUID
for _, idStr := range strings.Split(visibleDevices, ",") { gpuDO := os.Getenv("GPU_DEVICE_ORDINAL") // zero based index
id, err := strconv.Atoi(idStr) switch {
if err != nil { // TODO is this priority order right?
slog.Warn(fmt.Sprintf("malformed HIP_VISIBLE_DEVICES=%s %s", visibleDevices, err)) case hipVD != "":
} else { visibleDevices = strings.Split(hipVD, ",")
ids = append(ids, id) case rocrVD != "":
} visibleDevices = strings.Split(rocrVD, ",")
} // TODO - since we don't yet support UUIDs, consider detecting and reporting here
amdProcMemLookup(resp, nil, ids) // all our test systems show GPU-XX indicating UUID is not supported
return case gpuDO != "":
visibleDevices = strings.Split(gpuDO, ",")
} }
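A compact sketch of the selection priority implemented by the switch above: HIP_VISIBLE_DEVICES wins over ROCR_VISIBLE_DEVICES, which wins over GPU_DEVICE_ORDINAL (the TODO above notes this ordering is still unconfirmed). The helper below is illustrative only, not part of this change:

package main

import (
	"fmt"
	"strings"
)

// pickVisibleDevices mirrors the switch above: the first non-empty variable wins.
func pickVisibleDevices(hipVD, rocrVD, gpuDO string) []string {
	switch {
	case hipVD != "":
		return strings.Split(hipVD, ",")
	case rocrVD != "":
		return strings.Split(rocrVD, ",")
	case gpuDO != "":
		return strings.Split(gpuDO, ",")
	}
	return nil
}

func main() {
	// HIP_VISIBLE_DEVICES takes precedence even when the others are set.
	fmt.Println(pickVisibleDevices("0,2", "1", "3")) // [0 2]
}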
// Gather GFX version information from all detected cards gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
gfx := AMDGFXVersions() var supported []string
verStrings := []string{} libDir := ""
for i, v := range gfx {
verStrings = append(verStrings, v.ToGFXString()) // The amdgpu driver always exposes the host CPU(s) first, but we have to skip them and subtract
if v.Major == 0 { // from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
// Silently skip CPUs matches, _ := filepath.Glob(GPUPropertiesFileGlob)
skip[i] = struct{}{} cpuCount := 0
for _, match := range matches {
slog.Debug("evaluating amdgpu node " + match)
fp, err := os.Open(match)
if err != nil {
slog.Debug("failed to open sysfs node", "file", match, "error", err)
continue continue
} }
if v.Major < 9 { defer fp.Close()
// TODO consider this a build-time setting if we can support 8xx family GPUs nodeID, err := strconv.Atoi(filepath.Base(filepath.Dir(match)))
slog.Warn(fmt.Sprintf("amdgpu [%d] too old %s", i, v.ToGFXString())) if err != nil {
skip[i] = struct{}{} slog.Debug("failed to parse node ID", "error", err)
continue
} }
}
slog.Info(fmt.Sprintf("detected amdgpu versions %v", verStrings))
// Abort if all GPUs are skipped scanner := bufio.NewScanner(fp)
if len(skip) >= len(gfx) { isCPU := false
slog.Info("all detected amdgpus are skipped, falling back to CPU") var major, minor, patch uint64
return for scanner.Scan() {
} line := strings.TrimSpace(scanner.Text())
// Note: we could also use "cpu_cores_count X" where X is greater than zero to detect CPUs
if strings.HasPrefix(line, "gfx_target_version") {
ver := strings.Fields(line)
// If we got this far, then we have at least 1 GPU that's a ROCm candidate, so make sure we have a lib // Detect CPUs
libDir, err := AMDValidateLibDir() if len(ver) == 2 && ver[1] == "0" {
if err != nil { slog.Debug("detected CPU " + match)
slog.Warn(fmt.Sprintf("unable to verify rocm library, will use cpu: %s", err)) isCPU = true
return break
} }
updateLibPath(libDir) if len(ver) != 2 || len(ver[1]) < 5 {
slog.Warn("malformed "+match, "gfx_target_version", line)
// If this winds up being a CPU, our offsets may be wrong
continue
}
l := len(ver[1])
var err1, err2, err3 error
patch, err1 = strconv.ParseUint(ver[1][l-2:l], 10, 32)
minor, err2 = strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
major, err3 = strconv.ParseUint(ver[1][:l-4], 10, 32)
if err1 != nil || err2 != nil || err3 != nil {
slog.Debug("malformed int " + line)
continue
}
}
gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION") // TODO - any other properties we want to extract and record?
if gfxOverride == "" { // vendor_id + device_id -> pci lookup for "Name"
supported, err := GetSupportedGFX(libDir) // Other metrics that may help us understand relative performance between multiple GPUs
if err != nil {
slog.Warn(fmt.Sprintf("failed to lookup supported GFX types, falling back to CPU mode: %s", err))
return
} }
slog.Debug(fmt.Sprintf("rocm supported GPU types %v", supported))
for i, v := range gfx { if isCPU {
if !slices.Contains[[]string, string](supported, v.ToGFXString()) { cpuCount++
slog.Warn(fmt.Sprintf("amdgpu [%d] %s is not supported by %s %v", i, v.ToGFXString(), libDir, supported)) continue
// TODO - consider discrete markdown just for ROCM troubleshooting?
slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/gpu.md#overrides for HSA_OVERRIDE_GFX_VERSION usage")
skip[i] = struct{}{}
} else {
slog.Info(fmt.Sprintf("amdgpu [%d] %s is supported", i, v.ToGFXString()))
}
} }
} else {
slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
}
if len(skip) >= len(gfx) { // CPUs are always first in the list
slog.Info("all detected amdgpus are skipped, falling back to CPU") gpuID := nodeID - cpuCount
return
}
ids := make([]int, len(gfx)) // Shouldn't happen, but just in case...
i := 0 if gpuID < 0 {
for k := range gfx { slog.Error("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
ids[i] = k return []GpuInfo{}
i++
}
amdProcMemLookup(resp, skip, ids)
if resp.memInfo.DeviceCount == 0 {
return
}
if len(skip) > 0 {
amdSetVisibleDevices(ids, skip)
}
}
func updateLibPath(libDir string) {
ldPaths := []string{}
if val, ok := os.LookupEnv("LD_LIBRARY_PATH"); ok {
ldPaths = strings.Split(val, ":")
}
for _, d := range ldPaths {
if d == libDir {
return
}
}
val := strings.Join(append(ldPaths, libDir), ":")
slog.Debug("updated lib path", "LD_LIBRARY_PATH", val)
os.Setenv("LD_LIBRARY_PATH", val)
}
// Walk the sysfs nodes for the available GPUs and gather information from them
// skipping over any devices in the skip map
func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
resp.memInfo.DeviceCount = 0
resp.memInfo.TotalMemory = 0
resp.memInfo.FreeMemory = 0
slog.Debug("discovering VRAM for amdgpu devices")
if len(ids) == 0 {
entries, err := os.ReadDir(AMDNodesSysfsDir)
if err != nil {
slog.Warn(fmt.Sprintf("failed to read amdgpu sysfs %s - %s", AMDNodesSysfsDir, err))
return
}
for _, node := range entries {
if !node.IsDir() {
continue
}
id, err := strconv.Atoi(node.Name())
if err != nil {
slog.Warn("malformed amdgpu sysfs node id " + node.Name())
continue
}
ids = append(ids, id)
} }
}
slog.Debug(fmt.Sprintf("amdgpu devices %v", ids))
for _, id := range ids { if int(major) < RocmComputeMin {
if _, skipped := skip[id]; skipped { slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%d%x", major, minor, patch), "gpu", gpuID)
continue continue
} }
// Look up the memory for the current node
totalMemory := uint64(0) totalMemory := uint64(0)
usedMemory := uint64(0) usedMemory := uint64(0)
// Adjust for sysfs vs HIP ids propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(nodeID), GPUTotalMemoryFileGlob)
propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id+1), GPUTotalMemoryFileGlob)
propFiles, err := filepath.Glob(propGlob) propFiles, err := filepath.Glob(propGlob)
if err != nil { if err != nil {
slog.Warn(fmt.Sprintf("error looking up total GPU memory: %s %s", propGlob, err)) slog.Warn("error looking up total GPU memory", "glob", propGlob, "error", err)
} }
// 1 or more memory banks - sum the values of all of them // 1 or more memory banks - sum the values of all of them
for _, propFile := range propFiles { for _, propFile := range propFiles {
fp, err := os.Open(propFile) fp, err := os.Open(propFile)
if err != nil { if err != nil {
slog.Warn(fmt.Sprintf("failed to open sysfs node file %s: %s", propFile, err)) slog.Warn("failed to open sysfs node", "file", propFile, "erroir", err)
continue continue
} }
defer fp.Close() defer fp.Close()
...@@ -226,49 +179,113 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) { ...@@ -226,49 +179,113 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
} }
} }
if totalMemory == 0 { if totalMemory == 0 {
slog.Warn(fmt.Sprintf("amdgpu [%d] reports zero total memory, skipping", id)) slog.Warn("amdgpu reports zero total memory", "gpu", gpuID)
skip[id] = struct{}{}
continue continue
} }
if totalMemory < IGPUMemLimit { usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(nodeID), GPUUsedMemoryFileGlob)
slog.Info(fmt.Sprintf("amdgpu [%d] appears to be an iGPU with %dM reported total memory, skipping", id, totalMemory/1024/1024))
skip[id] = struct{}{}
continue
}
usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUUsedMemoryFileGlob)
usedFiles, err := filepath.Glob(usedGlob) usedFiles, err := filepath.Glob(usedGlob)
if err != nil { if err != nil {
slog.Warn(fmt.Sprintf("error looking up used GPU memory: %s %s", usedGlob, err)) slog.Warn("error looking up used GPU memory", "glob", usedGlob, "error", err)
continue continue
} }
for _, usedFile := range usedFiles { for _, usedFile := range usedFiles {
fp, err := os.Open(usedFile) fp, err := os.Open(usedFile)
if err != nil { if err != nil {
slog.Warn(fmt.Sprintf("failed to open sysfs node file %s: %s", usedFile, err)) slog.Warn("failed to open sysfs node", "file", usedFile, "error", err)
continue continue
} }
defer fp.Close() defer fp.Close()
data, err := io.ReadAll(fp) data, err := io.ReadAll(fp)
if err != nil { if err != nil {
slog.Warn(fmt.Sprintf("failed to read sysfs node file %s: %s", usedFile, err)) slog.Warn("failed to read sysfs node", "file", usedFile, "error", err)
continue continue
} }
used, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64) used, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
if err != nil { if err != nil {
slog.Warn(fmt.Sprintf("malformed used memory %s: %s", string(data), err)) slog.Warn("malformed used memory", "data", string(data), "error", err)
continue continue
} }
usedMemory += used usedMemory += used
} }
slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %dM", id, totalMemory/1024/1024))
slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory %dM", id, (totalMemory-usedMemory)/1024/1024)) // iGPU detection, remove this check once we can support an iGPU variant of the rocm library
resp.memInfo.DeviceCount++ if totalMemory < IGPUMemLimit {
resp.memInfo.TotalMemory += totalMemory slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
resp.memInfo.FreeMemory += (totalMemory - usedMemory) continue
}
slog.Info("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
slog.Info("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
gpuInfo := GpuInfo{
Library: "rocm",
memInfo: memInfo{
TotalMemory: totalMemory,
FreeMemory: (totalMemory - usedMemory),
},
ID: fmt.Sprintf("%d", gpuID),
// Name: not exposed in sysfs directly, would require pci device id lookup
Major: int(major),
Minor: int(minor),
Patch: int(patch),
MinimumMemory: rocmMinimumMemory,
}
// If the user wants to filter to a subset of devices, filter out if we aren't a match
if len(visibleDevices) > 0 {
include := false
for _, visible := range visibleDevices {
if visible == gpuInfo.ID {
include = true
break
}
}
if !include {
slog.Info("filtering out device per user request", "id", gpuInfo.ID, "visible_devices", visibleDevices)
continue
}
}
// Final validation is gfx compatibility - load the library if we haven't already loaded it
// even if the user overrides, we still need to validate the library
if libDir == "" {
libDir, err = AMDValidateLibDir()
if err != nil {
slog.Warn("unable to verify rocm library, will use cpu", "error", err)
return []GpuInfo{}
}
}
gpuInfo.DependencyPath = libDir
if gfxOverride == "" {
// Only load supported list once
if len(supported) == 0 {
supported, err = GetSupportedGFX(libDir)
if err != nil {
slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
return []GpuInfo{}
}
slog.Debug("rocm supported GPUs", "types", supported)
}
gfx := fmt.Sprintf("gfx%d%d%x", gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch)
if !slices.Contains[[]string, string](supported, gfx) {
slog.Warn("amdgpu is not supported", "gpu", gpuInfo.ID, "gpu_type", gfx, "library", libDir, "supported_types", supported)
// TODO - consider discrete markdown just for ROCM troubleshooting?
slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/gpu.md#overrides for HSA_OVERRIDE_GFX_VERSION usage")
continue
} else {
slog.Info("amdgpu is supported", "gpu", gpuInfo.ID, "gpu_type", gfx)
}
} else {
slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
}
// The GPU has passed all the verification steps and is supported
resp = append(resp, gpuInfo)
} }
if resp.memInfo.DeviceCount > 0 { if len(resp) == 0 {
resp.Library = "rocm" slog.Info("no compatible amdgpu devices detected")
} }
return resp
} }
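For reference, a minimal sketch of how the sysfs gfx_target_version value and node ordering above turn into a HIP-style gfx string and GPU ID. The decodeGFXTarget helper and the sample values are illustrative, derived from the parsing rule above rather than from a specific card:

package main

import (
	"fmt"
	"strconv"
)

// decodeGFXTarget mirrors the parsing above: the last two digits are the
// patch level (printed in hex), the next two the minor version, and the
// remainder the major version.
func decodeGFXTarget(val string) (string, error) {
	if len(val) < 5 {
		return "", fmt.Errorf("malformed gfx_target_version %q", val)
	}
	l := len(val)
	patch, err1 := strconv.ParseUint(val[l-2:l], 10, 32)
	minor, err2 := strconv.ParseUint(val[l-4:l-2], 10, 32)
	major, err3 := strconv.ParseUint(val[:l-4], 10, 32)
	if err1 != nil || err2 != nil || err3 != nil {
		return "", fmt.Errorf("malformed gfx_target_version %q", val)
	}
	return fmt.Sprintf("gfx%d%d%x", major, minor, patch), nil
}

func main() {
	// "90010" decodes to gfx90a; "100300" decodes to gfx1030.
	for _, v := range []string{"90010", "100300"} {
		gfx, _ := decodeGFXTarget(v)
		fmt.Println(v, "->", gfx)
	}
	// Node IDs include the CPU(s), which the driver lists first; the
	// HIP-facing GPU ID is the node ID minus the CPU nodes seen so far.
	nodeID, cpuCount := 2, 1
	fmt.Println("gpu id:", nodeID-cpuCount) // gpu id: 1
}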
// Quick check for AMD driver so we can skip amdgpu discovery if not present // Quick check for AMD driver so we can skip amdgpu discovery if not present
...@@ -280,87 +297,24 @@ func AMDDetected() bool { ...@@ -280,87 +297,24 @@ func AMDDetected() bool {
slog.Debug("amdgpu driver not detected " + sysfsDir) slog.Debug("amdgpu driver not detected " + sysfsDir)
return false return false
} else if err != nil { } else if err != nil {
slog.Debug(fmt.Sprintf("error looking up amd driver %s %s", sysfsDir, err)) slog.Debug("error looking up amd driver", "path", sysfsDir, "error", err)
return false return false
} }
return true return true
} }
func setupLink(source, target string) error {
if err := os.RemoveAll(target); err != nil {
return fmt.Errorf("failed to remove old rocm directory %s %w", target, err)
}
if err := os.Symlink(source, target); err != nil {
return fmt.Errorf("failed to create link %s => %s %w", source, target, err)
}
slog.Debug(fmt.Sprintf("host rocm linked %s => %s", source, target))
return nil
}
// Ensure the AMD rocm lib dir is wired up
// Prefer to use host installed ROCm, as long as it meets our minimum requirements // Prefer to use host installed ROCm, as long as it meets our minimum requirements
// failing that, tell the user how to download it on their own // failing that, tell the user how to download it on their own
func AMDValidateLibDir() (string, error) { func AMDValidateLibDir() (string, error) {
// We rely on the rpath compiled into our library to find rocm libDir, err := commonAMDValidateLibDir()
// so we establish a symlink to wherever we find it on the system
// to <payloads>/rocm
payloadsDir, err := PayloadsDir()
if err != nil {
return "", err
}
// If we already have a rocm dependency wired, nothing more to do
rocmTargetDir := filepath.Clean(filepath.Join(payloadsDir, "..", "rocm"))
if rocmLibUsable(rocmTargetDir) {
return rocmTargetDir, nil
}
// next to the running binary
exe, err := os.Executable()
if err == nil { if err == nil {
peerDir := filepath.Dir(exe) return libDir, nil
if rocmLibUsable(peerDir) {
slog.Debug("detected ROCM next to ollama executable " + peerDir)
return rocmTargetDir, setupLink(peerDir, rocmTargetDir)
}
peerDir = filepath.Join(filepath.Dir(exe), "rocm")
if rocmLibUsable(peerDir) {
slog.Debug("detected ROCM next to ollama executable " + peerDir)
return rocmTargetDir, setupLink(peerDir, rocmTargetDir)
}
} }
// Well known ollama installer path // Well known ollama installer path
installedRocmDir := "/usr/share/ollama/lib/rocm" installedRocmDir := "/usr/share/ollama/lib/rocm"
if rocmLibUsable(installedRocmDir) { if rocmLibUsable(installedRocmDir) {
return rocmTargetDir, setupLink(installedRocmDir, rocmTargetDir) return installedRocmDir, nil
}
// Prefer explicit HIP env var
hipPath := os.Getenv("HIP_PATH")
if hipPath != "" {
hipLibDir := filepath.Join(hipPath, "lib")
if rocmLibUsable(hipLibDir) {
slog.Debug("detected ROCM via HIP_PATH=" + hipPath)
return rocmTargetDir, setupLink(hipLibDir, rocmTargetDir)
}
}
// Scan the library path for potential matches
ldPaths := strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":")
for _, ldPath := range ldPaths {
d, err := filepath.Abs(ldPath)
if err != nil {
continue
}
if rocmLibUsable(d) {
return rocmTargetDir, setupLink(d, rocmTargetDir)
}
}
// Well known location(s)
if rocmLibUsable("/opt/rocm/lib") {
return rocmTargetDir, setupLink("/opt/rocm/lib", rocmTargetDir)
} }
// If we still haven't found a usable rocm, the user will have to install it on their own // If we still haven't found a usable rocm, the user will have to install it on their own
...@@ -384,68 +338,3 @@ func AMDDriverVersion() (string, error) { ...@@ -384,68 +338,3 @@ func AMDDriverVersion() (string, error) {
} }
return strings.TrimSpace(string(verString)), nil return strings.TrimSpace(string(verString)), nil
} }
func AMDGFXVersions() map[int]Version {
// The amdgpu driver always exposes the host CPU as node 0, but we have to skip that and subtract one
// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
res := map[int]Version{}
matches, _ := filepath.Glob(GPUPropertiesFileGlob)
for _, match := range matches {
fp, err := os.Open(match)
if err != nil {
slog.Debug(fmt.Sprintf("failed to open sysfs node file %s: %s", match, err))
continue
}
defer fp.Close()
i, err := strconv.Atoi(filepath.Base(filepath.Dir(match)))
if err != nil {
slog.Debug(fmt.Sprintf("failed to parse node ID %s", err))
continue
}
if i == 0 {
// Skipping the CPU
continue
}
// Align with HIP IDs (zero is first GPU, not CPU)
i -= 1
scanner := bufio.NewScanner(fp)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if strings.HasPrefix(line, "gfx_target_version") {
ver := strings.Fields(line)
if len(ver) != 2 || len(ver[1]) < 5 {
if ver[1] != "0" {
slog.Debug("malformed " + line)
}
res[i] = Version{
Major: 0,
Minor: 0,
Patch: 0,
}
continue
}
l := len(ver[1])
patch, err1 := strconv.ParseUint(ver[1][l-2:l], 10, 32)
minor, err2 := strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
major, err3 := strconv.ParseUint(ver[1][:l-4], 10, 32)
if err1 != nil || err2 != nil || err3 != nil {
slog.Debug("malformed int " + line)
continue
}
res[i] = Version{
Major: uint(major),
Minor: uint(minor),
Patch: uint(patch),
}
}
}
}
return res
}
func (v Version) ToGFXString() string {
return fmt.Sprintf("gfx%d%d%d", v.Major, v.Minor, v.Patch)
}
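One behavioural detail of the replacement above: the removed ToGFXString formatted the patch level in decimal, while the new code paths format it in hex (gfx%d%d%x), which matters for parts whose patch level is 10 or higher. A small illustrative comparison with made-up version values:

package main

import "fmt"

func main() {
	major, minor, patch := 9, 0, 10
	// Decimal patch formatting (removed ToGFXString): gfx9010
	fmt.Printf("gfx%d%d%d\n", major, minor, patch)
	// Hex patch formatting (new code paths): gfx90a
	fmt.Printf("gfx%d%d%x\n", major, minor, patch)
}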
...@@ -7,11 +7,13 @@ import ( ...@@ -7,11 +7,13 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"slices" "slices"
"strconv"
"strings" "strings"
"github.com/ollama/ollama/format"
) )
const ( const (
RocmStandardLocation = "C:\\Program Files\\AMD\\ROCm\\5.7\\bin" // TODO glob?
// TODO We're looking for this exact name to detect iGPUs since hipGetDeviceProperties never reports integrated==true // TODO We're looking for this exact name to detect iGPUs since hipGetDeviceProperties never reports integrated==true
iGPUName = "AMD Radeon(TM) Graphics" iGPUName = "AMD Radeon(TM) Graphics"
...@@ -19,39 +21,36 @@ const ( ...@@ -19,39 +21,36 @@ const (
var ( var (
// Used to validate if the given ROCm lib is usable // Used to validate if the given ROCm lib is usable
ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // TODO - probably include more coverage of files here... ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // TODO - probably include more coverage of files here...
RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\5.7\\bin"} // TODO glob?
) )
func AMDGetGPUInfo(resp *GpuInfo) { func AMDGetGPUInfo() []GpuInfo {
resp := []GpuInfo{}
hl, err := NewHipLib() hl, err := NewHipLib()
if err != nil { if err != nil {
slog.Debug(err.Error()) slog.Debug(err.Error())
return return nil
} }
defer hl.Release() defer hl.Release()
skip := map[int]interface{}{}
ids := []int{}
resp.memInfo.DeviceCount = 0
resp.memInfo.TotalMemory = 0
resp.memInfo.FreeMemory = 0
ver, err := hl.AMDDriverVersion() ver, err := hl.AMDDriverVersion()
if err == nil { if err == nil {
slog.Info("AMD Driver: " + ver) slog.Info("AMD Driver: " + ver)
} else { } else {
// For now this is benign, but we may eventually need to fail compatibility checks // For now this is benign, but we may eventually need to fail compatibility checks
slog.Debug(fmt.Sprintf("error looking up amd driver version: %s", err)) slog.Debug("error looking up amd driver version", "error", err)
} }
// Note: the HIP library automatically handles HIP_VISIBLE_DEVICES // Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
count := hl.HipGetDeviceCount() count := hl.HipGetDeviceCount()
if count == 0 { if count == 0 {
return return nil
} }
libDir, err := AMDValidateLibDir() libDir, err := AMDValidateLibDir()
if err != nil { if err != nil {
slog.Warn(fmt.Sprintf("unable to verify rocm library, will use cpu: %s", err)) slog.Warn("unable to verify rocm library, will use cpu", "error", err)
return return nil
} }
var supported []string var supported []string
...@@ -59,95 +58,120 @@ func AMDGetGPUInfo(resp *GpuInfo) { ...@@ -59,95 +58,120 @@ func AMDGetGPUInfo(resp *GpuInfo) {
if gfxOverride == "" { if gfxOverride == "" {
supported, err = GetSupportedGFX(libDir) supported, err = GetSupportedGFX(libDir)
if err != nil { if err != nil {
slog.Warn(fmt.Sprintf("failed to lookup supported GFX types, falling back to CPU mode: %s", err)) slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
return return nil
} }
} else { } else {
slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride) slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
} }
slog.Info(fmt.Sprintf("detected %d hip devices", count)) slog.Info("detected hip devices", "count", count)
// TODO how to determine the underlying device ID when visible devices is causing this to subset?
for i := 0; i < count; i++ { for i := 0; i < count; i++ {
ids = append(ids, i)
err = hl.HipSetDevice(i) err = hl.HipSetDevice(i)
if err != nil { if err != nil {
slog.Warn(fmt.Sprintf("[%d] %s", i, err)) slog.Warn("set device", "id", i, "error", err)
skip[i] = struct{}{}
continue continue
} }
props, err := hl.HipGetDeviceProperties(i) props, err := hl.HipGetDeviceProperties(i)
if err != nil { if err != nil {
slog.Warn(fmt.Sprintf("[%d] %s", i, err)) slog.Warn("get properties", "id", i, "error", err)
skip[i] = struct{}{}
continue continue
} }
n := bytes.IndexByte(props.Name[:], 0) n := bytes.IndexByte(props.Name[:], 0)
name := string(props.Name[:n]) name := string(props.Name[:n])
slog.Info(fmt.Sprintf("[%d] Name: %s", i, name)) // TODO is UUID actually populated on windows?
// Can luid be used on windows for setting visible devices (and is it actually set?)
n = bytes.IndexByte(props.GcnArchName[:], 0) n = bytes.IndexByte(props.GcnArchName[:], 0)
gfx := string(props.GcnArchName[:n]) gfx := string(props.GcnArchName[:n])
slog.Info(fmt.Sprintf("[%d] GcnArchName: %s", i, gfx)) slog.Info("hip device", "id", i, "name", name, "gfx", gfx)
var major, minor, patch string
switch len(gfx) {
case 6:
major, minor, patch = gfx[3:4], gfx[4:5], gfx[5:]
case 7:
major, minor, patch = gfx[3:5], gfx[5:6], gfx[6:]
}
//slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0 //slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0
// TODO Why isn't props.iGPU accurate!? // TODO Why isn't props.iGPU accurate!?
if strings.EqualFold(name, iGPUName) { if strings.EqualFold(name, iGPUName) {
slog.Info(fmt.Sprintf("iGPU detected [%d] skipping", i)) slog.Info("iGPU detected skipping", "id", i)
skip[i] = struct{}{}
continue continue
} }
if gfxOverride == "" { if gfxOverride == "" {
if !slices.Contains[[]string, string](supported, gfx) { if !slices.Contains[[]string, string](supported, gfx) {
slog.Warn(fmt.Sprintf("amdgpu [%d] %s is not supported by %s %v", i, gfx, libDir, supported)) slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
// TODO - consider discrete markdown just for ROCM troubleshooting? // TODO - consider discrete markdown just for ROCM troubleshooting?
slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage") slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
skip[i] = struct{}{}
continue continue
} else { } else {
slog.Info(fmt.Sprintf("amdgpu [%d] %s is supported", i, gfx)) slog.Info("amdgpu is supported", "gpu", i, "gpu_type", gfx)
} }
} }
totalMemory, freeMemory, err := hl.HipMemGetInfo() freeMemory, totalMemory, err := hl.HipMemGetInfo()
if err != nil { if err != nil {
slog.Warn(fmt.Sprintf("[%d] %s", i, err)) slog.Warn("get mem info", "id", i, "error", err)
continue continue
} }
// TODO according to docs, freeMem may lie on windows! // iGPU detection, remove this check once we can support an iGPU variant of the rocm library
slog.Info(fmt.Sprintf("[%d] Total Mem: %d", i, totalMemory)) if totalMemory < IGPUMemLimit {
slog.Info(fmt.Sprintf("[%d] Free Mem: %d", i, freeMemory)) slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", i, "total", format.HumanBytes2(totalMemory))
resp.memInfo.DeviceCount++ continue
resp.memInfo.TotalMemory += totalMemory }
resp.memInfo.FreeMemory += freeMemory
} // TODO revisit this once ROCm v6 is available on windows.
if resp.memInfo.DeviceCount > 0 { // v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
resp.Library = "rocm" slog.Info("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
} slog.Info("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
// Abort if all GPUs are skipped gpuInfo := GpuInfo{
if len(skip) >= count { Library: "rocm",
slog.Info("all detected amdgpus are skipped, falling back to CPU") memInfo: memInfo{
return TotalMemory: totalMemory,
} FreeMemory: freeMemory,
if len(skip) > 0 { },
amdSetVisibleDevices(ids, skip) ID: fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
DependencyPath: libDir,
MinimumMemory: rocmMinimumMemory,
}
if major != "" {
gpuInfo.Major, err = strconv.Atoi(major)
if err != nil {
slog.Info("failed to parse version", "version", gfx, "error", err)
}
}
if minor != "" {
gpuInfo.Minor, err = strconv.Atoi(minor)
if err != nil {
slog.Info("failed to parse version", "version", gfx, "error", err)
}
}
if patch != "" {
// Patch rev is hex; e.g. gfx90a
p, err := strconv.ParseInt(patch, 16, 0)
if err != nil {
slog.Info("failed to parse version", "version", gfx, "error", err)
} else {
gpuInfo.Patch = int(p)
}
}
if gpuInfo.Major < RocmComputeMin {
slog.Warn(fmt.Sprintf("amdgpu [%s] too old gfx%d%d%x", gpuInfo.ID, gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch))
continue
}
resp = append(resp, gpuInfo)
} }
UpdatePath(libDir)
return resp
} }
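A small sketch of how the GcnArchName slicing above splits a gfx string into version components, with the patch component parsed as hex as in the ParseInt call later in the loop. The sample names are illustrative:

package main

import (
	"fmt"
	"strconv"
)

// splitGFX mirrors the length-based slicing above: 6-character names carry a
// one-digit major version, 7-character names a two-digit major version.
func splitGFX(gfx string) (major, minor, patch string) {
	switch len(gfx) {
	case 6:
		return gfx[3:4], gfx[4:5], gfx[5:]
	case 7:
		return gfx[3:5], gfx[5:6], gfx[6:]
	}
	return "", "", ""
}

func main() {
	for _, gfx := range []string{"gfx90a", "gfx1030"} {
		major, minor, patch := splitGFX(gfx)
		// The patch component is hex, e.g. the "a" in gfx90a is 10.
		p, _ := strconv.ParseInt(patch, 16, 0)
		fmt.Println(gfx, "->", major, minor, p)
	}
}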
func AMDValidateLibDir() (string, error) { func AMDValidateLibDir() (string, error) {
// On windows non-admins typically can't create links libDir, err := commonAMDValidateLibDir()
// so instead of trying to rely on rpath and a link in
// $LibDir/rocm, we instead rely on setting PATH to point
// to the location of the ROCm library
// Installer payload location if we're running the installed binary
exe, err := os.Executable()
if err == nil { if err == nil {
rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm") return libDir, nil
if rocmLibUsable(rocmTargetDir) {
slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
return rocmTargetDir, nil
}
} }
// Installer payload (if we're running from some other location) // Installer payload (if we're running from some other location)
...@@ -159,21 +183,6 @@ func AMDValidateLibDir() (string, error) { ...@@ -159,21 +183,6 @@ func AMDValidateLibDir() (string, error) {
return rocmTargetDir, nil return rocmTargetDir, nil
} }
// Prefer explicit HIP env var
hipPath := os.Getenv("HIP_PATH")
if hipPath != "" {
hipLibDir := filepath.Join(hipPath, "bin")
if rocmLibUsable(hipLibDir) {
slog.Debug("detected ROCM via HIP_PATH=" + hipPath)
return hipLibDir, nil
}
}
// Well known location(s)
if rocmLibUsable(RocmStandardLocation) {
return RocmStandardLocation, nil
}
// Should not happen on windows since we include it in the installer, but stand-alone binary might hit this // Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm") slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm")
return "", fmt.Errorf("no suitable rocm found, falling back to CPU") return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
......
...@@ -12,6 +12,8 @@ import ( ...@@ -12,6 +12,8 @@ import (
"sync" "sync"
"syscall" "syscall"
"time" "time"
"github.com/ollama/ollama/server/envconfig"
) )
var ( var (
...@@ -24,8 +26,16 @@ func PayloadsDir() (string, error) { ...@@ -24,8 +26,16 @@ func PayloadsDir() (string, error) {
defer lock.Unlock() defer lock.Unlock()
var err error var err error
if payloadsDir == "" { if payloadsDir == "" {
runnersDir := envconfig.RunnersDir
if runnersDir != "" {
payloadsDir = runnersDir
return payloadsDir, nil
}
// The remainder only applies on non-windows where we still carry payloads in the main executable
cleanupTmpDirs() cleanupTmpDirs()
tmpDir := os.Getenv("OLLAMA_TMPDIR") tmpDir := envconfig.TmpDir
if tmpDir == "" { if tmpDir == "" {
tmpDir, err = os.MkdirTemp("", "ollama") tmpDir, err = os.MkdirTemp("", "ollama")
if err != nil { if err != nil {
...@@ -80,7 +90,7 @@ func cleanupTmpDirs() { ...@@ -80,7 +90,7 @@ func cleanupTmpDirs() {
} }
err = os.RemoveAll(d) err = os.RemoveAll(d)
if err != nil { if err != nil {
slog.Debug(fmt.Sprintf("unable to cleanup stale tmpdir %s: %s", d, err)) slog.Debug("unable to cleanup stale tmpdir", "path", d, "error", err)
} }
} }
} }
...@@ -88,7 +98,8 @@ func cleanupTmpDirs() { ...@@ -88,7 +98,8 @@ func cleanupTmpDirs() {
func Cleanup() { func Cleanup() {
lock.Lock() lock.Lock()
defer lock.Unlock() defer lock.Unlock()
if payloadsDir != "" { runnersDir := envconfig.RunnersDir
if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
// We want to fully clean up the tmpdir parent of the payloads dir // We want to fully clean up the tmpdir parent of the payloads dir
tmpDir := filepath.Clean(filepath.Join(payloadsDir, "..")) tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
slog.Debug("cleaning up", "dir", tmpDir) slog.Debug("cleaning up", "dir", tmpDir)
...@@ -120,7 +131,7 @@ func UpdatePath(dir string) { ...@@ -120,7 +131,7 @@ func UpdatePath(dir string) {
} }
} }
newPath := strings.Join(append([]string{dir}, pathComponents...), ";") newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
slog.Info(fmt.Sprintf("Updating PATH to %s", newPath)) slog.Info("updating", "PATH", newPath)
os.Setenv("PATH", newPath) os.Setenv("PATH", newPath)
} }
// linux and darwin rely on rpath // linux and darwin rely on rpath
......
//go:build linux || windows
package gpu
import (
"log/slog"
"strings"
)
func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
ids := []string{}
for _, info := range gpuInfo {
if info.Library != "cuda" {
// TODO shouldn't happen if things are wired correctly...
slog.Debug("cudaGetVisibleDevicesEnv skipping over non-cuda device", "library", info.Library)
continue
}
ids = append(ids, info.ID)
}
return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
}
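A short usage sketch for the helper above, assuming it sits alongside this code in the gpu package; the GpuInfo values are made up for illustration:

// (inside package gpu, with "fmt" imported)
func exampleCudaVisibleDevices() {
	gpus := []GpuInfo{
		{Library: "cuda", ID: "0"},
		{Library: "rocm", ID: "0"}, // not a cuda device, skipped with a debug log
		{Library: "cuda", ID: "1"},
	}
	key, val := cudaGetVisibleDevicesEnv(gpus)
	fmt.Println(key, val) // CUDA_VISIBLE_DEVICES 0,1
}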
...@@ -16,22 +16,23 @@ import ( ...@@ -16,22 +16,23 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"runtime" "runtime"
"strconv"
"strings" "strings"
"sync" "sync"
"unsafe" "unsafe"
"github.com/ollama/ollama/format" "github.com/ollama/ollama/format"
"github.com/ollama/ollama/server/envconfig"
) )
type handles struct { type handles struct {
nvml *C.nvml_handle_t deviceCount int
cudart *C.cudart_handle_t cudart *C.cudart_handle_t
nvcuda *C.nvcuda_handle_t
} }
const ( const (
cudaMinimumMemory = 457 * format.MebiByte cudaMinimumMemory = 256 * format.MebiByte
rocmMinimumMemory = 457 * format.MebiByte rocmMinimumMemory = 256 * format.MebiByte
) )
var gpuMutex sync.Mutex var gpuMutex sync.Mutex
...@@ -39,26 +40,10 @@ var gpuMutex sync.Mutex ...@@ -39,26 +40,10 @@ var gpuMutex sync.Mutex
// With our current CUDA compile flags, older than 5.0 will not work properly // With our current CUDA compile flags, older than 5.0 will not work properly
var CudaComputeMin = [2]C.int{5, 0} var CudaComputeMin = [2]C.int{5, 0}
// Possible locations for the nvidia-ml library var RocmComputeMin = 9
var NvmlLinuxGlobs = []string{
"/usr/local/cuda/lib64/libnvidia-ml.so*",
"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so*",
"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so*",
"/usr/lib/wsl/lib/libnvidia-ml.so*",
"/usr/lib/wsl/drivers/*/libnvidia-ml.so*",
"/opt/cuda/lib64/libnvidia-ml.so*",
"/usr/lib*/libnvidia-ml.so*",
"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
"/usr/local/lib*/libnvidia-ml.so*",
// TODO: are these stubs ever valid?
"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
}
var NvmlWindowsGlobs = []string{ // TODO find a better way to detect iGPU instead of minimum memory
"c:\\Windows\\System32\\nvml.dll", const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU
}
var CudartLinuxGlobs = []string{ var CudartLinuxGlobs = []string{
"/usr/local/cuda/lib64/libcudart.so*", "/usr/local/cuda/lib64/libcudart.so*",
...@@ -79,6 +64,22 @@ var CudartWindowsGlobs = []string{ ...@@ -79,6 +64,22 @@ var CudartWindowsGlobs = []string{
"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll", "c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
} }
var NvcudaLinuxGlobs = []string{
"/usr/local/cuda*/targets/*/lib/libcuda.so*",
"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
"/usr/lib/*-linux-gnu/libcuda.so*",
"/usr/lib/wsl/lib/libcuda.so*",
"/usr/lib/wsl/drivers/*/libcuda.so*",
"/opt/cuda/lib*/libcuda.so*",
"/usr/local/cuda/lib*/libcuda.so*",
"/usr/lib*/libcuda.so*",
"/usr/local/lib*/libcuda.so*",
}
var NvcudaWindowsGlobs = []string{
"c:\\windows\\system*\\nvcuda.dll",
}
// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed. // Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices. // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
var CudaTegra string = os.Getenv("JETSON_JETPACK") var CudaTegra string = os.Getenv("JETSON_JETPACK")
...@@ -88,61 +89,62 @@ func initGPUHandles() *handles { ...@@ -88,61 +89,62 @@ func initGPUHandles() *handles {
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing // TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
gpuHandles := &handles{nil, nil} gpuHandles := &handles{}
var nvmlMgmtName string
var nvmlMgmtPatterns []string
var cudartMgmtName string var cudartMgmtName string
var cudartMgmtPatterns []string var cudartMgmtPatterns []string
var nvcudaMgmtName string
var nvcudaMgmtPatterns []string
tmpDir, _ := PayloadsDir() tmpDir, _ := PayloadsDir()
switch runtime.GOOS { switch runtime.GOOS {
case "windows": case "windows":
nvmlMgmtName = "nvml.dll"
nvmlMgmtPatterns = make([]string, len(NvmlWindowsGlobs))
copy(nvmlMgmtPatterns, NvmlWindowsGlobs)
cudartMgmtName = "cudart64_*.dll" cudartMgmtName = "cudart64_*.dll"
localAppData := os.Getenv("LOCALAPPDATA") localAppData := os.Getenv("LOCALAPPDATA")
cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)} cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...) cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
// Aligned with driver, we can't carry as payloads
nvcudaMgmtName = "nvcuda.dll"
nvcudaMgmtPatterns = NvcudaWindowsGlobs
case "linux": case "linux":
nvmlMgmtName = "libnvidia-ml.so"
nvmlMgmtPatterns = make([]string, len(NvmlLinuxGlobs))
copy(nvmlMgmtPatterns, NvmlLinuxGlobs)
cudartMgmtName = "libcudart.so*" cudartMgmtName = "libcudart.so*"
if tmpDir != "" { if tmpDir != "" {
// TODO - add "payloads" for subprocess // TODO - add "payloads" for subprocess
cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)} cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
} }
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...) cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
// Aligned with driver, we can't carry as payloads
nvcudaMgmtName = "libcuda.so*"
nvcudaMgmtPatterns = NvcudaLinuxGlobs
default: default:
return gpuHandles return gpuHandles
} }
slog.Info("Detecting GPU type") slog.Info("Detecting GPUs")
cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns) nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
if len(cudartLibPaths) > 0 { if len(nvcudaLibPaths) > 0 {
cudart := LoadCUDARTMgmt(cudartLibPaths) deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
if cudart != nil { if nvcuda != nil {
slog.Info("Nvidia GPU detected via cudart") slog.Info("detected GPUs", "count", deviceCount, "library", libPath)
gpuHandles.cudart = cudart gpuHandles.nvcuda = nvcuda
gpuHandles.deviceCount = deviceCount
return gpuHandles return gpuHandles
} }
} }
// TODO once we build confidence, remove this and the gpu_info_nvml.[ch] files cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
nvmlLibPaths := FindGPULibs(nvmlMgmtName, nvmlMgmtPatterns) if len(cudartLibPaths) > 0 {
if len(nvmlLibPaths) > 0 { deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
nvml := LoadNVMLMgmt(nvmlLibPaths) if cudart != nil {
if nvml != nil { slog.Info("detected GPUs", "library", libPath, "count", deviceCount)
slog.Info("Nvidia GPU detected via nvidia-ml") gpuHandles.cudart = cudart
gpuHandles.nvml = nvml gpuHandles.deviceCount = deviceCount
return gpuHandles return gpuHandles
} }
} }
return gpuHandles return gpuHandles
} }
func GetGPUInfo() GpuInfo { func GetGPUInfo() GpuInfoList {
// TODO - consider exploring lspci (and equivalent on windows) to check for // TODO - consider exploring lspci (and equivalent on windows) to check for
// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries // GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
gpuMutex.Lock() gpuMutex.Lock()
...@@ -150,12 +152,12 @@ func GetGPUInfo() GpuInfo { ...@@ -150,12 +152,12 @@ func GetGPUInfo() GpuInfo {
gpuHandles := initGPUHandles() gpuHandles := initGPUHandles()
defer func() { defer func() {
if gpuHandles.nvml != nil {
C.nvml_release(*gpuHandles.nvml)
}
if gpuHandles.cudart != nil { if gpuHandles.cudart != nil {
C.cudart_release(*gpuHandles.cudart) C.cudart_release(*gpuHandles.cudart)
} }
if gpuHandles.nvcuda != nil {
C.nvcuda_release(*gpuHandles.nvcuda)
}
}() }()
// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX // All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
...@@ -164,73 +166,75 @@ func GetGPUInfo() GpuInfo { ...@@ -164,73 +166,75 @@ func GetGPUInfo() GpuInfo {
slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.") slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.")
} }
// On windows we bundle the nvidia library one level above the runner dir
depPath := ""
if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
depPath = filepath.Dir(envconfig.RunnersDir)
}
var memInfo C.mem_info_t var memInfo C.mem_info_t
resp := GpuInfo{} resp := []GpuInfo{}
if gpuHandles.nvml != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
C.nvml_check_vram(*gpuHandles.nvml, &memInfo) // NVIDIA first
if memInfo.err != nil { for i := 0; i < gpuHandles.deviceCount; i++ {
slog.Info(fmt.Sprintf("[nvidia-ml] error looking up NVML GPU memory: %s", C.GoString(memInfo.err))) // TODO once we support CPU compilation variants of GPU libraries refine this...
C.free(unsafe.Pointer(memInfo.err)) if cpuVariant == "" && runtime.GOARCH == "amd64" {
} else if memInfo.count > 0 { continue
// Verify minimum compute capability }
var cc C.nvml_compute_capability_t gpuInfo := GpuInfo{
C.nvml_compute_capability(*gpuHandles.nvml, &cc) Library: "cuda",
if cc.err != nil { }
slog.Info(fmt.Sprintf("[nvidia-ml] error looking up NVML GPU compute capability: %s", C.GoString(cc.err))) if gpuHandles.cudart != nil {
C.free(unsafe.Pointer(cc.err)) C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) { } else {
slog.Info(fmt.Sprintf("[nvidia-ml] NVML CUDA Compute Capability detected: %d.%d", cc.major, cc.minor)) C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
resp.Library = "cuda"
resp.MinimumMemory = cudaMinimumMemory
} else {
slog.Info(fmt.Sprintf("[nvidia-ml] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
}
} }
} else if gpuHandles.cudart != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
C.cudart_check_vram(*gpuHandles.cudart, &memInfo)
if memInfo.err != nil { if memInfo.err != nil {
slog.Info(fmt.Sprintf("[cudart] error looking up CUDART GPU memory: %s", C.GoString(memInfo.err))) slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err)) C.free(unsafe.Pointer(memInfo.err))
} else if memInfo.count > 0 { continue
// Verify minimum compute capability
var cc C.cudart_compute_capability_t
C.cudart_compute_capability(*gpuHandles.cudart, &cc)
if cc.err != nil {
slog.Info(fmt.Sprintf("[cudart] error looking up CUDA compute capability: %s", C.GoString(cc.err)))
C.free(unsafe.Pointer(cc.err))
} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
slog.Info(fmt.Sprintf("[cudart] CUDART CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
resp.Library = "cuda"
resp.MinimumMemory = cudaMinimumMemory
} else {
slog.Info(fmt.Sprintf("[cudart] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
}
} }
} else { if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
AMDGetGPUInfo(&resp) slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
if resp.Library != "" { continue
resp.MinimumMemory = rocmMinimumMemory
return resp
} }
gpuInfo.TotalMemory = uint64(memInfo.total)
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
gpuInfo.Major = int(memInfo.major)
gpuInfo.Minor = int(memInfo.minor)
gpuInfo.MinimumMemory = cudaMinimumMemory
gpuInfo.DependencyPath = depPath
// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
resp = append(resp, gpuInfo)
} }
if resp.Library == "" {
// Then AMD
resp = append(resp, AMDGetGPUInfo()...)
if len(resp) == 0 {
C.cpu_check_ram(&memInfo) C.cpu_check_ram(&memInfo)
resp.Library = "cpu" if memInfo.err != nil {
resp.Variant = cpuVariant slog.Info("error looking up CPU memory", "error", C.GoString(memInfo.err))
} C.free(unsafe.Pointer(memInfo.err))
if memInfo.err != nil { return resp
slog.Info(fmt.Sprintf("error looking up CPU memory: %s", C.GoString(memInfo.err))) }
C.free(unsafe.Pointer(memInfo.err)) gpuInfo := GpuInfo{
return resp Library: "cpu",
Variant: cpuVariant,
}
gpuInfo.TotalMemory = uint64(memInfo.total)
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
resp = append(resp, gpuInfo)
} }
resp.DeviceCount = uint32(memInfo.count)
resp.FreeMemory = uint64(memInfo.free)
resp.TotalMemory = uint64(memInfo.total)
return resp return resp
} }
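GetGPUInfo now returns a list rather than a single aggregated struct, so callers iterate and aggregate per device. A hedged usage sketch, assuming it sits in the gpu package; the aggregation itself is illustrative:

// (inside package gpu, with "log/slog" imported)
func logFreeVRAMByLibrary() {
	byLib := map[string]uint64{}
	for _, g := range GetGPUInfo() {
		// Library is "cuda", "rocm", or "cpu"; FreeMemory comes from the per-device memInfo.
		byLib[g.Library] += g.FreeMemory
	}
	slog.Info("free memory by library", "free", byLib)
}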
func getCPUMem() (memInfo, error) { func GetCPUMem() (memInfo, error) {
var ret memInfo var ret memInfo
var info C.mem_info_t var info C.mem_info_t
C.cpu_check_ram(&info) C.cpu_check_ram(&info)
...@@ -243,29 +247,12 @@ func getCPUMem() (memInfo, error) { ...@@ -243,29 +247,12 @@ func getCPUMem() (memInfo, error) {
return ret, nil return ret, nil
} }
func CheckVRAM() (uint64, error) { func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
userLimit := os.Getenv("OLLAMA_MAX_VRAM")
if userLimit != "" {
avail, err := strconv.ParseInt(userLimit, 10, 64)
if err != nil {
return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
}
slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
return uint64(avail), nil
}
gpuInfo := GetGPUInfo()
if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
return gpuInfo.FreeMemory, nil
}
return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
}
func FindGPULibs(baseLibName string, patterns []string) []string {
// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them // Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
var ldPaths []string var ldPaths []string
var patterns []string
gpuLibPaths := []string{} gpuLibPaths := []string{}
slog.Info(fmt.Sprintf("Searching for GPU management library %s", baseLibName)) slog.Debug("Searching for GPU library", "name", baseLibName)
switch runtime.GOOS { switch runtime.GOOS {
case "windows": case "windows":
...@@ -283,8 +270,14 @@ func FindGPULibs(baseLibName string, patterns []string) []string { ...@@ -283,8 +270,14 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
} }
patterns = append(patterns, filepath.Join(d, baseLibName+"*")) patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
} }
slog.Debug(fmt.Sprintf("gpu management search paths: %v", patterns)) patterns = append(patterns, defaultPatterns...)
slog.Debug("gpu library search", "globs", patterns)
for _, pattern := range patterns { for _, pattern := range patterns {
// Nvidia PhysX known to return bogus results
if strings.Contains(pattern, "PhysX") {
slog.Debug("skipping PhysX cuda library path", "path", pattern)
}
// Ignore glob discovery errors // Ignore glob discovery errors
matches, _ := filepath.Glob(pattern) matches, _ := filepath.Glob(pattern)
for _, match := range matches { for _, match := range matches {
...@@ -311,47 +304,66 @@ func FindGPULibs(baseLibName string, patterns []string) []string { ...@@ -311,47 +304,66 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
} }
} }
} }
slog.Info(fmt.Sprintf("Discovered GPU libraries: %v", gpuLibPaths)) slog.Debug("discovered GPU libraries", "paths", gpuLibPaths)
return gpuLibPaths return gpuLibPaths
} }
func LoadNVMLMgmt(nvmlLibPaths []string) *C.nvml_handle_t { func LoadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string) {
var resp C.nvml_init_resp_t var resp C.cudart_init_resp_t
resp.ch.verbose = getVerboseState() resp.ch.verbose = getVerboseState()
for _, libPath := range nvmlLibPaths { for _, libPath := range cudartLibPaths {
lib := C.CString(libPath) lib := C.CString(libPath)
defer C.free(unsafe.Pointer(lib)) defer C.free(unsafe.Pointer(lib))
C.nvml_init(lib, &resp) C.cudart_init(lib, &resp)
if resp.err != nil { if resp.err != nil {
slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err))) slog.Debug("Unable to load cudart", "library", libPath, "error", C.GoString(resp.err))
C.free(unsafe.Pointer(resp.err)) C.free(unsafe.Pointer(resp.err))
} else { } else {
return &resp.ch return int(resp.num_devices), &resp.ch, libPath
} }
} }
return nil return 0, nil, ""
} }
func LoadCUDARTMgmt(cudartLibPaths []string) *C.cudart_handle_t { func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
var resp C.cudart_init_resp_t var resp C.nvcuda_init_resp_t
resp.ch.verbose = getVerboseState() resp.ch.verbose = getVerboseState()
for _, libPath := range cudartLibPaths { for _, libPath := range nvcudaLibPaths {
lib := C.CString(libPath) lib := C.CString(libPath)
defer C.free(unsafe.Pointer(lib)) defer C.free(unsafe.Pointer(lib))
C.cudart_init(lib, &resp) C.nvcuda_init(lib, &resp)
if resp.err != nil { if resp.err != nil {
slog.Info(fmt.Sprintf("Unable to load cudart CUDA management library %s: %s", libPath, C.GoString(resp.err))) slog.Debug("Unable to load nvcuda", "library", libPath, "error", C.GoString(resp.err))
C.free(unsafe.Pointer(resp.err)) C.free(unsafe.Pointer(resp.err))
} else { } else {
return &resp.ch return int(resp.num_devices), &resp.ch, libPath
} }
} }
return nil return 0, nil, ""
} }
func getVerboseState() C.uint16_t { func getVerboseState() C.uint16_t {
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" { if envconfig.Debug {
return C.uint16_t(1) return C.uint16_t(1)
} }
return C.uint16_t(0) return C.uint16_t(0)
} }
// Given the list of GPUs this instantiation is targeted for,
// figure out the visible devices environment variable
//
// If different libraries are detected, the first one is what we use
func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
if len(l) == 0 {
return "", ""
}
switch l[0].Library {
case "cuda":
return cudaGetVisibleDevicesEnv(l)
case "rocm":
return rocmGetVisibleDevicesEnv(l)
default:
slog.Debug("no filter required for library " + l[0].Library)
return "", ""
}
}
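A brief usage sketch for the dispatcher above, assuming the gpu package; a real caller would more likely inject the variable into a subprocess environment rather than the current process:

// (inside package gpu, with "os" imported)
func applyVisibleDevices(gpus GpuInfoList) {
	if key, val := gpus.GetVisibleDevicesEnv(); key != "" {
		os.Setenv(key, val) // e.g. CUDA_VISIBLE_DEVICES=0,1
	}
}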
...@@ -9,52 +9,47 @@ package gpu ...@@ -9,52 +9,47 @@ package gpu
*/ */
import "C" import "C"
import ( import (
"fmt"
"log/slog"
"os"
"runtime" "runtime"
"strconv"
)
// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
func CheckVRAM() (uint64, error) {
userLimit := os.Getenv("OLLAMA_MAX_VRAM")
if userLimit != "" {
avail, err := strconv.ParseInt(userLimit, 10, 64)
if err != nil {
return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
}
slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
return uint64(avail), nil
}
if runtime.GOARCH == "amd64" { "github.com/ollama/ollama/format"
// gpu not supported, this may not be metal )
return 0, nil
}
return uint64(C.getRecommendedMaxVRAM()), nil const (
} metalMinimumMemory = 384 * format.MebiByte
)
func GetGPUInfo() GpuInfo { func GetGPUInfo() GpuInfoList {
mem, _ := getCPUMem() mem, _ := GetCPUMem()
if runtime.GOARCH == "amd64" { if runtime.GOARCH == "amd64" {
return GpuInfo{ return []GpuInfo{
Library: "cpu", {
Variant: GetCPUVariant(), Library: "cpu",
memInfo: mem, Variant: GetCPUVariant(),
memInfo: mem,
},
} }
} }
return GpuInfo{ info := GpuInfo{
Library: "metal", Library: "metal",
memInfo: mem, ID: "0",
} }
info.TotalMemory = uint64(C.getRecommendedMaxVRAM())
// TODO is there a way to gather actual allocated video memory? (currentAllocatedSize doesn't work)
info.FreeMemory = info.TotalMemory
info.MinimumMemory = metalMinimumMemory
return []GpuInfo{info}
} }
func getCPUMem() (memInfo, error) { func GetCPUMem() (memInfo, error) {
return memInfo{ return memInfo{
TotalMemory: uint64(C.getPhysicalMemory()), TotalMemory: uint64(C.getPhysicalMemory()),
FreeMemory: 0, FreeMemory: 0,
DeviceCount: 1,
}, nil }, nil
} }
func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
// No-op on darwin
return "", ""
}
...@@ -38,12 +38,17 @@ ...@@ -38,12 +38,17 @@
extern "C" { extern "C" {
#endif #endif
#define GPU_ID_LEN 64
typedef struct mem_info { typedef struct mem_info {
char *err; // If non-null, caller responsible for freeing
char gpu_id[GPU_ID_LEN];
uint64_t total; uint64_t total;
uint64_t free; uint64_t free;
unsigned int count;
int igpu_index; // If >= 0, we detected an integrated GPU to ignore // Compute Capability
char *err; // If non-null, caller responsible for freeing int major;
int minor;
} mem_info_t; } mem_info_t;
void cpu_check_ram(mem_info_t *resp); void cpu_check_ram(mem_info_t *resp);
...@@ -52,8 +57,8 @@ void cpu_check_ram(mem_info_t *resp); ...@@ -52,8 +57,8 @@ void cpu_check_ram(mem_info_t *resp);
} }
#endif #endif
#include "gpu_info_nvml.h"
#include "gpu_info_cudart.h" #include "gpu_info_cudart.h"
#include "gpu_info_nvcuda.h"
#endif // __GPU_INFO_H__ #endif // __GPU_INFO_H__
#endif // __APPLE__ #endif // __APPLE__
\ No newline at end of file
...@@ -8,9 +8,11 @@ void cpu_check_ram(mem_info_t *resp) { ...@@ -8,9 +8,11 @@ void cpu_check_ram(mem_info_t *resp) {
MEMORYSTATUSEX info; MEMORYSTATUSEX info;
info.dwLength = sizeof(info); info.dwLength = sizeof(info);
if (GlobalMemoryStatusEx(&info) != 0) { if (GlobalMemoryStatusEx(&info) != 0) {
resp->count = 1;
resp->total = info.ullTotalPhys; resp->total = info.ullTotalPhys;
resp->free = info.ullAvailPhys; resp->free = info.ullAvailPhys;
resp->major = 0;
resp->minor = 0;
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
} else { } else {
resp->err = LOAD_ERR(); resp->err = LOAD_ERR();
} }
...@@ -27,9 +29,11 @@ void cpu_check_ram(mem_info_t *resp) { ...@@ -27,9 +29,11 @@ void cpu_check_ram(mem_info_t *resp) {
if (sysinfo(&info) != 0) { if (sysinfo(&info) != 0) {
resp->err = strdup(strerror(errno)); resp->err = strdup(strerror(errno));
} else { } else {
resp->count = 1;
resp->total = info.totalram * info.mem_unit; resp->total = info.totalram * info.mem_unit;
resp->free = info.freeram * info.mem_unit; resp->free = info.freeram * info.mem_unit;
resp->major = 0;
resp->minor = 0;
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
} }
return; return;
} }
......
@@ -6,6 +6,7 @@
 void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
   cudartReturn_t ret;
   resp->err = NULL;
+  resp->num_devices = 0;
   const int buflen = 256;
   char buf[buflen + 1];
   int i;
@@ -21,6 +22,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
       {"cudaGetDeviceCount", (void *)&resp->ch.cudaGetDeviceCount},
       {"cudaDeviceGetAttribute", (void *)&resp->ch.cudaDeviceGetAttribute},
       {"cudaDriverGetVersion", (void *)&resp->ch.cudaDriverGetVersion},
+      {"cudaGetDeviceProperties", (void *)&resp->ch.cudaGetDeviceProperties},
       {NULL, NULL},
   };
@@ -36,13 +38,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
     return;
   }
 
-  // TODO once we've squashed the remaining corner cases remove this log
-  LOG(resp->ch.verbose, "wiring cudart library functions in %s\n", cudart_lib_path);
-
   for (i = 0; l[i].s != NULL; i++) {
-    // TODO once we've squashed the remaining corner cases remove this log
-    LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
-
     *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
     if (!l[i].p) {
       char *msg = LOAD_ERR();
@@ -63,7 +59,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
     UNLOAD_LIBRARY(resp->ch.handle);
     resp->ch.handle = NULL;
     if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
-      resp->err = strdup("your nvidia driver is too old or missing, please upgrade to run ollama");
+      resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
       return;
     }
     snprintf(buf, buflen, "cudart init failure: %d", ret);
@@ -85,110 +81,95 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
     driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
     LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
   }
+
+  ret = (*resp->ch.cudaGetDeviceCount)(&resp->num_devices);
+  if (ret != CUDART_SUCCESS) {
+    LOG(resp->ch.verbose, "cudaGetDeviceCount err: %d\n", ret);
+    UNLOAD_LIBRARY(resp->ch.handle);
+    resp->ch.handle = NULL;
+    snprintf(buf, buflen, "unable to get device count: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
 }
 
-void cudart_check_vram(cudart_handle_t h, mem_info_t *resp) {
+void cudart_check_vram(cudart_handle_t h, int i, mem_info_t *resp) {
   resp->err = NULL;
   cudartMemory_t memInfo = {0,0,0};
   cudartReturn_t ret;
   const int buflen = 256;
   char buf[buflen + 1];
-  int i;
 
   if (h.handle == NULL) {
     resp->err = strdup("cudart handle isn't initialized");
     return;
   }
 
-  // cudaGetDeviceCount takes int type, resp-> count is uint
-  int deviceCount;
-  ret = (*h.cudaGetDeviceCount)(&deviceCount);
+  ret = (*h.cudaSetDevice)(i);
   if (ret != CUDART_SUCCESS) {
-    snprintf(buf, buflen, "unable to get device count: %d", ret);
+    snprintf(buf, buflen, "cudart device failed to initialize");
     resp->err = strdup(buf);
     return;
-  } else {
-    resp->count = (unsigned int)deviceCount;
   }
 
-  resp->total = 0;
-  resp->free = 0;
-  for (i = 0; i < resp-> count; i++) {
-    ret = (*h.cudaSetDevice)(i);
-    if (ret != CUDART_SUCCESS) {
-      snprintf(buf, buflen, "cudart device failed to initialize");
-      resp->err = strdup(buf);
-      return;
-    }
-    ret = (*h.cudaMemGetInfo)(&memInfo.free, &memInfo.total);
-    if (ret != CUDART_SUCCESS) {
-      snprintf(buf, buflen, "cudart device memory info lookup failure %d", ret);
-      resp->err = strdup(buf);
-      return;
-    }
-
-    LOG(h.verbose, "[%d] CUDA totalMem %lu\n", i, memInfo.total);
-    LOG(h.verbose, "[%d] CUDA freeMem %lu\n", i, memInfo.free);
-    resp->total += memInfo.total;
-    resp->free += memInfo.free;
-  }
-}
-
-void cudart_compute_capability(cudart_handle_t h, cudart_compute_capability_t *resp) {
-  resp->err = NULL;
-  resp->major = 0;
-  resp->minor = 0;
-  int major = 0;
-  int minor = 0;
-  cudartReturn_t ret;
-  const int buflen = 256;
-  char buf[buflen + 1];
-  int i;
-
-  if (h.handle == NULL) {
-    resp->err = strdup("cudart handle not initialized");
-    return;
-  }
-
-  int devices;
-  ret = (*h.cudaGetDeviceCount)(&devices);
-  if (ret != CUDART_SUCCESS) {
-    snprintf(buf, buflen, "unable to get cudart device count: %d", ret);
-    resp->err = strdup(buf);
-    return;
-  }
-
-  for (i = 0; i < devices; i++) {
-    ret = (*h.cudaSetDevice)(i);
-    if (ret != CUDART_SUCCESS) {
-      snprintf(buf, buflen, "cudart device failed to initialize");
-      resp->err = strdup(buf);
-      return;
-    }
-
-    ret = (*h.cudaDeviceGetAttribute)(&major, cudartDevAttrComputeCapabilityMajor, i);
-    if (ret != CUDART_SUCCESS) {
-      snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
-      resp->err = strdup(buf);
-      return;
-    }
-    ret = (*h.cudaDeviceGetAttribute)(&minor, cudartDevAttrComputeCapabilityMinor, i);
-    if (ret != CUDART_SUCCESS) {
-      snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
-      resp->err = strdup(buf);
-      return;
-    }
-
-    // Report the lowest major.minor we detect as that limits our compatibility
-    if (resp->major == 0 || resp->major > major ) {
-      resp->major = major;
-      resp->minor = minor;
-    } else if ( resp->major == major && resp->minor > minor ) {
-      resp->minor = minor;
-    }
-  }
+  cudaDeviceProp_t props;
+  ret = (*h.cudaGetDeviceProperties)(&props, i);
+  if (ret != CUDART_SUCCESS) {
+    LOG(h.verbose, "[%d] device properties lookup failure: %d\n", i, ret);
+    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
+    resp->major = 0;
+    resp->minor = 0;
+  } else {
+    int allNull = 1;
+    for (int j = 0; j < 16; j++) {
+      if (props.uuid.bytes[j] != 0) {
+        allNull = 0;
+        break;
+      }
+    }
+    if (allNull != 0) {
+      snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
+    } else {
+      // GPU-d110a105-ac29-1d54-7b49-9c90440f215b
+      snprintf(&resp->gpu_id[0], GPU_ID_LEN,
+          "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+          props.uuid.bytes[0],
+          props.uuid.bytes[1],
+          props.uuid.bytes[2],
+          props.uuid.bytes[3],
+          props.uuid.bytes[4],
+          props.uuid.bytes[5],
+          props.uuid.bytes[6],
+          props.uuid.bytes[7],
+          props.uuid.bytes[8],
+          props.uuid.bytes[9],
+          props.uuid.bytes[10],
+          props.uuid.bytes[11],
+          props.uuid.bytes[12],
+          props.uuid.bytes[13],
+          props.uuid.bytes[14],
+          props.uuid.bytes[15]
+        );
+    }
+    resp->major = props.major;
+    resp->minor = props.minor;
+
+    // TODO add other useful properties from props
+  }
+
+  ret = (*h.cudaMemGetInfo)(&memInfo.free, &memInfo.total);
+  if (ret != CUDART_SUCCESS) {
+    snprintf(buf, buflen, "cudart device memory info lookup failure %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  resp->total = memInfo.total;
+  resp->free = memInfo.free;
+
+  LOG(h.verbose, "[%s] CUDA totalMem %lu\n", resp->gpu_id, resp->total);
+  LOG(h.verbose, "[%s] CUDA freeMem %lu\n", resp->gpu_id, resp->free);
+  LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
 }
 
 void cudart_release(cudart_handle_t h) {
...
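`cudart_check_vram` now reports one device per call and derives a stable ID from the device UUID, falling back to the zero-based ordinal when the runtime returns an all-zero UUID. The same formatting logic, rendered in Go purely for illustration:

```go
// Sketch of the gpu_id logic above: format a 16-byte device UUID as
// "GPU-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx", or fall back to the ordinal.
package main

import "fmt"

func gpuID(ordinal int, uuid [16]byte) string {
	allNull := true
	for _, b := range uuid {
		if b != 0 {
			allNull = false
			break
		}
	}
	if allNull {
		return fmt.Sprintf("%d", ordinal)
	}
	return fmt.Sprintf("GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
		uuid[0], uuid[1], uuid[2], uuid[3], uuid[4], uuid[5], uuid[6], uuid[7],
		uuid[8], uuid[9], uuid[10], uuid[11], uuid[12], uuid[13], uuid[14], uuid[15])
}

func main() {
	uuid := [16]byte{0xd1, 0x10, 0xa1, 0x05, 0xac, 0x29, 0x1d, 0x54, 0x7b, 0x49, 0x9c, 0x90, 0x44, 0x0f, 0x21, 0x5b}
	fmt.Println(gpuID(0, uuid))       // GPU-d110a105-ac29-1d54-7b49-9c90440f215b
	fmt.Println(gpuID(1, [16]byte{})) // 1
}
```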
#ifndef __APPLE__
#ifndef __GPU_INFO_NVCUDA_H__
#define __GPU_INFO_NVCUDA_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum cudaError_enum {
CUDA_SUCCESS = 0,
CUDA_ERROR_INVALID_VALUE = 1,
CUDA_ERROR_MEMORY_ALLOCATION = 2,
CUDA_ERROR_NOT_INITIALIZED = 3,
CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
// Other values omitted for now...
} CUresult;
typedef enum CUdevice_attribute_enum {
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
// TODO - not yet wired up but may be useful for Jetson or other
// integrated GPU scenarios with shared memory
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18
} CUdevice_attribute;
typedef void *nvcudaDevice_t; // Opaque is sufficient
typedef struct nvcudaMemory_st {
uint64_t total;
uint64_t free;
} nvcudaMemory_t;
typedef struct nvcudaDriverVersion {
int major;
int minor;
} nvcudaDriverVersion_t;
typedef struct CUuuid_st {
unsigned char bytes[16];
} CUuuid;
typedef int CUdevice;
typedef void* CUcontext;
typedef struct nvcuda_handle {
void *handle;
uint16_t verbose;
CUresult (*cuInit)(unsigned int Flags);
CUresult (*cuDriverGetVersion)(int *driverVersion);
CUresult (*cuDeviceGetCount)(int *);
CUresult (*cuDeviceGet)(CUdevice* device, int ordinal);
CUresult (*cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev);
CUresult (*cuDeviceGetUuid)(CUuuid* uuid, CUdevice dev); // signature compatible with cuDeviceGetUuid_v2
// Context specific aspects
CUresult (*cuCtxCreate_v3)(CUcontext* pctx, void *params, int len, unsigned int flags, CUdevice dev);
CUresult (*cuMemGetInfo_v2)(uint64_t* free, uint64_t* total);
CUresult (*cuCtxDestroy)(CUcontext ctx);
} nvcuda_handle_t;
typedef struct nvcuda_init_resp {
char *err; // If err is non-null handle is invalid
nvcuda_handle_t ch;
int num_devices;
} nvcuda_init_resp_t;
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
void nvcuda_check_vram(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
void nvcuda_release(nvcuda_handle_t ch);
#endif // __GPU_INFO_NVCUDA_H__
#endif // __APPLE__
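`cuDriverGetVersion`, like the cudart call in the hunk above, reports the version as a single integer (1000*major + 10*minor). A small sketch of the decoding, assuming the nvcuda code applies the same split when filling `nvcudaDriverVersion_t`:

```go
// Sketch: decode CUDA's packed driver version integer into major.minor.
package main

import "fmt"

func splitCudaVersion(version int) (major, minor int) {
	major = version / 1000
	minor = (version - major*1000) / 10
	return major, minor
}

func main() {
	major, minor := splitCudaVersion(12040)
	fmt.Printf("CUDA driver version: %d-%d\n", major, minor) // CUDA driver version: 12-4
}
```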
@@ -9,23 +9,16 @@ import (
 func TestBasicGetGPUInfo(t *testing.T) {
 	info := GetGPUInfo()
-	assert.Contains(t, "cuda rocm cpu metal", info.Library)
-	switch runtime.GOOS {
-	case "darwin":
-		// TODO - remove this once MacOS returns some size for CPU
-		return
-	case "linux", "windows":
-		assert.Greater(t, info.TotalMemory, uint64(0))
-		assert.Greater(t, info.FreeMemory, uint64(0))
-		assert.Greater(t, info.DeviceCount, uint32(0))
-	default:
-		return
+	assert.Greater(t, len(info), 0)
+	assert.Contains(t, "cuda rocm cpu metal", info[0].Library)
+	if info[0].Library != "cpu" {
+		assert.Greater(t, info[0].TotalMemory, uint64(0))
+		assert.Greater(t, info[0].FreeMemory, uint64(0))
 	}
 }
 
 func TestCPUMemInfo(t *testing.T) {
-	info, err := getCPUMem()
+	info, err := GetCPUMem()
 	assert.NoError(t, err)
 	switch runtime.GOOS {
 	case "darwin":
...
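With `GetGPUInfo` now returning a slice, callers can reason about each device individually rather than a single aggregate. A hypothetical example (not from this diff) of picking the device with the most free memory; `GpuInfo` is stubbed with only the fields the test above touches:

```go
// Sketch: choose the GPU with the most free memory from a []GpuInfo-style slice.
package main

import "fmt"

type GpuInfo struct {
	Library     string
	ID          string
	TotalMemory uint64
	FreeMemory  uint64
}

func mostFree(gpus []GpuInfo) (GpuInfo, bool) {
	if len(gpus) == 0 {
		return GpuInfo{}, false
	}
	best := gpus[0]
	for _, g := range gpus[1:] {
		if g.FreeMemory > best.FreeMemory {
			best = g
		}
	}
	return best, true
}

func main() {
	gpus := []GpuInfo{
		{Library: "cuda", ID: "GPU-d110a105-ac29-1d54-7b49-9c90440f215b", FreeMemory: 20 << 30},
		{Library: "cuda", ID: "1", FreeMemory: 6 << 30},
	}
	if g, ok := mostFree(gpus); ok {
		fmt.Printf("picked %s (%s) with %d bytes free\n", g.ID, g.Library, g.FreeMemory)
	}
}
```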
@@ -4,7 +4,6 @@ package integration
 import (
 	"context"
-	"net/http"
 	"testing"
 	"time"
@@ -25,5 +24,5 @@ func TestContextExhaustion(t *testing.T) {
 			"num_ctx": 128,
 		},
 	}
-	GenerateTestHelper(ctx, t, &http.Client{}, req, []string{"once", "upon", "lived"})
+	GenerateTestHelper(ctx, t, req, []string{"once", "upon", "lived"})
 }