amd_linux.go 14.2 KB
Newer Older
Daniel Hiltgen's avatar
Daniel Hiltgen committed
1
2
3
4
5
6
7
8
9
10
package gpu

import (
	"bufio"
	"errors"
	"fmt"
	"io"
	"log/slog"
	"os"
	"path/filepath"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
11
	"regexp"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
12
13
14
	"slices"
	"strconv"
	"strings"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
15
16

	"github.com/ollama/ollama/format"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
17
18
19
20
21
22
23
24
25
26
27
)

// Discovery logic for AMD/ROCm GPUs

const (
	// Sysfs locations published by the amdgpu driver: the driver version file,
	// the directory of topology nodes, and a glob matching each node's
	// properties file.
	DriverVersionFile     = "/sys/module/amdgpu/version"
	AMDNodesSysfsDir      = "/sys/class/kfd/kfd/topology/nodes/"
	GPUPropertiesFileGlob = AMDNodesSysfsDir + "*/properties"

	// Prefix with the node dir
	GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line

	// Direct Rendering Manager sysfs location
	DRMDeviceDirGlob   = "/sys/class/drm/card*/device"
	DRMTotalMemoryFile = "mem_info_vram_total"
	DRMUsedMemoryFile  = "mem_info_vram_used"

	// In hex; properties file is in decimal
	DRMUniqueIDFile = "unique_id"
	DRMVendorFile   = "vendor"
	DRMDeviceFile   = "device"
)

var (
	// Used to validate if the given ROCm lib is usable
	ROCmLibGlobs          = []string{"libhipblas.so.2*", "rocblas"} // TODO - probably include more coverage of files here...
	RocmStandardLocations = []string{"/opt/rocm/lib", "/usr/lib64"}
)

// Gather GPU information from the amdgpu driver if any supported GPUs are detected
47
48
func AMDGetGPUInfo() []RocmGPUInfo {
	resp := []RocmGPUInfo{}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
49
	if !AMDDetected() {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
50
		return resp
Daniel Hiltgen's avatar
Daniel Hiltgen committed
51
52
53
	}

	// Opportunistic logging of driver version to aid in troubleshooting
Daniel Hiltgen's avatar
Daniel Hiltgen committed
54
55
	driverMajor, driverMinor, err := AMDDriverVersion()
	if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
56
		// TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU
Daniel Hiltgen's avatar
Daniel Hiltgen committed
57
		slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
58
59
	}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
	// Determine if the user has already pre-selected which GPUs to look at, then ignore the others
	var visibleDevices []string
	hipVD := os.Getenv("HIP_VISIBLE_DEVICES")   // zero based index only
	rocrVD := os.Getenv("ROCR_VISIBLE_DEVICES") // zero based index or UUID, but consumer cards seem to not support UUID
	gpuDO := os.Getenv("GPU_DEVICE_ORDINAL")    // zero based index
	switch {
	// TODO is this priorty order right?
	case hipVD != "":
		visibleDevices = strings.Split(hipVD, ",")
	case rocrVD != "":
		visibleDevices = strings.Split(rocrVD, ",")
		// TODO - since we don't yet support UUIDs, consider detecting and reporting here
		// all our test systems show GPU-XX indicating UUID is not supported
	case gpuDO != "":
		visibleDevices = strings.Split(gpuDO, ",")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
75
76
	}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
77
78
79
80
81
82
83
84
85
86
87
88
89
	gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
	var supported []string
	libDir := ""

	// The amdgpu driver always exposes the host CPU(s) first, but we have to skip them and subtract
	// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
	matches, _ := filepath.Glob(GPUPropertiesFileGlob)
	cpuCount := 0
	for _, match := range matches {
		slog.Debug("evaluating amdgpu node " + match)
		fp, err := os.Open(match)
		if err != nil {
			slog.Debug("failed to open sysfs node", "file", match, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
90
91
			continue
		}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
92
93
94
95
96
		defer fp.Close()
		nodeID, err := strconv.Atoi(filepath.Base(filepath.Dir(match)))
		if err != nil {
			slog.Debug("failed to parse node ID", "error", err)
			continue
Daniel Hiltgen's avatar
Daniel Hiltgen committed
97
98
		}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
99
100
101
		scanner := bufio.NewScanner(fp)
		isCPU := false
		var major, minor, patch uint64
102
		var vendor, device, uniqueID uint64
Daniel Hiltgen's avatar
Daniel Hiltgen committed
103
104
105
106
107
		for scanner.Scan() {
			line := strings.TrimSpace(scanner.Text())
			// Note: we could also use "cpu_cores_count X" where X is greater than zero to detect CPUs
			if strings.HasPrefix(line, "gfx_target_version") {
				ver := strings.Fields(line)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
108

Daniel Hiltgen's avatar
Daniel Hiltgen committed
109
110
111
112
113
114
				// Detect CPUs
				if len(ver) == 2 && ver[1] == "0" {
					slog.Debug("detected CPU " + match)
					isCPU = true
					break
				}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
115

Daniel Hiltgen's avatar
Daniel Hiltgen committed
116
117
118
119
120
121
122
123
124
125
126
127
128
129
				if len(ver) != 2 || len(ver[1]) < 5 {
					slog.Warn("malformed "+match, "gfx_target_version", line)
					// If this winds up being a CPU, our offsets may be wrong
					continue
				}
				l := len(ver[1])
				var err1, err2, err3 error
				patch, err1 = strconv.ParseUint(ver[1][l-2:l], 10, 32)
				minor, err2 = strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
				major, err3 = strconv.ParseUint(ver[1][:l-4], 10, 32)
				if err1 != nil || err2 != nil || err3 != nil {
					slog.Debug("malformed int " + line)
					continue
				}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
130
131
132
			} else if strings.HasPrefix(line, "vendor_id") {
				ver := strings.Fields(line)
				if len(ver) != 2 {
133
					slog.Debug("malformed", "vendor_id", line)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
134
135
					continue
				}
136
				vendor, err = strconv.ParseUint(ver[1], 10, 64)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
137
				if err != nil {
138
					slog.Debug("malformed", "vendor_id", line, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
139
140
141
142
				}
			} else if strings.HasPrefix(line, "device_id") {
				ver := strings.Fields(line)
				if len(ver) != 2 {
143
					slog.Debug("malformed", "device_id", line)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
144
145
					continue
				}
146
				device, err = strconv.ParseUint(ver[1], 10, 64)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
147
				if err != nil {
148
149
150
151
152
153
154
155
156
157
158
					slog.Debug("malformed", "device_id", line, "error", err)
				}
			} else if strings.HasPrefix(line, "unique_id") {
				ver := strings.Fields(line)
				if len(ver) != 2 {
					slog.Debug("malformed", "unique_id", line)
					continue
				}
				uniqueID, err = strconv.ParseUint(ver[1], 10, 64)
				if err != nil {
					slog.Debug("malformed", "unique_id", line, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
159
				}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
160
161
162
163
			}
			// TODO - any other properties we want to extract and record?
			// vendor_id + device_id -> pci lookup for "Name"
			// Other metrics that may help us understand relative performance between multiple GPUs
Daniel Hiltgen's avatar
Daniel Hiltgen committed
164
165
		}

166
167
168
169
		// Note: while ./mem_banks/*/used_memory exists, it doesn't appear to take other VRAM consumers
		// into consideration, so we instead map the device over to the DRM driver sysfs nodes which
		// do reliably report VRAM usage.

Daniel Hiltgen's avatar
Daniel Hiltgen committed
170
171
172
		if isCPU {
			cpuCount++
			continue
Daniel Hiltgen's avatar
Daniel Hiltgen committed
173
174
		}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
175
176
		// CPUs are always first in the list
		gpuID := nodeID - cpuCount
Daniel Hiltgen's avatar
Daniel Hiltgen committed
177

Daniel Hiltgen's avatar
Daniel Hiltgen committed
178
179
180
		// Shouldn't happen, but just in case...
		if gpuID < 0 {
			slog.Error("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
181
			return []RocmGPUInfo{}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
182
183
		}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
184
		if int(major) < RocmComputeMin {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
185
			slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch), "gpu", gpuID)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
186
187
			continue
		}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
188
189

		// Look up the memory for the current node
Daniel Hiltgen's avatar
Daniel Hiltgen committed
190
191
		totalMemory := uint64(0)
		usedMemory := uint64(0)
192
		var usedFile string
193
194
195
196
197
198
199
		mapping := []struct {
			id       uint64
			filename string
		}{
			{vendor, DRMVendorFile},
			{device, DRMDeviceFile},
			{uniqueID, DRMUniqueIDFile}, // Not all devices will report this
Daniel Hiltgen's avatar
Daniel Hiltgen committed
200
		}
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
		slog.Debug("mapping amdgpu to drm sysfs nodes", "amdgpu", match, "vendor", vendor, "device", device, "unique_id", uniqueID)
		// Map over to DRM location to find the total/free memory
		drmMatches, _ := filepath.Glob(DRMDeviceDirGlob)
		for _, devDir := range drmMatches {
			matched := true
			for _, m := range mapping {
				if m.id == 0 {
					continue
				}
				filename := filepath.Join(devDir, m.filename)
				fp, err := os.Open(filename)
				if err != nil {
					slog.Debug("failed to open sysfs node", "file", filename, "error", err)
					matched = false
					break
				}
				defer fp.Close()
				buf, err := io.ReadAll(fp)
				if err != nil {
					slog.Debug("failed to read sysfs node", "file", filename, "error", err)
					matched = false
					break
				}
				cmp, err := strconv.ParseUint(strings.TrimPrefix(strings.TrimSpace(string(buf)), "0x"), 16, 64)
				if err != nil {
					slog.Debug("failed to parse sysfs node", "file", filename, "error", err)
					matched = false
					break
				}
				if cmp != m.id {
					matched = false
					break
				}
			}
			if !matched {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
236
237
				continue
			}
238
239
240
241
242
243
244
245

			// Found the matching DRM directory
			slog.Debug("matched", "amdgpu", match, "drm", devDir)
			totalFile := filepath.Join(devDir, DRMTotalMemoryFile)
			totalFp, err := os.Open(totalFile)
			if err != nil {
				slog.Debug("failed to open sysfs node", "file", totalFile, "error", err)
				break
Daniel Hiltgen's avatar
Daniel Hiltgen committed
246
			}
247
248
			defer totalFp.Close()
			buf, err := io.ReadAll(totalFp)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
249
			if err != nil {
250
251
				slog.Debug("failed to read sysfs node", "file", totalFile, "error", err)
				break
Daniel Hiltgen's avatar
Daniel Hiltgen committed
252
			}
253
			totalMemory, err = strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
254
			if err != nil {
255
256
257
258
				slog.Debug("failed to parse sysfs node", "file", totalFile, "error", err)
				break
			}

259
260
			usedFile = filepath.Join(devDir, DRMUsedMemoryFile)
			usedMemory, err = getFreeMemory(usedFile)
261
			if err != nil {
262
				slog.Debug("failed to update used memory", "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
263
			}
264
			break
Daniel Hiltgen's avatar
Daniel Hiltgen committed
265
		}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
266
267
268

		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
		if totalMemory < IGPUMemLimit {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
269
			slog.Info("unsupported Radeon iGPU detected skipping", "id", gpuID, "total", format.HumanBytes2(totalMemory))
Daniel Hiltgen's avatar
Daniel Hiltgen committed
270
271
			continue
		}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
272
273
274
275
276
		var name string
		// TODO - PCI ID lookup
		if vendor > 0 && device > 0 {
			name = fmt.Sprintf("%04x:%04x", vendor, device)
		}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
277

Daniel Hiltgen's avatar
Daniel Hiltgen committed
278
279
		slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
		slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
280
281
282
283
284
285
286
287
288
289
290
291
292
		gpuInfo := RocmGPUInfo{
			GpuInfo: GpuInfo{
				Library: "rocm",
				memInfo: memInfo{
					TotalMemory: totalMemory,
					FreeMemory:  (totalMemory - usedMemory),
				},
				ID:            fmt.Sprintf("%d", gpuID),
				Name:          name,
				Compute:       fmt.Sprintf("gfx%d%x%x", major, minor, patch),
				MinimumMemory: rocmMinimumMemory,
				DriverMajor:   driverMajor,
				DriverMinor:   driverMinor,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
293
			},
294
			usedFilepath: usedFile,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
		}

		// If the user wants to filter to a subset of devices, filter out if we aren't a match
		if len(visibleDevices) > 0 {
			include := false
			for _, visible := range visibleDevices {
				if visible == gpuInfo.ID {
					include = true
					break
				}
			}
			if !include {
				slog.Info("filtering out device per user request", "id", gpuInfo.ID, "visible_devices", visibleDevices)
				continue
			}
		}

		// Final validation is gfx compatibility - load the library if we haven't already loaded it
		// even if the user overrides, we still need to validate the library
		if libDir == "" {
			libDir, err = AMDValidateLibDir()
			if err != nil {
				slog.Warn("unable to verify rocm library, will use cpu", "error", err)
318
				return []RocmGPUInfo{}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
319
320
321
322
323
324
325
326
327
328
			}
		}
		gpuInfo.DependencyPath = libDir

		if gfxOverride == "" {
			// Only load supported list once
			if len(supported) == 0 {
				supported, err = GetSupportedGFX(libDir)
				if err != nil {
					slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
329
					return []RocmGPUInfo{}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
330
331
332
				}
				slog.Debug("rocm supported GPUs", "types", supported)
			}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
333
			gfx := gpuInfo.Compute
Daniel Hiltgen's avatar
Daniel Hiltgen committed
334
335
336
337
338
339
340
341
342
			if !slices.Contains[[]string, string](supported, gfx) {
				slog.Warn("amdgpu is not supported", "gpu", gpuInfo.ID, "gpu_type", gfx, "library", libDir, "supported_types", supported)
				// TODO - consider discrete markdown just for ROCM troubleshooting?
				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/gpu.md#overrides for HSA_OVERRIDE_GFX_VERSION usage")
				continue
			} else {
				slog.Info("amdgpu is supported", "gpu", gpuInfo.ID, "gpu_type", gfx)
			}
		} else {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
343
			slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
344
345
346
347
		}

		// The GPU has passed all the verification steps and is supported
		resp = append(resp, gpuInfo)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
348
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
349
350
	if len(resp) == 0 {
		slog.Info("no compatible amdgpu devices detected")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
351
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
352
	return resp
Daniel Hiltgen's avatar
Daniel Hiltgen committed
353
354
355
356
357
358
359
360
361
362
363
}

// Quick check for AMD driver so we can skip amdgpu discovery if not present
func AMDDetected() bool {
	// Some driver versions (older?) don't have a version file, so just lookup the parent dir
	sysfsDir := filepath.Dir(DriverVersionFile)
	_, err := os.Stat(sysfsDir)
	if errors.Is(err, os.ErrNotExist) {
		slog.Debug("amdgpu driver not detected " + sysfsDir)
		return false
	} else if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
364
		slog.Debug("error looking up amd driver", "path", sysfsDir, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
365
366
367
368
369
370
371
372
		return false
	}
	return true
}

// Prefer to use host installed ROCm, as long as it meets our minimum requirements
// failing that, tell the user how to download it on their own
func AMDValidateLibDir() (string, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
373
	libDir, err := commonAMDValidateLibDir()
374
	if err == nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
375
		return libDir, nil
376
377
	}

378
379
380
	// Well known ollama installer path
	installedRocmDir := "/usr/share/ollama/lib/rocm"
	if rocmLibUsable(installedRocmDir) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
381
		return installedRocmDir, nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
382
383
	}

384
385
	// If we still haven't found a usable rocm, the user will have to install it on their own
	slog.Warn("amdgpu detected, but no compatible rocm library found.  Either install rocm v6, or follow manual install instructions at https://github.com/ollama/ollama/blob/main/docs/linux.md#manual-install")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
386
387
388
	return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
389
390
func AMDDriverVersion() (driverMajor, driverMinor int, err error) {
	_, err = os.Stat(DriverVersionFile)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
391
	if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
392
		return 0, 0, fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
393
394
395
	}
	fp, err := os.Open(DriverVersionFile)
	if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
396
		return 0, 0, err
Daniel Hiltgen's avatar
Daniel Hiltgen committed
397
398
399
400
	}
	defer fp.Close()
	verString, err := io.ReadAll(fp)
	if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
		return 0, 0, err
	}

	pattern := `\A(\d+)\.(\d+).*`
	regex := regexp.MustCompile(pattern)
	match := regex.FindStringSubmatch(string(verString))
	if len(match) < 2 {
		return 0, 0, fmt.Errorf("malformed version string %s", string(verString))
	}
	driverMajor, err = strconv.Atoi(match[1])
	if err != nil {
		return 0, 0, err
	}
	driverMinor, err = strconv.Atoi(match[2])
	if err != nil {
		return 0, 0, err
Daniel Hiltgen's avatar
Daniel Hiltgen committed
417
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
418
	return driverMajor, driverMinor, nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
419
}
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452

// RefreshFreeMemory re-reads the used-memory sysfs file recorded for each GPU
// in the list and sets its FreeMemory to TotalMemory minus the value read.
// It stops at, and returns, the first error from getFreeMemory; entries
// processed before that point keep their refreshed value. An empty list is a
// no-op.
func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
	for idx := range gpus {
		used, err := getFreeMemory(gpus[idx].usedFilepath)
		if err != nil {
			return err
		}
		free := gpus[idx].TotalMemory - used
		slog.Debug("updating rocm free memory", "gpu", gpus[idx].ID, "name", gpus[idx].Name, "before", format.HumanBytes2(gpus[idx].FreeMemory), "now", format.HumanBytes2(free))
		gpus[idx].FreeMemory = free
	}
	return nil
}

// getFreeMemory reads the sysfs file at usedFile, trims surrounding
// whitespace, and parses the contents as a base-10 unsigned integer.
//
// Note that, despite the name, the number returned is whatever the file
// holds: callers in this file pass the DRM "used" VRAM file and subtract the
// result from total memory to derive the free amount. On any failure the
// result is 0 and the error wraps the underlying cause.
func getFreeMemory(usedFile string) (uint64, error) {
	f, err := os.Open(usedFile)
	if err != nil {
		return 0, fmt.Errorf("failed to open sysfs node %s %w", usedFile, err)
	}
	defer f.Close()

	raw, err := io.ReadAll(f)
	if err != nil {
		return 0, fmt.Errorf("failed to read sysfs node %s %w", usedFile, err)
	}

	contents := strings.TrimSpace(string(raw))
	used, parseErr := strconv.ParseUint(contents, 10, 64)
	if parseErr == nil {
		return used, nil
	}
	slog.Debug("failed to parse sysfs node", "file", usedFile, "error", parseErr)
	return 0, fmt.Errorf("failed to parse sysfs node %s %w", usedFile, parseErr)
}