amd_linux.go 14.5 KB
Newer Older
Daniel Hiltgen's avatar
Daniel Hiltgen committed
1
2
3
4
5
6
7
8
9
10
package gpu

import (
	"bufio"
	"errors"
	"fmt"
	"io"
	"log/slog"
	"os"
	"path/filepath"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
11
	"regexp"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
12
	"slices"
13
	"sort"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
14
15
	"strconv"
	"strings"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
16

17
	"github.com/ollama/ollama/envconfig"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
18
	"github.com/ollama/ollama/format"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
19
20
21
22
23
24
25
26
27
28
29
)

// Discovery logic for AMD/ROCm GPUs

const (
	// DriverVersionFile is the sysfs file read by AMDDriverVersion to obtain
	// the amdgpu kernel driver version. AMDDetected only checks that its
	// parent directory exists.
	DriverVersionFile = "/sys/module/amdgpu/version"
	// AMDNodesSysfsDir is the sysfs directory containing one numbered
	// sub-directory per amdgpu topology node.
	AMDNodesSysfsDir = "/sys/class/kfd/kfd/topology/nodes/"
	// GPUPropertiesFileGlob matches the properties file of every node under
	// AMDNodesSysfsDir.
	GPUPropertiesFileGlob = AMDNodesSysfsDir + "*/properties"

	// GPUTotalMemoryFileGlob is relative; prefix with the node dir.
	GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line

	// DRMDeviceDirGlob matches the Direct Rendering Manager sysfs device
	// directories.
	DRMDeviceDirGlob = "/sys/class/drm/card*/device"
	// DRMTotalMemoryFile is the file, inside a DRM device directory, read as
	// the decimal total VRAM figure.
	DRMTotalMemoryFile = "mem_info_vram_total"
	// DRMUsedMemoryFile is the file, inside a DRM device directory, read as
	// the decimal used VRAM figure.
	DRMUsedMemoryFile = "mem_info_vram_used"

	// DRMUniqueIDFile holds the device unique ID. In hex; the amdgpu
	// properties file is in decimal.
	DRMUniqueIDFile = "unique_id"
	// DRMVendorFile holds the vendor ID in hex; it is compared with the
	// vendor_id line of the amdgpu properties file.
	DRMVendorFile = "vendor"
	// DRMDeviceFile holds the device ID in hex; it is compared with the
	// device_id line of the amdgpu properties file.
	DRMDeviceFile = "device"
)

var (
	// ROCmLibGlobs lists the glob patterns used to validate if the given ROCm
	// lib is usable.
	ROCmLibGlobs = []string{"libhipblas.so.2*", "rocblas"} // TODO - probably include more coverage of files here...
	// RocmStandardLocations lists host library directories.
	// NOTE(review): presumably searched for ROCm by code outside this file - confirm.
	RocmStandardLocations = []string{"/opt/rocm/lib", "/usr/lib64"}
)

// Gather GPU information from the amdgpu driver if any supported GPUs are detected
49
50
func AMDGetGPUInfo() []RocmGPUInfo {
	resp := []RocmGPUInfo{}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
51
	if !AMDDetected() {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
52
		return resp
Daniel Hiltgen's avatar
Daniel Hiltgen committed
53
54
55
	}

	// Opportunistic logging of driver version to aid in troubleshooting
Daniel Hiltgen's avatar
Daniel Hiltgen committed
56
57
	driverMajor, driverMinor, err := AMDDriverVersion()
	if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
58
		// TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU
Daniel Hiltgen's avatar
Daniel Hiltgen committed
59
		slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
60
61
	}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
62
63
	// Determine if the user has already pre-selected which GPUs to look at, then ignore the others
	var visibleDevices []string
Michael Yang's avatar
string  
Michael Yang committed
64
65
66
	hipVD := envconfig.HipVisibleDevices()   // zero based index only
	rocrVD := envconfig.RocrVisibleDevices() // zero based index or UUID, but consumer cards seem to not support UUID
	gpuDO := envconfig.GpuDeviceOrdinal()    // zero based index
Daniel Hiltgen's avatar
Daniel Hiltgen committed
67
68
69
70
71
72
73
74
75
76
	switch {
	// TODO is this priorty order right?
	case hipVD != "":
		visibleDevices = strings.Split(hipVD, ",")
	case rocrVD != "":
		visibleDevices = strings.Split(rocrVD, ",")
		// TODO - since we don't yet support UUIDs, consider detecting and reporting here
		// all our test systems show GPU-XX indicating UUID is not supported
	case gpuDO != "":
		visibleDevices = strings.Split(gpuDO, ",")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
77
78
	}

Michael Yang's avatar
string  
Michael Yang committed
79
	gfxOverride := envconfig.HsaOverrideGfxVersion()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
80
81
82
83
84
85
	var supported []string
	libDir := ""

	// The amdgpu driver always exposes the host CPU(s) first, but we have to skip them and subtract
	// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
	matches, _ := filepath.Glob(GPUPropertiesFileGlob)
86
87
88
89
90
91
92
93
94
95
96
97
98
99
	sort.Slice(matches, func(i, j int) bool {
		// /sys/class/kfd/kfd/topology/nodes/<number>/properties
		a, err := strconv.ParseInt(filepath.Base(filepath.Dir(matches[i])), 10, 64)
		if err != nil {
			slog.Debug("parse err", "error", err, "match", matches[i])
			return false
		}
		b, err := strconv.ParseInt(filepath.Base(filepath.Dir(matches[j])), 10, 64)
		if err != nil {
			slog.Debug("parse err", "error", err, "match", matches[i])
			return false
		}
		return a < b
	})
Daniel Hiltgen's avatar
Daniel Hiltgen committed
100
101
102
103
104
105
	cpuCount := 0
	for _, match := range matches {
		slog.Debug("evaluating amdgpu node " + match)
		fp, err := os.Open(match)
		if err != nil {
			slog.Debug("failed to open sysfs node", "file", match, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
106
107
			continue
		}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
108
109
110
111
112
		defer fp.Close()
		nodeID, err := strconv.Atoi(filepath.Base(filepath.Dir(match)))
		if err != nil {
			slog.Debug("failed to parse node ID", "error", err)
			continue
Daniel Hiltgen's avatar
Daniel Hiltgen committed
113
114
		}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
115
116
117
		scanner := bufio.NewScanner(fp)
		isCPU := false
		var major, minor, patch uint64
118
		var vendor, device, uniqueID uint64
Daniel Hiltgen's avatar
Daniel Hiltgen committed
119
120
121
122
123
		for scanner.Scan() {
			line := strings.TrimSpace(scanner.Text())
			// Note: we could also use "cpu_cores_count X" where X is greater than zero to detect CPUs
			if strings.HasPrefix(line, "gfx_target_version") {
				ver := strings.Fields(line)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
124

Daniel Hiltgen's avatar
Daniel Hiltgen committed
125
126
127
128
129
130
				// Detect CPUs
				if len(ver) == 2 && ver[1] == "0" {
					slog.Debug("detected CPU " + match)
					isCPU = true
					break
				}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
131

Daniel Hiltgen's avatar
Daniel Hiltgen committed
132
133
134
135
136
137
138
139
140
141
142
143
144
145
				if len(ver) != 2 || len(ver[1]) < 5 {
					slog.Warn("malformed "+match, "gfx_target_version", line)
					// If this winds up being a CPU, our offsets may be wrong
					continue
				}
				l := len(ver[1])
				var err1, err2, err3 error
				patch, err1 = strconv.ParseUint(ver[1][l-2:l], 10, 32)
				minor, err2 = strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
				major, err3 = strconv.ParseUint(ver[1][:l-4], 10, 32)
				if err1 != nil || err2 != nil || err3 != nil {
					slog.Debug("malformed int " + line)
					continue
				}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
146
147
148
			} else if strings.HasPrefix(line, "vendor_id") {
				ver := strings.Fields(line)
				if len(ver) != 2 {
149
					slog.Debug("malformed", "vendor_id", line)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
150
151
					continue
				}
152
				vendor, err = strconv.ParseUint(ver[1], 10, 64)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
153
				if err != nil {
154
					slog.Debug("malformed", "vendor_id", line, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
155
156
157
158
				}
			} else if strings.HasPrefix(line, "device_id") {
				ver := strings.Fields(line)
				if len(ver) != 2 {
159
					slog.Debug("malformed", "device_id", line)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
160
161
					continue
				}
162
				device, err = strconv.ParseUint(ver[1], 10, 64)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
163
				if err != nil {
164
165
166
167
168
169
170
171
172
173
174
					slog.Debug("malformed", "device_id", line, "error", err)
				}
			} else if strings.HasPrefix(line, "unique_id") {
				ver := strings.Fields(line)
				if len(ver) != 2 {
					slog.Debug("malformed", "unique_id", line)
					continue
				}
				uniqueID, err = strconv.ParseUint(ver[1], 10, 64)
				if err != nil {
					slog.Debug("malformed", "unique_id", line, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
175
				}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
176
177
178
179
			}
			// TODO - any other properties we want to extract and record?
			// vendor_id + device_id -> pci lookup for "Name"
			// Other metrics that may help us understand relative performance between multiple GPUs
Daniel Hiltgen's avatar
Daniel Hiltgen committed
180
181
		}

182
183
184
185
		// Note: while ./mem_banks/*/used_memory exists, it doesn't appear to take other VRAM consumers
		// into consideration, so we instead map the device over to the DRM driver sysfs nodes which
		// do reliably report VRAM usage.

Daniel Hiltgen's avatar
Daniel Hiltgen committed
186
187
188
		if isCPU {
			cpuCount++
			continue
Daniel Hiltgen's avatar
Daniel Hiltgen committed
189
190
		}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
191
192
		// CPUs are always first in the list
		gpuID := nodeID - cpuCount
Daniel Hiltgen's avatar
Daniel Hiltgen committed
193

Daniel Hiltgen's avatar
Daniel Hiltgen committed
194
195
196
		// Shouldn't happen, but just in case...
		if gpuID < 0 {
			slog.Error("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
197
			return nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
198
199
		}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
200
		if int(major) < RocmComputeMin {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
201
			slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch), "gpu", gpuID)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
202
203
			continue
		}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
204
205

		// Look up the memory for the current node
Daniel Hiltgen's avatar
Daniel Hiltgen committed
206
207
		totalMemory := uint64(0)
		usedMemory := uint64(0)
208
		var usedFile string
209
210
211
212
213
214
215
		mapping := []struct {
			id       uint64
			filename string
		}{
			{vendor, DRMVendorFile},
			{device, DRMDeviceFile},
			{uniqueID, DRMUniqueIDFile}, // Not all devices will report this
Daniel Hiltgen's avatar
Daniel Hiltgen committed
216
		}
217
218
219
220
221
222
223
		slog.Debug("mapping amdgpu to drm sysfs nodes", "amdgpu", match, "vendor", vendor, "device", device, "unique_id", uniqueID)
		// Map over to DRM location to find the total/free memory
		drmMatches, _ := filepath.Glob(DRMDeviceDirGlob)
		for _, devDir := range drmMatches {
			matched := true
			for _, m := range mapping {
				if m.id == 0 {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
224
					// Null ID means it didn't populate, so we can't use it to match
225
226
227
					continue
				}
				filename := filepath.Join(devDir, m.filename)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
228
				buf, err := os.ReadFile(filename)
229
230
231
232
233
				if err != nil {
					slog.Debug("failed to read sysfs node", "file", filename, "error", err)
					matched = false
					break
				}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
234
				// values here are in hex, strip off the lead 0x and parse so we can compare the numeric (decimal) values in amdgpu
235
236
237
238
239
240
241
242
243
244
245
246
				cmp, err := strconv.ParseUint(strings.TrimPrefix(strings.TrimSpace(string(buf)), "0x"), 16, 64)
				if err != nil {
					slog.Debug("failed to parse sysfs node", "file", filename, "error", err)
					matched = false
					break
				}
				if cmp != m.id {
					matched = false
					break
				}
			}
			if !matched {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
247
248
				continue
			}
249
250
251
252

			// Found the matching DRM directory
			slog.Debug("matched", "amdgpu", match, "drm", devDir)
			totalFile := filepath.Join(devDir, DRMTotalMemoryFile)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
253
			buf, err := os.ReadFile(totalFile)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
254
			if err != nil {
255
256
				slog.Debug("failed to read sysfs node", "file", totalFile, "error", err)
				break
Daniel Hiltgen's avatar
Daniel Hiltgen committed
257
			}
258
			totalMemory, err = strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
259
			if err != nil {
260
261
262
263
				slog.Debug("failed to parse sysfs node", "file", totalFile, "error", err)
				break
			}

264
265
			usedFile = filepath.Join(devDir, DRMUsedMemoryFile)
			usedMemory, err = getFreeMemory(usedFile)
266
			if err != nil {
267
				slog.Debug("failed to update used memory", "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
268
			}
269
			break
Daniel Hiltgen's avatar
Daniel Hiltgen committed
270
		}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
271
272
273

		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
		if totalMemory < IGPUMemLimit {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
274
			slog.Info("unsupported Radeon iGPU detected skipping", "id", gpuID, "total", format.HumanBytes2(totalMemory))
Daniel Hiltgen's avatar
Daniel Hiltgen committed
275
276
			continue
		}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
277
278
279
280
281
		var name string
		// TODO - PCI ID lookup
		if vendor > 0 && device > 0 {
			name = fmt.Sprintf("%04x:%04x", vendor, device)
		}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
282

Daniel Hiltgen's avatar
Daniel Hiltgen committed
283
284
		slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
		slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
285
286
287
288
289
290
291
		gpuInfo := RocmGPUInfo{
			GpuInfo: GpuInfo{
				Library: "rocm",
				memInfo: memInfo{
					TotalMemory: totalMemory,
					FreeMemory:  (totalMemory - usedMemory),
				},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
292
				ID:            strconv.Itoa(gpuID),
293
294
295
296
297
				Name:          name,
				Compute:       fmt.Sprintf("gfx%d%x%x", major, minor, patch),
				MinimumMemory: rocmMinimumMemory,
				DriverMajor:   driverMajor,
				DriverMinor:   driverMinor,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
298
			},
299
			usedFilepath: usedFile,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
		}

		// If the user wants to filter to a subset of devices, filter out if we aren't a match
		if len(visibleDevices) > 0 {
			include := false
			for _, visible := range visibleDevices {
				if visible == gpuInfo.ID {
					include = true
					break
				}
			}
			if !include {
				slog.Info("filtering out device per user request", "id", gpuInfo.ID, "visible_devices", visibleDevices)
				continue
			}
		}

		// Final validation is gfx compatibility - load the library if we haven't already loaded it
		// even if the user overrides, we still need to validate the library
		if libDir == "" {
			libDir, err = AMDValidateLibDir()
			if err != nil {
				slog.Warn("unable to verify rocm library, will use cpu", "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
323
				return nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
324
325
326
327
328
329
330
331
332
333
			}
		}
		gpuInfo.DependencyPath = libDir

		if gfxOverride == "" {
			// Only load supported list once
			if len(supported) == 0 {
				supported, err = GetSupportedGFX(libDir)
				if err != nil {
					slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
334
					return nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
335
336
337
				}
				slog.Debug("rocm supported GPUs", "types", supported)
			}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
338
			gfx := gpuInfo.Compute
Daniel Hiltgen's avatar
Daniel Hiltgen committed
339
340
341
342
343
344
345
346
347
			if !slices.Contains[[]string, string](supported, gfx) {
				slog.Warn("amdgpu is not supported", "gpu", gpuInfo.ID, "gpu_type", gfx, "library", libDir, "supported_types", supported)
				// TODO - consider discrete markdown just for ROCM troubleshooting?
				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/gpu.md#overrides for HSA_OVERRIDE_GFX_VERSION usage")
				continue
			} else {
				slog.Info("amdgpu is supported", "gpu", gpuInfo.ID, "gpu_type", gfx)
			}
		} else {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
348
			slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
349
350
		}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
351
352
353
354
355
		// Check for env var workarounds
		if name == "1002:687f" { // Vega RX 56
			gpuInfo.EnvWorkarounds = append(gpuInfo.EnvWorkarounds, [2]string{"HSA_ENABLE_SDMA", "0"})
		}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
356
357
		// The GPU has passed all the verification steps and is supported
		resp = append(resp, gpuInfo)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
358
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
359
360
	if len(resp) == 0 {
		slog.Info("no compatible amdgpu devices detected")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
361
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
362
	return resp
Daniel Hiltgen's avatar
Daniel Hiltgen committed
363
364
365
366
367
368
369
370
371
372
373
}

// Quick check for AMD driver so we can skip amdgpu discovery if not present
func AMDDetected() bool {
	// Some driver versions (older?) don't have a version file, so just lookup the parent dir
	sysfsDir := filepath.Dir(DriverVersionFile)
	_, err := os.Stat(sysfsDir)
	if errors.Is(err, os.ErrNotExist) {
		slog.Debug("amdgpu driver not detected " + sysfsDir)
		return false
	} else if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
374
		slog.Debug("error looking up amd driver", "path", sysfsDir, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
375
376
377
378
379
380
381
382
		return false
	}
	return true
}

// Prefer to use host installed ROCm, as long as it meets our minimum requirements
// failing that, tell the user how to download it on their own
func AMDValidateLibDir() (string, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
383
	libDir, err := commonAMDValidateLibDir()
384
	if err == nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
385
		return libDir, nil
386
387
	}

388
389
390
	// Well known ollama installer path
	installedRocmDir := "/usr/share/ollama/lib/rocm"
	if rocmLibUsable(installedRocmDir) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
391
		return installedRocmDir, nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
392
393
	}

394
395
	// If we still haven't found a usable rocm, the user will have to install it on their own
	slog.Warn("amdgpu detected, but no compatible rocm library found.  Either install rocm v6, or follow manual install instructions at https://github.com/ollama/ollama/blob/main/docs/linux.md#manual-install")
Michael Yang's avatar
lint  
Michael Yang committed
396
	return "", errors.New("no suitable rocm found, falling back to CPU")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
397
398
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
399
400
func AMDDriverVersion() (driverMajor, driverMinor int, err error) {
	_, err = os.Stat(DriverVersionFile)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
401
	if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
402
		return 0, 0, fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
403
404
405
	}
	fp, err := os.Open(DriverVersionFile)
	if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
406
		return 0, 0, err
Daniel Hiltgen's avatar
Daniel Hiltgen committed
407
408
409
410
	}
	defer fp.Close()
	verString, err := io.ReadAll(fp)
	if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
		return 0, 0, err
	}

	pattern := `\A(\d+)\.(\d+).*`
	regex := regexp.MustCompile(pattern)
	match := regex.FindStringSubmatch(string(verString))
	if len(match) < 2 {
		return 0, 0, fmt.Errorf("malformed version string %s", string(verString))
	}
	driverMajor, err = strconv.Atoi(match[1])
	if err != nil {
		return 0, 0, err
	}
	driverMinor, err = strconv.Atoi(match[2])
	if err != nil {
		return 0, 0, err
Daniel Hiltgen's avatar
Daniel Hiltgen committed
427
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
428
	return driverMajor, driverMinor, nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
429
}
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446

// RefreshFreeMemory re-reads the used-memory sysfs file recorded for each GPU
// in the list and updates that entry's FreeMemory to TotalMemory minus the
// freshly read value. It stops at, and returns, the first read or parse
// error; entries before the failure keep their updated values.
func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
	for idx := range gpus {
		// Despite its name, getFreeMemory returns the value read from the used-memory file.
		used, err := getFreeMemory(gpus[idx].usedFilepath)
		if err != nil {
			return err
		}
		free := gpus[idx].TotalMemory - used
		slog.Debug("updating rocm free memory", "gpu", gpus[idx].ID, "name", gpus[idx].Name, "before", format.HumanBytes2(gpus[idx].FreeMemory), "now", format.HumanBytes2(free))
		gpus[idx].FreeMemory = free
	}
	return nil
}

// getFreeMemory reads the sysfs node at usedFile and returns its contents
// parsed as an unsigned base-10 integer, with surrounding whitespace ignored.
//
// Despite the name, the value returned is whatever the file holds; callers in
// this file pass the DRM used-memory file and subtract the result from the
// total to obtain free memory.
//
// On a read or parse failure it returns 0 and an error wrapping the
// underlying cause. The error is not logged here; that is left to the caller.
func getFreeMemory(usedFile string) (uint64, error) {
	buf, err := os.ReadFile(usedFile)
	if err != nil {
		return 0, fmt.Errorf("failed to read sysfs node %s %w", usedFile, err)
	}
	usedMemory, err := strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
	if err != nil {
		return 0, fmt.Errorf("failed to parse sysfs node %s %w", usedFile, err)
	}
	return usedMemory, nil
}