amd_windows.go 6.67 KB
Newer Older
1
package discover
Daniel Hiltgen's avatar
Daniel Hiltgen committed
2
3
4

import (
	"bytes"
Michael Yang's avatar
lint  
Michael Yang committed
5
	"errors"
6
	"fmt"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
7
8
9
	"log/slog"
	"path/filepath"
	"slices"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
10
	"strconv"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
11
	"strings"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
12

13
	"github.com/ollama/ollama/envconfig"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
14
	"github.com/ollama/ollama/format"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
15
16
17
18
19
20
21
22
23
24
)

const (

	// iGPUName is the device name that AMDGetGPUInfo compares against
	// (case-insensitively) to recognize integrated GPUs.
	// TODO: We're looking for this exact name to detect iGPUs since hipGetDeviceProperties never reports integrated==true
	iGPUName = "AMD Radeon(TM) Graphics"
)

var (
	// ROCmLibGlobs lists the file name patterns used to validate if the given
	// ROCm lib is usable. This is not sufficient to discern v5 vs v6.
	ROCmLibGlobs = []string{
		"hipblas.dll",
		"rocblas",
	}

	// RocmStandardLocations lists fixed ROCm install directories; at present
	// only the 6.1 bin directory is included.
	// TODO glob?
	RocmStandardLocations = []string{
		`C:\Program Files\AMD\ROCm\6.1\bin`,
	}
)

29
30
// Only called once during bootstrap
func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
31
	resp := []RocmGPUInfo{}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
32
33
34
	hl, err := NewHipLib()
	if err != nil {
		slog.Debug(err.Error())
35
		return nil, err
Daniel Hiltgen's avatar
Daniel Hiltgen committed
36
37
38
	}
	defer hl.Release()

39
40
41
42
43
	driverMajor, driverMinor, err := hl.AMDDriverVersion()
	if err != nil {
		// For now this is benign, but we may eventually need to fail compatibility checks
		slog.Debug("error looking up amd driver version", "error", err)
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
44

45
	// Note: the HIP library automatically handles subsetting to any *_VISIBLE_DEVICES the user specified
Daniel Hiltgen's avatar
Daniel Hiltgen committed
46
47
	count := hl.HipGetDeviceCount()
	if count == 0 {
48
49
50
		err := fmt.Errorf("no compatible amdgpu devices detected")
		slog.Info(err.Error())
		return nil, err
Daniel Hiltgen's avatar
Daniel Hiltgen committed
51
	}
Michael Yang's avatar
Michael Yang committed
52

Daniel Hiltgen's avatar
Daniel Hiltgen committed
53
54
	libDir, err := AMDValidateLibDir()
	if err != nil {
55
56
57
		err = fmt.Errorf("unable to verify rocm library: %w", err)
		slog.Warn(err.Error())
		return nil, err
Daniel Hiltgen's avatar
Daniel Hiltgen committed
58
59
60
	}

	var supported []string
Michael Yang's avatar
string  
Michael Yang committed
61
	gfxOverride := envconfig.HsaOverrideGfxVersion()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
62
63
64
	if gfxOverride == "" {
		supported, err = GetSupportedGFX(libDir)
		if err != nil {
65
66
67
			err = fmt.Errorf("failed to lookup supported GFX types: %w", err)
			slog.Warn(err.Error())
			return nil, err
Daniel Hiltgen's avatar
Daniel Hiltgen committed
68
69
		}
	} else {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
70
		slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
71
72
	}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
73
	slog.Debug("detected hip devices", "count", count)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
74
	// TODO how to determine the underlying device ID when visible devices is causing this to subset?
Michael Yang's avatar
Michael Yang committed
75
	for i := range count {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
76
77
		err = hl.HipSetDevice(i)
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
78
			slog.Warn("set device", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
79
80
81
82
83
			continue
		}

		props, err := hl.HipGetDeviceProperties(i)
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
84
			slog.Warn("get properties", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
85
86
87
88
			continue
		}
		n := bytes.IndexByte(props.Name[:], 0)
		name := string(props.Name[:n])
Daniel Hiltgen's avatar
Daniel Hiltgen committed
89
90
		// TODO is UUID actually populated on windows?
		// Can luid be used on windows for setting visible devices (and is it actually set?)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
91
92
		n = bytes.IndexByte(props.GcnArchName[:], 0)
		gfx := string(props.GcnArchName[:n])
Daniel Hiltgen's avatar
Daniel Hiltgen committed
93
		slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
Michael Yang's avatar
lint  
Michael Yang committed
94
		// slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY!  Always 0
Daniel Hiltgen's avatar
Daniel Hiltgen committed
95
96
		// TODO  Why isn't props.iGPU accurate!?

Daniel Hiltgen's avatar
Daniel Hiltgen committed
97
		freeMemory, totalMemory, err := hl.HipMemGetInfo()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
98
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
99
			slog.Warn("get mem info", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
100
101
102
			continue
		}

103
104
105
106
107
108
109
		gpuInfo := RocmGPUInfo{
			GpuInfo: GpuInfo{
				Library: "rocm",
				memInfo: memInfo{
					TotalMemory: totalMemory,
					FreeMemory:  freeMemory,
				},
110
111
112
				// Free memory reporting on Windows is not reliable until we bump to ROCm v6.2
				UnreliableFreeMemory: true,

Daniel Hiltgen's avatar
Daniel Hiltgen committed
113
				ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
114
				filterID:       i,
Michael Yang's avatar
Michael Yang committed
115
				DependencyPath: []string{libDir},
116
117
118
				MinimumMemory:  rocmMinimumMemory,
				Name:           name,
				Compute:        gfx,
119
120
				DriverMajor:    driverMajor,
				DriverMinor:    driverMinor,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
121
			},
122
			index: i,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
123
124
		}

125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
		if strings.EqualFold(name, iGPUName) || totalMemory < IGPUMemLimit {
			reason := "unsupported Radeon iGPU detected skipping"
			slog.Info(reason, "id", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
				GpuInfo: gpuInfo.GpuInfo,
				Reason:  reason,
			})
			continue
		}

		// Strip off Target Features when comparing
		if !slices.Contains[[]string, string](supported, strings.Split(gfx, ":")[0]) {
			reason := fmt.Sprintf("amdgpu is not supported (supported types:%s)", supported)
			slog.Warn(reason, "gpu_type", gfx, "gpu", gpuInfo.ID, "library", libDir)
			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
				GpuInfo: gpuInfo.GpuInfo,
				Reason:  reason,
			})
			// HSA_OVERRIDE_GFX_VERSION not supported on windows
			continue
		} else {
			slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
		}

		slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
		slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))

Daniel Hiltgen's avatar
Daniel Hiltgen committed
153
		resp = append(resp, gpuInfo)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
154
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
155

156
	return resp, nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
157
158
159
}

func AMDValidateLibDir() (string, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
160
	libDir, err := commonAMDValidateLibDir()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
161
	if err == nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
162
		return libDir, nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
163
164
	}

165
	// Installer payload (if we're running from some other location)
Michael Yang's avatar
Michael Yang committed
166
	rocmTargetDir := filepath.Join(LibOllamaPath, "rocm")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
167
	if rocmLibUsable(rocmTargetDir) {
168
		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
169
170
171
172
		return rocmTargetDir, nil
	}

	// Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
173
	slog.Warn("amdgpu detected, but no compatible rocm library found.  Please install ROCm")
Michael Yang's avatar
lint  
Michael Yang committed
174
	return "", errors.New("no suitable rocm found, falling back to CPU")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
175
}
176
177
178
179
180
181
182
183

// RefreshFreeMemory re-queries the HIP library for the free memory of every
// GPU in the list and stores the result in each entry's FreeMemory field.
//
// It returns nil for an empty list. An error is returned if the HIP library
// cannot be loaded or if selecting a device fails; in the latter case the
// remaining GPUs are not refreshed. A failed memory query for a single GPU is
// logged and that GPU keeps its previous value.
func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
	if len(gpus) == 0 {
		return nil
	}
	hl, err := NewHipLib()
	if err != nil {
		slog.Debug(err.Error())
		return err
	}
	defer hl.Release()

	for i := range gpus {
		// Select the device by its stored HIP index; the position in this
		// list is not necessarily the same number.
		if err := hl.HipSetDevice(gpus[i].index); err != nil {
			return err
		}
		freeMemory, _, err := hl.HipMemGetInfo()
		if err != nil {
			// Report the GPU's own ID rather than the slice position so the
			// warning identifies the same device as the debug line below.
			slog.Warn("get mem info", "id", gpus[i].ID, "error", err)
			continue
		}
		slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(freeMemory))
		gpus[i].FreeMemory = freeMemory
	}
	return nil
}
203

204
// rocmGetVisibleDevicesEnv builds the "HIP_VISIBLE_DEVICES=<ids>" environment
// entry for the rocm GPUs in gpuInfo, joining the IDs with commas in input
// order. Entries whose Library is not "rocm" are ignored. It returns an empty
// string when no rocm GPU is present.
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) string {
	var selected []string
	for _, gpu := range gpuInfo {
		if gpu.Library != "rocm" {
			continue
		}

		// If the device requires a numeric ID, for filtering purposes, we use
		// the unfiltered ID number; any other ID is passed through unchanged.
		id := gpu.ID
		if _, err := strconv.Atoi(id); err == nil {
			id = strconv.Itoa(gpu.filterID)
		}
		selected = append(selected, id)
	}

	if len(selected) == 0 {
		return ""
	}

	// There are 3 potential env vars to use to select GPUs.
	// ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows
	// HIP_VISIBLE_DEVICES supports numeric IDs only
	// GPU_DEVICE_ORDINAL supports numeric IDs only
	return "HIP_VISIBLE_DEVICES=" + strings.Join(selected, ",")
}