amd_windows.go 6.56 KB
Newer Older
1
package discover
Daniel Hiltgen's avatar
Daniel Hiltgen committed
2
3
4

import (
	"bytes"
Michael Yang's avatar
lint  
Michael Yang committed
5
	"errors"
6
	"fmt"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
7
8
9
	"log/slog"
	"path/filepath"
	"slices"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
10
	"strconv"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
11
	"strings"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
12

13
	"github.com/ollama/ollama/envconfig"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
14
	"github.com/ollama/ollama/format"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
15
16
17
18
19
20
21
22
23
24
)

// iGPUName is the device name used to recognize Radeon integrated GPUs.
//
// TODO: we look for this specific name to detect iGPUs because
// hipGetDeviceProperties never reports integrated==true.
const iGPUName = "AMD Radeon(TM) Graphics"

var (
	// ROCmLibGlobs is used to validate if the given ROCm lib is usable.
	// This is not sufficient to discern v5 vs v6.
	ROCmLibGlobs = []string{"hipblas.dll", "rocblas"}

	// RocmStandardLocations lists the ROCm install directories to consider.
	// TODO glob?
	RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\6.1\\bin"}
)

29
30
// Only called once during bootstrap
func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
31
	resp := []RocmGPUInfo{}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
32
33
34
	hl, err := NewHipLib()
	if err != nil {
		slog.Debug(err.Error())
35
		return nil, err
Daniel Hiltgen's avatar
Daniel Hiltgen committed
36
37
38
	}
	defer hl.Release()

39
40
41
42
43
	driverMajor, driverMinor, err := hl.AMDDriverVersion()
	if err != nil {
		// For now this is benign, but we may eventually need to fail compatibility checks
		slog.Debug("error looking up amd driver version", "error", err)
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
44

45
	// Note: the HIP library automatically handles subsetting to any *_VISIBLE_DEVICES the user specified
Daniel Hiltgen's avatar
Daniel Hiltgen committed
46
47
	count := hl.HipGetDeviceCount()
	if count == 0 {
48
49
50
		err := fmt.Errorf("no compatible amdgpu devices detected")
		slog.Info(err.Error())
		return nil, err
Daniel Hiltgen's avatar
Daniel Hiltgen committed
51
	}
Michael Yang's avatar
Michael Yang committed
52

Daniel Hiltgen's avatar
Daniel Hiltgen committed
53
54
	libDir, err := AMDValidateLibDir()
	if err != nil {
55
56
57
		err = fmt.Errorf("unable to verify rocm library: %w", err)
		slog.Warn(err.Error())
		return nil, err
Daniel Hiltgen's avatar
Daniel Hiltgen committed
58
59
60
	}

	var supported []string
Michael Yang's avatar
string  
Michael Yang committed
61
	gfxOverride := envconfig.HsaOverrideGfxVersion()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
62
63
64
	if gfxOverride == "" {
		supported, err = GetSupportedGFX(libDir)
		if err != nil {
65
66
67
			err = fmt.Errorf("failed to lookup supported GFX types: %w", err)
			slog.Warn(err.Error())
			return nil, err
Daniel Hiltgen's avatar
Daniel Hiltgen committed
68
69
		}
	} else {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
70
		slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
71
72
	}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
73
	slog.Debug("detected hip devices", "count", count)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
74
	// TODO how to determine the underlying device ID when visible devices is causing this to subset?
Michael Yang's avatar
Michael Yang committed
75
	for i := range count {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
76
77
		err = hl.HipSetDevice(i)
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
78
			slog.Warn("set device", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
79
80
81
82
83
			continue
		}

		props, err := hl.HipGetDeviceProperties(i)
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
84
			slog.Warn("get properties", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
85
86
87
88
			continue
		}
		n := bytes.IndexByte(props.Name[:], 0)
		name := string(props.Name[:n])
Daniel Hiltgen's avatar
Daniel Hiltgen committed
89
90
		// TODO is UUID actually populated on windows?
		// Can luid be used on windows for setting visible devices (and is it actually set?)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
91
92
		n = bytes.IndexByte(props.GcnArchName[:], 0)
		gfx := string(props.GcnArchName[:n])
Daniel Hiltgen's avatar
Daniel Hiltgen committed
93
		slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
Michael Yang's avatar
lint  
Michael Yang committed
94
		// slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY!  Always 0
Daniel Hiltgen's avatar
Daniel Hiltgen committed
95
96
		// TODO  Why isn't props.iGPU accurate!?

Daniel Hiltgen's avatar
Daniel Hiltgen committed
97
		freeMemory, totalMemory, err := hl.HipMemGetInfo()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
98
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
99
			slog.Warn("get mem info", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
100
101
102
			continue
		}

103
104
105
106
107
108
109
		gpuInfo := RocmGPUInfo{
			GpuInfo: GpuInfo{
				Library: "rocm",
				memInfo: memInfo{
					TotalMemory: totalMemory,
					FreeMemory:  freeMemory,
				},
110
111
112
				// Free memory reporting on Windows is not reliable until we bump to ROCm v6.2
				UnreliableFreeMemory: true,

Daniel Hiltgen's avatar
Daniel Hiltgen committed
113
				ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
Michael Yang's avatar
Michael Yang committed
114
				DependencyPath: []string{libDir},
115
116
117
				MinimumMemory:  rocmMinimumMemory,
				Name:           name,
				Compute:        gfx,
118
119
				DriverMajor:    driverMajor,
				DriverMinor:    driverMinor,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
120
			},
121
			index: i,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
122
123
		}

124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
		if strings.EqualFold(name, iGPUName) || totalMemory < IGPUMemLimit {
			reason := "unsupported Radeon iGPU detected skipping"
			slog.Info(reason, "id", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
				GpuInfo: gpuInfo.GpuInfo,
				Reason:  reason,
			})
			continue
		}

		// Strip off Target Features when comparing
		if !slices.Contains[[]string, string](supported, strings.Split(gfx, ":")[0]) {
			reason := fmt.Sprintf("amdgpu is not supported (supported types:%s)", supported)
			slog.Warn(reason, "gpu_type", gfx, "gpu", gpuInfo.ID, "library", libDir)
			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
				GpuInfo: gpuInfo.GpuInfo,
				Reason:  reason,
			})
			// HSA_OVERRIDE_GFX_VERSION not supported on windows
			continue
		} else {
			slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
		}

		slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
		slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))

Daniel Hiltgen's avatar
Daniel Hiltgen committed
152
		resp = append(resp, gpuInfo)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
153
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
154

155
	return resp, nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
156
157
158
}

func AMDValidateLibDir() (string, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
159
	libDir, err := commonAMDValidateLibDir()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
160
	if err == nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
161
		return libDir, nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
162
163
	}

164
	// Installer payload (if we're running from some other location)
Michael Yang's avatar
Michael Yang committed
165
	rocmTargetDir := filepath.Join(LibOllamaPath, "rocm")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
166
	if rocmLibUsable(rocmTargetDir) {
167
		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
168
169
170
171
		return rocmTargetDir, nil
	}

	// Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
172
	slog.Warn("amdgpu detected, but no compatible rocm library found.  Please install ROCm")
Michael Yang's avatar
lint  
Michael Yang committed
173
	return "", errors.New("no suitable rocm found, falling back to CPU")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
174
}
175
176
177
178
179
180
181
182

// RefreshFreeMemory re-queries free memory through the HIP library for every
// GPU in the list and stores the result in that entry's FreeMemory field.
//
// An empty list is a no-op. An error is returned if the HIP library cannot be
// loaded or if selecting a device fails; a failed memory query for a single
// device is logged and that entry is left unchanged.
func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
	if len(gpus) == 0 {
		return nil
	}
	hl, err := NewHipLib()
	if err != nil {
		slog.Debug(err.Error())
		return err
	}
	defer hl.Release()

	for i := range gpus {
		// HipMemGetInfo takes no device argument, so select the device first.
		if err := hl.HipSetDevice(gpus[i].index); err != nil {
			return err
		}
		freeMemory, _, err := hl.HipMemGetInfo()
		if err != nil {
			slog.Warn("get mem info", "id", i, "error", err)
			continue
		}
		slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(freeMemory))
		gpus[i].FreeMemory = freeMemory
	}
	return nil
}
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218

// rocmGetVisibleDevicesEnv returns the environment variable name and value
// that select the given GPUs: the name is always "HIP_VISIBLE_DEVICES" and the
// value is the comma-separated IDs of the entries whose Library is "rocm".
// Entries for any other library are logged at debug level and left out.
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
	// There are 3 potential env vars to use to select GPUs.
	// ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows
	// HIP_VISIBLE_DEVICES supports numeric IDs only
	// GPU_DEVICE_ORDINAL supports numeric IDs only
	const envName = "HIP_VISIBLE_DEVICES"

	selected := make([]string, 0, len(gpuInfo))
	for idx := range gpuInfo {
		dev := &gpuInfo[idx]
		if dev.Library == "rocm" {
			selected = append(selected, dev.ID)
			continue
		}
		// TODO shouldn't happen if things are wired correctly...
		slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", dev.Library)
	}

	joined := strings.Join(selected, ",")
	return envName, joined
}