amd_windows.go 6.75 KB
Newer Older
1
package discover
Daniel Hiltgen's avatar
Daniel Hiltgen committed
2
3
4

import (
	"bytes"
Michael Yang's avatar
lint  
Michael Yang committed
5
	"errors"
6
	"fmt"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
7
8
9
10
	"log/slog"
	"os"
	"path/filepath"
	"slices"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
11
	"strconv"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
12
	"strings"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
13

14
	"github.com/ollama/ollama/envconfig"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
15
	"github.com/ollama/ollama/format"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
16
17
18
19
20
21
22
23
24
25
)

const (

	// iGPUName is the device name that AMDGetGPUInfo compares (case-insensitively)
	// against the name reported by HIP in order to recognise integrated GPUs.
	// TODO: We're looking for this exact name to detect iGPUs since hipGetDeviceProperties never reports integrated==true
	iGPUName = "AMD Radeon(TM) Graphics"
)

var (
	// ROCmLibGlobs lists the file name patterns used to validate if the given
	// ROCm lib is usable. This is not sufficient to discern v5 vs v6.
	ROCmLibGlobs = []string{"hipblas.dll", "rocblas"}

	// RocmStandardLocations lists the well-known ROCm install directories on
	// Windows; currently only the ROCm 6.1 bin directory is included.
	// NOTE(review): how these entries are searched is defined outside this
	// file - confirm against the shared validation helper.
	// TODO glob?
	RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\6.1\\bin"}
)

30
31
// Only called once during bootstrap
func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
32
	resp := []RocmGPUInfo{}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
33
34
35
	hl, err := NewHipLib()
	if err != nil {
		slog.Debug(err.Error())
36
		return nil, err
Daniel Hiltgen's avatar
Daniel Hiltgen committed
37
38
39
	}
	defer hl.Release()

40
41
42
43
44
	driverMajor, driverMinor, err := hl.AMDDriverVersion()
	if err != nil {
		// For now this is benign, but we may eventually need to fail compatibility checks
		slog.Debug("error looking up amd driver version", "error", err)
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
45

46
	// Note: the HIP library automatically handles subsetting to any *_VISIBLE_DEVICES the user specified
Daniel Hiltgen's avatar
Daniel Hiltgen committed
47
48
	count := hl.HipGetDeviceCount()
	if count == 0 {
49
50
51
		err := fmt.Errorf("no compatible amdgpu devices detected")
		slog.Info(err.Error())
		return nil, err
Daniel Hiltgen's avatar
Daniel Hiltgen committed
52
	}
53
	depPaths := LibraryDirs()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
54
55
	libDir, err := AMDValidateLibDir()
	if err != nil {
56
57
58
		err = fmt.Errorf("unable to verify rocm library: %w", err)
		slog.Warn(err.Error())
		return nil, err
Daniel Hiltgen's avatar
Daniel Hiltgen committed
59
	}
60
	depPaths = append(depPaths, libDir)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
61
62

	var supported []string
Michael Yang's avatar
string  
Michael Yang committed
63
	gfxOverride := envconfig.HsaOverrideGfxVersion()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
64
65
66
	if gfxOverride == "" {
		supported, err = GetSupportedGFX(libDir)
		if err != nil {
67
68
69
			err = fmt.Errorf("failed to lookup supported GFX types: %w", err)
			slog.Warn(err.Error())
			return nil, err
Daniel Hiltgen's avatar
Daniel Hiltgen committed
70
71
		}
	} else {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
72
		slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
73
74
	}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
75
	slog.Debug("detected hip devices", "count", count)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
76
	// TODO how to determine the underlying device ID when visible devices is causing this to subset?
Michael Yang's avatar
Michael Yang committed
77
	for i := range count {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
78
79
		err = hl.HipSetDevice(i)
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
80
			slog.Warn("set device", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
81
82
83
84
85
			continue
		}

		props, err := hl.HipGetDeviceProperties(i)
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
86
			slog.Warn("get properties", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
87
88
89
90
			continue
		}
		n := bytes.IndexByte(props.Name[:], 0)
		name := string(props.Name[:n])
Daniel Hiltgen's avatar
Daniel Hiltgen committed
91
92
		// TODO is UUID actually populated on windows?
		// Can luid be used on windows for setting visible devices (and is it actually set?)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
93
94
		n = bytes.IndexByte(props.GcnArchName[:], 0)
		gfx := string(props.GcnArchName[:n])
Daniel Hiltgen's avatar
Daniel Hiltgen committed
95
		slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
Michael Yang's avatar
lint  
Michael Yang committed
96
		// slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY!  Always 0
Daniel Hiltgen's avatar
Daniel Hiltgen committed
97
98
		// TODO  Why isn't props.iGPU accurate!?

Daniel Hiltgen's avatar
Daniel Hiltgen committed
99
		freeMemory, totalMemory, err := hl.HipMemGetInfo()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
100
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
101
			slog.Warn("get mem info", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
102
103
104
			continue
		}

105
106
107
108
109
110
111
		gpuInfo := RocmGPUInfo{
			GpuInfo: GpuInfo{
				Library: "rocm",
				memInfo: memInfo{
					TotalMemory: totalMemory,
					FreeMemory:  freeMemory,
				},
112
113
114
				// Free memory reporting on Windows is not reliable until we bump to ROCm v6.2
				UnreliableFreeMemory: true,

Daniel Hiltgen's avatar
Daniel Hiltgen committed
115
				ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
116
				DependencyPath: depPaths,
117
118
119
				MinimumMemory:  rocmMinimumMemory,
				Name:           name,
				Compute:        gfx,
120
121
				DriverMajor:    driverMajor,
				DriverMinor:    driverMinor,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
122
			},
123
			index: i,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
124
125
		}

126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
		if strings.EqualFold(name, iGPUName) || totalMemory < IGPUMemLimit {
			reason := "unsupported Radeon iGPU detected skipping"
			slog.Info(reason, "id", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
				GpuInfo: gpuInfo.GpuInfo,
				Reason:  reason,
			})
			continue
		}

		// Strip off Target Features when comparing
		if !slices.Contains[[]string, string](supported, strings.Split(gfx, ":")[0]) {
			reason := fmt.Sprintf("amdgpu is not supported (supported types:%s)", supported)
			slog.Warn(reason, "gpu_type", gfx, "gpu", gpuInfo.ID, "library", libDir)
			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
				GpuInfo: gpuInfo.GpuInfo,
				Reason:  reason,
			})
			// HSA_OVERRIDE_GFX_VERSION not supported on windows
			continue
		} else {
			slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
		}

		slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
		slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))

Daniel Hiltgen's avatar
Daniel Hiltgen committed
154
		resp = append(resp, gpuInfo)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
155
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
156

157
	return resp, nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
158
159
160
}

func AMDValidateLibDir() (string, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
161
	libDir, err := commonAMDValidateLibDir()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
162
	if err == nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
163
		return libDir, nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
164
165
	}

166
167
168
	// Installer payload (if we're running from some other location)
	localAppData := os.Getenv("LOCALAPPDATA")
	appDir := filepath.Join(localAppData, "Programs", "Ollama")
169
	rocmTargetDir := filepath.Join(appDir, envconfig.LibRelativeToExe(), "lib", "ollama")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
170
	if rocmLibUsable(rocmTargetDir) {
171
		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
172
173
174
175
		return rocmTargetDir, nil
	}

	// Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
176
	slog.Warn("amdgpu detected, but no compatible rocm library found.  Please install ROCm")
Michael Yang's avatar
lint  
Michael Yang committed
177
	return "", errors.New("no suitable rocm found, falling back to CPU")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
178
}
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205

// RefreshFreeMemory re-queries HIP for the free memory of every GPU in the
// list and stores the value in that entry's FreeMemory field.
//
// An empty list is a no-op. If the HIP library cannot be loaded, the failure
// is logged at debug level and nil is returned with the list untouched. A
// failure to select a device aborts the refresh and returns that error; a
// failed memory query is logged and only skips the affected entry.
func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
	if len(gpus) == 0 {
		return nil
	}

	hip, loadErr := NewHipLib()
	if loadErr != nil {
		slog.Debug(loadErr.Error())
		return nil
	}
	defer hip.Release()

	for idx := range gpus {
		// Select the device by its stored index before querying memory.
		if err := hip.HipSetDevice(gpus[idx].index); err != nil {
			return err
		}

		free, _, err := hip.HipMemGetInfo()
		if err != nil {
			slog.Warn("get mem info", "id", idx, "error", err)
			continue
		}

		slog.Debug("updating rocm free memory", "gpu", gpus[idx].ID, "name", gpus[idx].Name, "before", format.HumanBytes2(gpus[idx].FreeMemory), "now", format.HumanBytes2(free))
		gpus[idx].FreeMemory = free
	}

	return nil
}
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222

// rocmGetVisibleDevicesEnv returns the name of the environment variable used
// to select GPUs and its value: the IDs of the given rocm devices joined with
// commas. Entries whose Library is not "rocm" are logged and left out.
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
	// There are 3 potential env vars to use to select GPUs.
	// ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows
	// HIP_VISIBLE_DEVICES supports numeric IDs only
	// GPU_DEVICE_ORDINAL supports numeric IDs only
	const envName = "HIP_VISIBLE_DEVICES"

	selected := make([]string, 0, len(gpuInfo))
	for idx := range gpuInfo {
		if lib := gpuInfo[idx].Library; lib != "rocm" {
			// TODO shouldn't happen if things are wired correctly...
			slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", lib)
			continue
		}
		selected = append(selected, gpuInfo[idx].ID)
	}

	return envName, strings.Join(selected, ",")
}