amd_windows.go 5.94 KB
Newer Older
Daniel Hiltgen's avatar
Daniel Hiltgen committed
1
2
3
4
5
6
7
8
9
package gpu

import (
	"bytes"
	"fmt"
	"log/slog"
	"os"
	"path/filepath"
	"slices"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
10
	"strconv"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
11
	"strings"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
12

13
	"github.com/ollama/ollama/envconfig"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
14
	"github.com/ollama/ollama/format"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
15
16
17
18
19
20
21
22
23
24
)

// iGPUName is the exact device name used to detect integrated GPUs.
// TODO: we match on this name because hipGetDeviceProperties never reports integrated==true.
const iGPUName = "AMD Radeon(TM) Graphics"

var (
	// ROCmLibGlobs lists the file name patterns used to validate if the given
	// ROCm lib is usable. This is not sufficient to discern v5 vs v6.
	ROCmLibGlobs = []string{"hipblas.dll", "rocblas"}

	// RocmStandardLocations holds the ROCm 6.1 bin directory under
	// "C:\Program Files\AMD\ROCm". TODO glob?
	RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\6.1\\bin"}
)

29
30
func AMDGetGPUInfo() []RocmGPUInfo {
	resp := []RocmGPUInfo{}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
31
32
33
	hl, err := NewHipLib()
	if err != nil {
		slog.Debug(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
34
		return nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
35
36
37
	}
	defer hl.Release()

Daniel Hiltgen's avatar
Daniel Hiltgen committed
38
39
40
41
42
43
	// TODO - this reports incorrect version information, so omitting for now
	// driverMajor, driverMinor, err := hl.AMDDriverVersion()
	// if err != nil {
	// 	// For now this is benign, but we may eventually need to fail compatibility checks
	// 	slog.Debug("error looking up amd driver version", "error", err)
	// }
Daniel Hiltgen's avatar
Daniel Hiltgen committed
44

Daniel Hiltgen's avatar
Daniel Hiltgen committed
45
	// Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
Daniel Hiltgen's avatar
Daniel Hiltgen committed
46
47
	count := hl.HipGetDeviceCount()
	if count == 0 {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
48
		return nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
49
50
51
	}
	libDir, err := AMDValidateLibDir()
	if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
52
53
		slog.Warn("unable to verify rocm library, will use cpu", "error", err)
		return nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
54
55
56
	}

	var supported []string
57
	gfxOverride := envconfig.HsaOverrideGfxVersion
Daniel Hiltgen's avatar
Daniel Hiltgen committed
58
59
60
	if gfxOverride == "" {
		supported, err = GetSupportedGFX(libDir)
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
61
62
			slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
			return nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
63
64
		}
	} else {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
65
		slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
66
67
	}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
68
	slog.Debug("detected hip devices", "count", count)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
69
	// TODO how to determine the underlying device ID when visible devices is causing this to subset?
Michael Yang's avatar
Michael Yang committed
70
	for i := range count {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
71
72
		err = hl.HipSetDevice(i)
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
73
			slog.Warn("set device", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
74
75
76
77
78
			continue
		}

		props, err := hl.HipGetDeviceProperties(i)
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
79
			slog.Warn("get properties", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
80
81
82
83
			continue
		}
		n := bytes.IndexByte(props.Name[:], 0)
		name := string(props.Name[:n])
Daniel Hiltgen's avatar
Daniel Hiltgen committed
84
85
		// TODO is UUID actually populated on windows?
		// Can luid be used on windows for setting visible devices (and is it actually set?)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
86
87
		n = bytes.IndexByte(props.GcnArchName[:], 0)
		gfx := string(props.GcnArchName[:n])
Daniel Hiltgen's avatar
Daniel Hiltgen committed
88
		slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
89
90
91
		//slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY!  Always 0
		// TODO  Why isn't props.iGPU accurate!?
		if strings.EqualFold(name, iGPUName) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
92
			slog.Info("unsupported Radeon iGPU detected skipping", "id", i, "name", name, "gfx", gfx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
93
94
95
96
			continue
		}
		if gfxOverride == "" {
			if !slices.Contains[[]string, string](supported, gfx) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
97
				slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
98
99
100
101
				// TODO - consider discrete markdown just for ROCM troubleshooting?
				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
				continue
			} else {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
102
				slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
103
104
105
			}
		}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
106
		freeMemory, totalMemory, err := hl.HipMemGetInfo()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
107
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
108
			slog.Warn("get mem info", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
109
110
111
			continue
		}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
112
113
114
115
116
117
		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
		if totalMemory < IGPUMemLimit {
			slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", i, "total", format.HumanBytes2(totalMemory))
			continue
		}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
118
119
		slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
		slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
120
121
122
123
124
125
126
		gpuInfo := RocmGPUInfo{
			GpuInfo: GpuInfo{
				Library: "rocm",
				memInfo: memInfo{
					TotalMemory: totalMemory,
					FreeMemory:  freeMemory,
				},
127
128
129
				// Free memory reporting on Windows is not reliable until we bump to ROCm v6.2
				UnreliableFreeMemory: true,

Daniel Hiltgen's avatar
Daniel Hiltgen committed
130
				ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
131
132
133
134
135
136
137
138
				DependencyPath: libDir,
				MinimumMemory:  rocmMinimumMemory,
				Name:           name,
				Compute:        gfx,

				// TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
				// DriverMajor:    driverMajor,
				// DriverMinor:    driverMinor,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
139
			},
140
			index: i,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
141
142
143
		}

		resp = append(resp, gpuInfo)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
144
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
145
146

	return resp
Daniel Hiltgen's avatar
Daniel Hiltgen committed
147
148
149
}

func AMDValidateLibDir() (string, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
150
	libDir, err := commonAMDValidateLibDir()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
151
	if err == nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
152
		return libDir, nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
153
154
	}

155
156
157
158
	// Installer payload (if we're running from some other location)
	localAppData := os.Getenv("LOCALAPPDATA")
	appDir := filepath.Join(localAppData, "Programs", "Ollama")
	rocmTargetDir := filepath.Join(appDir, "rocm")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
159
	if rocmLibUsable(rocmTargetDir) {
160
		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
161
162
163
164
		return rocmTargetDir, nil
	}

	// Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
165
	slog.Warn("amdgpu detected, but no compatible rocm library found.  Please install ROCm")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
166
167
	return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
}
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194

// RefreshFreeMemory updates the FreeMemory field of every entry in gpus with
// the value currently reported by the HIP library.
//
// An empty list is a no-op. If the HIP library cannot be loaded the failure is
// logged at debug level and nil is returned. A HipSetDevice failure aborts the
// refresh and is returned to the caller; a HipMemGetInfo failure is logged and
// only that entry is skipped.
func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
	if len(gpus) == 0 {
		return nil
	}

	hip, loadErr := NewHipLib()
	if loadErr != nil {
		slog.Debug(loadErr.Error())
		return nil
	}
	defer hip.Release()

	for idx := range gpus {
		// Select the device first; the memory query below takes no device argument.
		if err := hip.HipSetDevice(gpus[idx].index); err != nil {
			return err
		}

		free, _, err := hip.HipMemGetInfo()
		if err != nil {
			slog.Warn("get mem info", "id", idx, "error", err)
			continue
		}

		slog.Debug(
			"updating rocm free memory",
			"gpu", gpus[idx].ID,
			"name", gpus[idx].Name,
			"before", format.HumanBytes2(gpus[idx].FreeMemory),
			"now", format.HumanBytes2(free),
		)
		gpus[idx].FreeMemory = free
	}

	return nil
}