amd_windows.go 5.96 KB
Newer Older
Daniel Hiltgen's avatar
Daniel Hiltgen committed
1
2
3
4
5
6
7
8
9
package gpu

import (
	"bytes"
	"fmt"
	"log/slog"
	"os"
	"path/filepath"
	"slices"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
10
	"strconv"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
11
	"strings"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
12
13

	"github.com/ollama/ollama/format"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
14
15
16
17
18
19
20
21
22
23
)

const (

	// iGPUName is the device name that AMDGetGPUInfo compares (case-insensitively)
	// against the HIP-reported name in order to skip integrated GPUs.
	// TODO  We're looking for this exact name to detect iGPUs since hipGetDeviceProperties never reports integrated==true
	iGPUName = "AMD Radeon(TM) Graphics"
)

var (
	// ROCmLibGlobs lists the names used to validate if the given ROCm lib is usable.
	// TODO - probably include more coverage of files here...
	ROCmLibGlobs = []string{"hipblas.dll", "rocblas"}

	// RocmStandardLocations lists directories to look in for a ROCm install;
	// at present only the 5.7 path is hard-coded.
	// TODO glob?
	RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\5.7\\bin"}
)

28
29
func AMDGetGPUInfo() []RocmGPUInfo {
	resp := []RocmGPUInfo{}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
30
31
32
	hl, err := NewHipLib()
	if err != nil {
		slog.Debug(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
33
		return nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
34
35
36
	}
	defer hl.Release()

Daniel Hiltgen's avatar
Daniel Hiltgen committed
37
38
39
40
41
42
	// TODO - this reports incorrect version information, so omitting for now
	// driverMajor, driverMinor, err := hl.AMDDriverVersion()
	// if err != nil {
	// 	// For now this is benign, but we may eventually need to fail compatibility checks
	// 	slog.Debug("error looking up amd driver version", "error", err)
	// }
Daniel Hiltgen's avatar
Daniel Hiltgen committed
43

Daniel Hiltgen's avatar
Daniel Hiltgen committed
44
	// Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
Daniel Hiltgen's avatar
Daniel Hiltgen committed
45
46
	count := hl.HipGetDeviceCount()
	if count == 0 {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
47
		return nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
48
49
50
	}
	libDir, err := AMDValidateLibDir()
	if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
51
52
		slog.Warn("unable to verify rocm library, will use cpu", "error", err)
		return nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
53
54
55
56
57
58
59
	}

	var supported []string
	gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
	if gfxOverride == "" {
		supported, err = GetSupportedGFX(libDir)
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
60
61
			slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
			return nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
62
63
		}
	} else {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
64
		slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
65
66
	}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
67
	slog.Debug("detected hip devices", "count", count)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
68
	// TODO how to determine the underlying device ID when visible devices is causing this to subset?
Michael Yang's avatar
Michael Yang committed
69
	for i := range count {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
70
71
		err = hl.HipSetDevice(i)
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
72
			slog.Warn("set device", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
73
74
75
76
77
			continue
		}

		props, err := hl.HipGetDeviceProperties(i)
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
78
			slog.Warn("get properties", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
79
80
81
82
			continue
		}
		n := bytes.IndexByte(props.Name[:], 0)
		name := string(props.Name[:n])
Daniel Hiltgen's avatar
Daniel Hiltgen committed
83
84
		// TODO is UUID actually populated on windows?
		// Can luid be used on windows for setting visible devices (and is it actually set?)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
85
86
		n = bytes.IndexByte(props.GcnArchName[:], 0)
		gfx := string(props.GcnArchName[:n])
Daniel Hiltgen's avatar
Daniel Hiltgen committed
87
		slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
88
89
90
		//slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY!  Always 0
		// TODO  Why isn't props.iGPU accurate!?
		if strings.EqualFold(name, iGPUName) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
91
			slog.Info("unsupported Radeon iGPU detected skipping", "id", i, "name", name, "gfx", gfx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
92
93
94
95
			continue
		}
		if gfxOverride == "" {
			if !slices.Contains[[]string, string](supported, gfx) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
96
				slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
97
98
99
100
				// TODO - consider discrete markdown just for ROCM troubleshooting?
				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
				continue
			} else {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
101
				slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
102
103
104
			}
		}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
105
		freeMemory, totalMemory, err := hl.HipMemGetInfo()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
106
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
107
			slog.Warn("get mem info", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
108
109
110
			continue
		}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
111
112
113
114
115
116
117
118
		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
		if totalMemory < IGPUMemLimit {
			slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", i, "total", format.HumanBytes2(totalMemory))
			continue
		}

		// TODO revisit this once ROCm v6 is available on windows.
		// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
Daniel Hiltgen's avatar
Daniel Hiltgen committed
119
120
		slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
		slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
121
122
123
124
125
126
127
		gpuInfo := RocmGPUInfo{
			GpuInfo: GpuInfo{
				Library: "rocm",
				memInfo: memInfo{
					TotalMemory: totalMemory,
					FreeMemory:  freeMemory,
				},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
128
				ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
129
130
131
132
133
134
135
136
				DependencyPath: libDir,
				MinimumMemory:  rocmMinimumMemory,
				Name:           name,
				Compute:        gfx,

				// TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
				// DriverMajor:    driverMajor,
				// DriverMinor:    driverMinor,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
137
			},
138
			index: i,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
139
140
141
		}

		resp = append(resp, gpuInfo)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
142
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
143
144

	return resp
Daniel Hiltgen's avatar
Daniel Hiltgen committed
145
146
147
}

func AMDValidateLibDir() (string, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
148
	libDir, err := commonAMDValidateLibDir()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
149
	if err == nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
150
		return libDir, nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
151
152
	}

153
154
155
156
	// Installer payload (if we're running from some other location)
	localAppData := os.Getenv("LOCALAPPDATA")
	appDir := filepath.Join(localAppData, "Programs", "Ollama")
	rocmTargetDir := filepath.Join(appDir, "rocm")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
157
	if rocmLibUsable(rocmTargetDir) {
158
		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
159
160
161
162
		return rocmTargetDir, nil
	}

	// Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
163
	slog.Warn("amdgpu detected, but no compatible rocm library found.  Please install ROCm")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
164
165
	return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
}
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192

// RefreshFreeMemory re-queries the HIP library for the free memory of every
// GPU in the list and stores the new value in each entry's FreeMemory field.
//
// An empty list is a no-op. If the HIP library cannot be loaded, the failure is
// logged at debug level and nil is returned. A HipSetDevice failure aborts the
// refresh and is returned to the caller; a HipMemGetInfo failure is logged and
// only that entry is skipped.
func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
	if len(gpus) == 0 {
		return nil
	}

	hip, loadErr := NewHipLib()
	if loadErr != nil {
		slog.Debug(loadErr.Error())
		return nil
	}
	defer hip.Release()

	for idx := range gpus {
		// Select the device first; the memory query takes no device argument.
		if err := hip.HipSetDevice(gpus[idx].index); err != nil {
			return err
		}

		free, _, err := hip.HipMemGetInfo()
		if err != nil {
			slog.Warn("get mem info", "id", idx, "error", err)
			continue
		}

		slog.Debug(
			"updating rocm free memory",
			"gpu", gpus[idx].ID,
			"name", gpus[idx].Name,
			"before", format.HumanBytes2(gpus[idx].FreeMemory),
			"now", format.HumanBytes2(free),
		)
		gpus[idx].FreeMemory = free
	}

	return nil
}