amd_windows.go 5.82 KB
Newer Older
Daniel Hiltgen's avatar
Daniel Hiltgen committed
1
2
3
4
5
6
7
8
9
package gpu

import (
	"bytes"
	"fmt"
	"log/slog"
	"os"
	"path/filepath"
	"slices"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
10
	"strconv"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
11
	"strings"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
12
13

	"github.com/ollama/ollama/format"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
14
15
16
17
18
19
20
21
22
23
)

const (

	// iGPUName is the device name that is matched (case-insensitively) against
	// the name reported for each HIP device in order to recognize an iGPU.
	// TODO  We're looking for this exact name to detect iGPUs since hipGetDeviceProperties never reports integrated==true
	iGPUName = "AMD Radeon(TM) Graphics"
)

var (
	// ROCmLibGlobs holds the names used to validate if the given ROCm lib is usable.
	// TODO - probably include more coverage of files here...
	ROCmLibGlobs = []string{"hipblas.dll", "rocblas"}

	// RocmStandardLocations holds the default ROCm bin directory path(s).
	// TODO glob?
	RocmStandardLocations = []string{`C:\Program Files\AMD\ROCm\5.7\bin`}
)

Daniel Hiltgen's avatar
Daniel Hiltgen committed
28
29
func AMDGetGPUInfo() []GpuInfo {
	resp := []GpuInfo{}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
30
31
32
	hl, err := NewHipLib()
	if err != nil {
		slog.Debug(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
33
		return nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
34
35
36
37
38
39
40
41
	}
	defer hl.Release()

	ver, err := hl.AMDDriverVersion()
	if err == nil {
		slog.Info("AMD Driver: " + ver)
	} else {
		// For now this is benign, but we may eventually need to fail compatibility checks
Daniel Hiltgen's avatar
Daniel Hiltgen committed
42
		slog.Debug("error looking up amd driver version", "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
43
44
	}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
45
	// Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
Daniel Hiltgen's avatar
Daniel Hiltgen committed
46
47
	count := hl.HipGetDeviceCount()
	if count == 0 {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
48
		return nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
49
50
51
	}
	libDir, err := AMDValidateLibDir()
	if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
52
53
		slog.Warn("unable to verify rocm library, will use cpu", "error", err)
		return nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
54
55
56
57
58
59
60
	}

	var supported []string
	gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
	if gfxOverride == "" {
		supported, err = GetSupportedGFX(libDir)
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
61
62
			slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
			return nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
63
64
65
66
67
		}
	} else {
		slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
	}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
68
69
	slog.Info("detected hip devices", "count", count)
	// TODO how to determine the underlying device ID when visible devices is causing this to subset?
Daniel Hiltgen's avatar
Daniel Hiltgen committed
70
71
72
	for i := 0; i < count; i++ {
		err = hl.HipSetDevice(i)
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
73
			slog.Warn("set device", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
74
75
76
77
78
			continue
		}

		props, err := hl.HipGetDeviceProperties(i)
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
79
			slog.Warn("get properties", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
80
81
82
83
			continue
		}
		n := bytes.IndexByte(props.Name[:], 0)
		name := string(props.Name[:n])
Daniel Hiltgen's avatar
Daniel Hiltgen committed
84
85
		// TODO is UUID actually populated on windows?
		// Can luid be used on windows for setting visible devices (and is it actually set?)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
86
87
		n = bytes.IndexByte(props.GcnArchName[:], 0)
		gfx := string(props.GcnArchName[:n])
Daniel Hiltgen's avatar
Daniel Hiltgen committed
88
89
90
91
92
93
94
95
		slog.Info("hip device", "id", i, "name", name, "gfx", gfx)
		var major, minor, patch string
		switch len(gfx) {
		case 6:
			major, minor, patch = gfx[3:4], gfx[4:5], gfx[5:]
		case 7:
			major, minor, patch = gfx[3:5], gfx[5:6], gfx[6:]
		}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
96
97
98
		//slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY!  Always 0
		// TODO  Why isn't props.iGPU accurate!?
		if strings.EqualFold(name, iGPUName) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
99
			slog.Info("iGPU detected skipping", "id", i)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
100
101
102
103
			continue
		}
		if gfxOverride == "" {
			if !slices.Contains[[]string, string](supported, gfx) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
104
				slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
105
106
107
108
				// TODO - consider discrete markdown just for ROCM troubleshooting?
				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
				continue
			} else {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
109
				slog.Info("amdgpu is supported", "gpu", i, "gpu_type", gfx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
110
111
112
			}
		}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
113
		freeMemory, totalMemory, err := hl.HipMemGetInfo()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
114
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
115
			slog.Warn("get mem info", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
116
117
118
			continue
		}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
		if totalMemory < IGPUMemLimit {
			slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", i, "total", format.HumanBytes2(totalMemory))
			continue
		}

		// TODO revisit this once ROCm v6 is available on windows.
		// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
		slog.Info("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
		slog.Info("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
		gpuInfo := GpuInfo{
			Library: "rocm",
			memInfo: memInfo{
				TotalMemory: totalMemory,
				FreeMemory:  freeMemory,
			},
			ID:             fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
			DependencyPath: libDir,
			MinimumMemory:  rocmMinimumMemory,
		}
		if major != "" {
			gpuInfo.Major, err = strconv.Atoi(major)
			if err != nil {
				slog.Info("failed to parse version", "version", gfx, "error", err)
			}
		}
		if minor != "" {
			gpuInfo.Minor, err = strconv.Atoi(minor)
			if err != nil {
				slog.Info("failed to parse version", "version", gfx, "error", err)
			}
		}
		if patch != "" {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
152
153
			// Patch rev is hex; e.g. gfx90a
			p, err := strconv.ParseInt(patch, 16, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
154
155
			if err != nil {
				slog.Info("failed to parse version", "version", gfx, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
156
157
			} else {
				gpuInfo.Patch = int(p)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
158
159
160
			}
		}
		if gpuInfo.Major < RocmComputeMin {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
161
			slog.Warn(fmt.Sprintf("amdgpu [%s] too old gfx%d%d%x", gpuInfo.ID, gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch))
Daniel Hiltgen's avatar
Daniel Hiltgen committed
162
163
164
165
			continue
		}

		resp = append(resp, gpuInfo)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
166
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
167
168

	return resp
Daniel Hiltgen's avatar
Daniel Hiltgen committed
169
170
171
}

func AMDValidateLibDir() (string, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
172
	libDir, err := commonAMDValidateLibDir()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
173
	if err == nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
174
		return libDir, nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
175
176
	}

177
178
179
180
	// Installer payload (if we're running from some other location)
	localAppData := os.Getenv("LOCALAPPDATA")
	appDir := filepath.Join(localAppData, "Programs", "Ollama")
	rocmTargetDir := filepath.Join(appDir, "rocm")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
181
	if rocmLibUsable(rocmTargetDir) {
182
		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
183
184
185
186
		return rocmTargetDir, nil
	}

	// Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
187
	slog.Warn("amdgpu detected, but no compatible rocm library found.  Please install ROCm")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
188
189
	return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
}