amd_windows.go 6.05 KB
Newer Older
Daniel Hiltgen's avatar
Daniel Hiltgen committed
1
2
3
4
package gpu

import (
	"bytes"
Michael Yang's avatar
lint  
Michael Yang committed
5
	"errors"
6
	"fmt"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
7
8
9
10
	"log/slog"
	"os"
	"path/filepath"
	"slices"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
11
	"strconv"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
12
	"strings"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
13

14
	"github.com/ollama/ollama/envconfig"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
15
	"github.com/ollama/ollama/format"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
16
17
18
19
20
21
22
23
24
25
)

// iGPUName is the device name used to recognise Radeon integrated GPUs.
//
// TODO: we are looking for this exact name to detect iGPUs since
// hipGetDeviceProperties never reports integrated==true.
const iGPUName = "AMD Radeon(TM) Graphics"

var (
	// ROCmLibGlobs lists the file name patterns used to validate if the
	// given ROCm lib is usable.
	// This is not sufficient to discern v5 vs v6.
	ROCmLibGlobs = []string{"hipblas.dll", "rocblas"}

	// RocmStandardLocations lists the well-known ROCm install directories.
	// TODO glob?
	RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\6.1\\bin"}
)

30
31
// Only called once during bootstrap
func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
32
	resp := []RocmGPUInfo{}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
33
34
35
	hl, err := NewHipLib()
	if err != nil {
		slog.Debug(err.Error())
36
		return nil, err
Daniel Hiltgen's avatar
Daniel Hiltgen committed
37
38
39
	}
	defer hl.Release()

40
41
42
43
44
	driverMajor, driverMinor, err := hl.AMDDriverVersion()
	if err != nil {
		// For now this is benign, but we may eventually need to fail compatibility checks
		slog.Debug("error looking up amd driver version", "error", err)
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
45

Daniel Hiltgen's avatar
Daniel Hiltgen committed
46
	// Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
Daniel Hiltgen's avatar
Daniel Hiltgen committed
47
48
	count := hl.HipGetDeviceCount()
	if count == 0 {
49
50
51
		err := fmt.Errorf("no compatible amdgpu devices detected")
		slog.Info(err.Error())
		return nil, err
Daniel Hiltgen's avatar
Daniel Hiltgen committed
52
53
54
	}
	libDir, err := AMDValidateLibDir()
	if err != nil {
55
56
57
		err = fmt.Errorf("unable to verify rocm library: %w", err)
		slog.Warn(err.Error())
		return nil, err
Daniel Hiltgen's avatar
Daniel Hiltgen committed
58
59
60
	}

	var supported []string
Michael Yang's avatar
string  
Michael Yang committed
61
	gfxOverride := envconfig.HsaOverrideGfxVersion()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
62
63
64
	if gfxOverride == "" {
		supported, err = GetSupportedGFX(libDir)
		if err != nil {
65
66
67
			err = fmt.Errorf("failed to lookup supported GFX types: %w", err)
			slog.Warn(err.Error())
			return nil, err
Daniel Hiltgen's avatar
Daniel Hiltgen committed
68
69
		}
	} else {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
70
		slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
71
72
	}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
73
	slog.Debug("detected hip devices", "count", count)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
74
	// TODO how to determine the underlying device ID when visible devices is causing this to subset?
Michael Yang's avatar
Michael Yang committed
75
	for i := range count {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
76
77
		err = hl.HipSetDevice(i)
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
78
			slog.Warn("set device", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
79
80
81
82
83
			continue
		}

		props, err := hl.HipGetDeviceProperties(i)
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
84
			slog.Warn("get properties", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
85
86
87
88
			continue
		}
		n := bytes.IndexByte(props.Name[:], 0)
		name := string(props.Name[:n])
Daniel Hiltgen's avatar
Daniel Hiltgen committed
89
90
		// TODO is UUID actually populated on windows?
		// Can luid be used on windows for setting visible devices (and is it actually set?)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
91
92
		n = bytes.IndexByte(props.GcnArchName[:], 0)
		gfx := string(props.GcnArchName[:n])
Daniel Hiltgen's avatar
Daniel Hiltgen committed
93
		slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
Michael Yang's avatar
lint  
Michael Yang committed
94
		// slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY!  Always 0
Daniel Hiltgen's avatar
Daniel Hiltgen committed
95
96
		// TODO  Why isn't props.iGPU accurate!?

Daniel Hiltgen's avatar
Daniel Hiltgen committed
97
		freeMemory, totalMemory, err := hl.HipMemGetInfo()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
98
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
99
			slog.Warn("get mem info", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
100
101
102
			continue
		}

103
104
105
106
107
108
109
		gpuInfo := RocmGPUInfo{
			GpuInfo: GpuInfo{
				Library: "rocm",
				memInfo: memInfo{
					TotalMemory: totalMemory,
					FreeMemory:  freeMemory,
				},
110
111
112
				// Free memory reporting on Windows is not reliable until we bump to ROCm v6.2
				UnreliableFreeMemory: true,

Daniel Hiltgen's avatar
Daniel Hiltgen committed
113
				ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
114
115
116
117
				DependencyPath: libDir,
				MinimumMemory:  rocmMinimumMemory,
				Name:           name,
				Compute:        gfx,
118
119
				DriverMajor:    driverMajor,
				DriverMinor:    driverMinor,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
120
			},
121
			index: i,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
122
123
		}

124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
		if strings.EqualFold(name, iGPUName) || totalMemory < IGPUMemLimit {
			reason := "unsupported Radeon iGPU detected skipping"
			slog.Info(reason, "id", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
				GpuInfo: gpuInfo.GpuInfo,
				Reason:  reason,
			})
			continue
		}

		// Strip off Target Features when comparing
		if !slices.Contains[[]string, string](supported, strings.Split(gfx, ":")[0]) {
			reason := fmt.Sprintf("amdgpu is not supported (supported types:%s)", supported)
			slog.Warn(reason, "gpu_type", gfx, "gpu", gpuInfo.ID, "library", libDir)
			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
				GpuInfo: gpuInfo.GpuInfo,
				Reason:  reason,
			})
			// HSA_OVERRIDE_GFX_VERSION not supported on windows
			continue
		} else {
			slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
		}

		slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
		slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))

Daniel Hiltgen's avatar
Daniel Hiltgen committed
152
		resp = append(resp, gpuInfo)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
153
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
154

155
	return resp, nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
156
157
158
}

func AMDValidateLibDir() (string, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
159
	libDir, err := commonAMDValidateLibDir()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
160
	if err == nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
161
		return libDir, nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
162
163
	}

164
165
166
	// Installer payload (if we're running from some other location)
	localAppData := os.Getenv("LOCALAPPDATA")
	appDir := filepath.Join(localAppData, "Programs", "Ollama")
167
	rocmTargetDir := filepath.Join(appDir, envconfig.LibRelativeToExe(), "lib", "ollama")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
168
	if rocmLibUsable(rocmTargetDir) {
169
		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
170
171
172
173
		return rocmTargetDir, nil
	}

	// Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
174
	slog.Warn("amdgpu detected, but no compatible rocm library found.  Please install ROCm")
Michael Yang's avatar
lint  
Michael Yang committed
175
	return "", errors.New("no suitable rocm found, falling back to CPU")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
176
}
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203

// RefreshFreeMemory re-queries the HIP library for the free memory of every
// GPU in the list and stores the new value in place.
//
// An empty list is a no-op. If the HIP library cannot be loaded, the failure
// is logged at debug level and nil is returned with the list untouched. A
// failure to select a device aborts the refresh and is returned to the
// caller; a failure to read memory info only skips that device.
func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
	if len(gpus) == 0 {
		return nil
	}

	hip, loadErr := NewHipLib()
	if loadErr != nil {
		slog.Debug(loadErr.Error())
		return nil
	}
	defer hip.Release()

	for idx := range gpus {
		// Memory queries apply to the currently selected device.
		if setErr := hip.HipSetDevice(gpus[idx].index); setErr != nil {
			return setErr
		}

		available, _, memErr := hip.HipMemGetInfo()
		if memErr != nil {
			slog.Warn("get mem info", "id", idx, "error", memErr)
			continue
		}

		slog.Debug(
			"updating rocm free memory",
			"gpu", gpus[idx].ID,
			"name", gpus[idx].Name,
			"before", format.HumanBytes2(gpus[idx].FreeMemory),
			"now", format.HumanBytes2(available),
		)
		gpus[idx].FreeMemory = available
	}
	return nil
}