amd_windows.go 5.26 KB
Newer Older
Daniel Hiltgen's avatar
Daniel Hiltgen committed
1
2
3
4
5
6
7
8
9
10
package gpu

import (
	"bytes"
	"fmt"
	"log/slog"
	"os"
	"path/filepath"
	"slices"
	"strings"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
11
12

	"github.com/ollama/ollama/format"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
13
14
15
16
17
18
19
20
21
22
)

const (

	// iGPUName is the device name used to recognize integrated Radeon GPUs;
	// AMDGetGPUInfo compares it case-insensitively against the HIP device name.
	// TODO  We're looking for this exact name to detect iGPUs since hipGetDeviceProperties never reports integrated==true
	iGPUName = "AMD Radeon(TM) Graphics"
)

var (
	// ROCmLibGlobs is used to validate if the given ROCm lib is usable.
	// TODO - probably include more coverage of files here...
	ROCmLibGlobs = []string{"hipblas.dll", "rocblas"}

	// RocmStandardLocations lists directories where a ROCm install may be
	// found. It is not referenced by the code in this file; presumably the
	// shared library-validation helper consults it (verify against caller).
	// TODO glob?
	RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\5.7\\bin"}
)

Daniel Hiltgen's avatar
Daniel Hiltgen committed
27
28
func AMDGetGPUInfo() []GpuInfo {
	resp := []GpuInfo{}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
29
30
31
	hl, err := NewHipLib()
	if err != nil {
		slog.Debug(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
32
		return nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
33
34
35
	}
	defer hl.Release()

Daniel Hiltgen's avatar
Daniel Hiltgen committed
36
37
38
39
40
41
	// TODO - this reports incorrect version information, so omitting for now
	// driverMajor, driverMinor, err := hl.AMDDriverVersion()
	// if err != nil {
	// 	// For now this is benign, but we may eventually need to fail compatibility checks
	// 	slog.Debug("error looking up amd driver version", "error", err)
	// }
Daniel Hiltgen's avatar
Daniel Hiltgen committed
42

Daniel Hiltgen's avatar
Daniel Hiltgen committed
43
	// Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
Daniel Hiltgen's avatar
Daniel Hiltgen committed
44
45
	count := hl.HipGetDeviceCount()
	if count == 0 {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
46
		return nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
47
48
49
	}
	libDir, err := AMDValidateLibDir()
	if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
50
51
		slog.Warn("unable to verify rocm library, will use cpu", "error", err)
		return nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
52
53
54
55
56
57
58
	}

	var supported []string
	gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
	if gfxOverride == "" {
		supported, err = GetSupportedGFX(libDir)
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
59
60
			slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
			return nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
61
62
		}
	} else {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
63
		slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
64
65
	}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
66
	slog.Debug("detected hip devices", "count", count)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
67
	// TODO how to determine the underlying device ID when visible devices is causing this to subset?
Daniel Hiltgen's avatar
Daniel Hiltgen committed
68
69
70
	for i := 0; i < count; i++ {
		err = hl.HipSetDevice(i)
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
71
			slog.Warn("set device", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
72
73
74
75
76
			continue
		}

		props, err := hl.HipGetDeviceProperties(i)
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
77
			slog.Warn("get properties", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
78
79
80
81
			continue
		}
		n := bytes.IndexByte(props.Name[:], 0)
		name := string(props.Name[:n])
Daniel Hiltgen's avatar
Daniel Hiltgen committed
82
83
		// TODO is UUID actually populated on windows?
		// Can luid be used on windows for setting visible devices (and is it actually set?)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
84
85
		n = bytes.IndexByte(props.GcnArchName[:], 0)
		gfx := string(props.GcnArchName[:n])
Daniel Hiltgen's avatar
Daniel Hiltgen committed
86
		slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
87
88
89
		//slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY!  Always 0
		// TODO  Why isn't props.iGPU accurate!?
		if strings.EqualFold(name, iGPUName) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
90
			slog.Info("unsupported Radeon iGPU detected skipping", "id", i, "name", name, "gfx", gfx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
91
92
93
94
			continue
		}
		if gfxOverride == "" {
			if !slices.Contains[[]string, string](supported, gfx) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
95
				slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
96
97
98
99
				// TODO - consider discrete markdown just for ROCM troubleshooting?
				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
				continue
			} else {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
100
				slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
101
102
103
			}
		}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
104
		freeMemory, totalMemory, err := hl.HipMemGetInfo()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
105
		if err != nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
106
			slog.Warn("get mem info", "id", i, "error", err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
107
108
109
			continue
		}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
110
111
112
113
114
115
116
117
		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
		if totalMemory < IGPUMemLimit {
			slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", i, "total", format.HumanBytes2(totalMemory))
			continue
		}

		// TODO revisit this once ROCm v6 is available on windows.
		// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
Daniel Hiltgen's avatar
Daniel Hiltgen committed
118
119
		slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
		slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
Daniel Hiltgen's avatar
Daniel Hiltgen committed
120
121
122
123
124
125
126
127
128
		gpuInfo := GpuInfo{
			Library: "rocm",
			memInfo: memInfo{
				TotalMemory: totalMemory,
				FreeMemory:  freeMemory,
			},
			ID:             fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
			DependencyPath: libDir,
			MinimumMemory:  rocmMinimumMemory,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
129
130
131
132
133
134
			Name:           name,
			Compute:        gfx,

			// TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
			// DriverMajor:    driverMajor,
			// DriverMinor:    driverMinor,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
135
136
137
		}

		resp = append(resp, gpuInfo)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
138
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
139
140

	return resp
Daniel Hiltgen's avatar
Daniel Hiltgen committed
141
142
143
}

func AMDValidateLibDir() (string, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
144
	libDir, err := commonAMDValidateLibDir()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
145
	if err == nil {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
146
		return libDir, nil
Daniel Hiltgen's avatar
Daniel Hiltgen committed
147
148
	}

149
150
151
152
	// Installer payload (if we're running from some other location)
	localAppData := os.Getenv("LOCALAPPDATA")
	appDir := filepath.Join(localAppData, "Programs", "Ollama")
	rocmTargetDir := filepath.Join(appDir, "rocm")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
153
	if rocmLibUsable(rocmTargetDir) {
154
		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
155
156
157
158
		return rocmTargetDir, nil
	}

	// Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
159
	slog.Warn("amdgpu detected, but no compatible rocm library found.  Please install ROCm")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
160
161
	return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
}